Exemplo n.º 1
0
def ExtractThreads(message_infos):
    """Thread the given messages and return the top-level containers.

    Each item of ``message_infos`` must expose a ``headers`` attribute, which
    is passed to ``jwzthreading.make_message``. Items for which that call
    raises ``ValueError`` or returns a falsy value are left out. Every kept
    jwzthreading message gets a ``message_info`` attribute pointing back at
    its source item, and every returned container gets a ``subject``
    attribute holding the key it was filed under by ``jwzthreading.thread``.
    """
    threadable = []
    for info in message_infos:
        try:
            parsed = jwzthreading.make_message(info.headers)
        except ValueError:
            continue
        if not parsed:
            continue
        parsed.message_info = info
        threadable.append(parsed)

    roots = []
    for subject, root in jwzthreading.thread(threadable).items():
        # jwzthreading is too aggressive in threading by subject and will
        # combine distinct threads that happen to have the same subject.
        # When a dummy container has lots of first-level children, return
        # those children as separate threads instead of the container.
        if root.is_dummy() and len(root.children) >= 10:
            promoted = root.children
        else:
            promoted = [root]
        for node in promoted:
            node.subject = subject
            roots.append(node)

    return roots
Exemplo n.º 2
0
def ExtractThreads(message_infos):
    """Build jwzthreading containers for ``message_infos``.

    The ``headers`` attribute of each item is converted with
    ``jwzthreading.make_message``; items that raise ``ValueError`` or yield a
    falsy message are ignored. Each surviving message is tagged with its
    originating item as ``message_info``. The result is a list of containers,
    each tagged with the ``subject`` key returned by ``jwzthreading.thread``.
    """
    converted = []
    for source in message_infos:
        try:
            candidate = jwzthreading.make_message(source.headers)
        except ValueError:
            continue
        if candidate:
            candidate.message_info = source
            converted.append(candidate)

    by_subject = jwzthreading.thread(converted)

    result = []
    for subject, top in by_subject.items():
        # jwzthreading is too aggressive in threading by subject and will
        # combine distinct threads that happen to have the same subject.
        # Split them up if we have a dummy container that has lots of
        # children at the first level.
        oversized_dummy = top.is_dummy() and len(top.children) >= 10
        for entry in (top.children if oversized_dummy else (top,)):
            entry.subject = subject
            result.append(entry)

    return result
Exemplo n.º 3
0
    def test_basic_message(self):
        msg = message_from_string("""Subject: random
Message-ID: <message1>
References: <ref1> <ref2> <ref1>
In-Reply-To: <reply>

Body.""")
        m = jwzthreading.make_message(msg)
        self.assertTrue(repr(m))
        self.assertEquals(m.subject, 'random')
        self.assertEquals(sorted(m.references), ['ref1', 'ref2', 'reply'])

        # Verify that repr() works
        repr(m)
Exemplo n.º 4
0
    def test_basic_message(self):
        msg = message_from_string("""Subject: random
Message-ID: <message1>
References: <ref1> <ref2> <ref1>
In-Reply-To: <reply>

Body.""")
        m = jwzthreading.make_message(msg)
        self.assertTrue(repr(m))
        self.assertEquals(m.subject, 'random')
        self.assertEquals(sorted(m.references),
                          ['ref1', 'ref2', 'reply'])

        # Verify that repr() works
        repr(m)
Exemplo n.º 5
0
    def thread_mails(emails):
        """Re-thread ``emails`` with jwzthreading and update stored parents.

        Every mail is parsed from its ``imported_blob`` into a jwzthreading
        message, the messages are threaded, and each resulting thread is
        printed with ``jwzthreading.print_container``. Afterwards the stored
        parent of a mail is replaced by the parent chosen by the algorithm
        when the two disagree and the stored parent is either missing or an
        ``Email``.
        """
        emails_for_threading = []
        for mail in emails:
            email_for_threading = jwzthreading.make_message(
                email.message_from_string(mail.imported_blob))
            # jwzthreading does not decode the subject itself, so store ours.
            email_for_threading.subject = mail.subject.first_original().value
            # Store our email object instead of the raw message text.
            email_for_threading.message = mail
            emails_for_threading.append(email_for_threading)

        threaded_emails = jwzthreading.thread(emails_for_threading)

        # Output
        for _subject, container in sorted(threaded_emails.items()):
            jwzthreading.print_container(container, 0, True)

        def update_threading(threaded_emails, parent=None, debug=False):
            """Recursively apply the algorithm's parent to each container.

            ``threaded_emails`` is an iterable of containers and ``parent``
            the container passed down by the caller as the parent candidate
            (``None`` at the top level). With ``debug`` set, a trace is
            printed.
            """
            if debug:
                print("\n\nEntering update_threading() for %s mails:" % len(threaded_emails))
            for container in threaded_emails:
                if debug:
                    # Dummy containers have no message, so only dereference
                    # it when present. The subject printed here was already
                    # replaced by its plain value in thread_mails
                    # (presumably a string -- TODO confirm).
                    if container.message:
                        print("\nProcessing:  " + repr(container.message.subject) + " " + repr(container.message.message_id) + " " + repr(container.message.message.id))
                    else:
                        print("\nProcessing:  dummy container")
                    print("container: " + repr(container))
                    print("parent: " + repr(container.parent))
                    print("children: " + repr(container.children))

                if not container.message:
                    if debug:
                        print("Current message ID: None, was a dummy container")
                    # Skip the dummy: its children keep the outer parent.
                    update_threading(container.children, parent, debug=debug)
                    continue

                current_parent = container.message.message.parent
                if current_parent:
                    db_parent_message_id = current_parent.message_id
                else:
                    db_parent_message_id = None

                if parent:
                    if parent.message:
                        # jwzthreading strips the <>, re-add them
                        algorithm_parent_message_id = unicode("<" + parent.message.message_id + ">")
                    else:
                        if debug:
                            print("Parent was a dummy container, we may need "
                                  "to handle this case better, as we just "
                                  "potentially lost sibbling relationships")
                        algorithm_parent_message_id = None
                else:
                    algorithm_parent_message_id = None

                if debug:
                    print("Current parent from database: " + repr(db_parent_message_id))
                    print("Current parent from algorithm: " + repr(algorithm_parent_message_id))
                    print("References: " + repr(container.message.references))

                if algorithm_parent_message_id != db_parent_message_id:
                    if current_parent is None or isinstance(current_parent, Email):
                        if debug:
                            print("UPDATING PARENT for :" + repr(container.message.message.message_id))
                        new_parent = parent.message.message if algorithm_parent_message_id else None
                        if debug:
                            print(repr(new_parent))
                        container.message.message.set_parent(new_parent)
                    elif debug:
                        print("Skipped reparenting:  the current parent "
                              "isn't an email, the threading algorithm only "
                              "considers mails")
                update_threading(container.children, container, debug=debug)

        update_threading(threaded_emails.values(), debug=False)
Exemplo n.º 6
0
	tex = filter_text(text)
	print sorted(set(tex.split(' '))),len(set(tex.split(' ')))
	write_to_orig_html_file(sentences)
	words = nltk.wordpunct_tokenize(tex)
	locs = search_dict(words)
	write_to_mod_html_file(sentences,locs,tex)
	print sorted(report_words), len(report_words)
	

build_dic_words()


files = glob.glob('/root/bngbirds-data/bngbirds/*.eml')
msglist = []

# Only the first ten matching files are parsed and threaded.
for file in files[:10]:
	# The with-block closes the file even when parsing raises.
	with open(file, 'r') as fp:
		msg = email.message_from_file(fp)
		# The file path is passed along as the second argument.
		m = jwz.make_message(msg, file)
	msglist.append(m)


subject_table = jwz.thread(msglist)
# Walk the threads in sorted order of their subject_table keys.
L = sorted(subject_table.items())
for subj, container in L:
	# Fresh per-thread counter (missing keys start at 0) and depth.
	sent = defaultdict(int)
	depth = 0
	process_single_message(container, depth, sent)
Exemplo n.º 7
0
    def thread_mails(emails):
        """Thread ``emails`` with jwzthreading and fix their stored parents.

        Each mail's ``imported_blob`` is parsed into a jwzthreading message.
        The threads found by ``jwzthreading.thread`` are printed with
        ``jwzthreading.print_container``. Then every mail whose stored
        parent differs from the algorithm's choice is reparented, provided
        the stored parent is ``None`` or an ``Email``.
        """
        emails_for_threading = []
        for mail in emails:
            email_for_threading = jwzthreading.make_message(
                email.message_from_string(mail.imported_blob))
            # Store our email subject, jwzthreading does not decode subject
            # itself.
            email_for_threading.subject = mail.subject.first_original().value
            # Store our email object pointer instead of the raw message text.
            email_for_threading.message = mail
            emails_for_threading.append(email_for_threading)

        threaded_emails = jwzthreading.thread(emails_for_threading)

        # Output
        for _subject, container in sorted(threaded_emails.items()):
            jwzthreading.print_container(container, 0, True)

        def update_threading(threaded_emails, parent=None, debug=False):
            """Visit containers recursively and reparent mails as needed.

            ``parent`` is the container handed down as parent candidate for
            the containers in ``threaded_emails`` (``None`` for the roots).
            ``debug`` enables a printed trace.
            """
            if debug:
                print("\n\nEntering update_threading() for %s mails:" %
                      len(threaded_emails))
            for container in threaded_emails:
                threading_message = container.message

                if debug:
                    # A dummy container has no message; guard the trace so
                    # it cannot fail on it. The subject was overwritten with
                    # its plain value in thread_mails (presumably a string
                    # -- TODO confirm), so it is printed directly.
                    if threading_message:
                        print("\nProcessing:  " +
                              repr(threading_message.subject) + " " +
                              repr(threading_message.message_id) + " " +
                              repr(threading_message.message.id))
                    else:
                        print("\nProcessing:  dummy container")
                    print("container: " + repr(container))
                    print("parent: " + repr(container.parent))
                    print("children: " + repr(container.children))

                if not threading_message:
                    if debug:
                        print(
                            "Current message ID: None, was a dummy container")
                    # Children of a dummy keep the parent handed to us.
                    update_threading(container.children, parent, debug=debug)
                    continue

                stored_mail = threading_message.message
                current_parent = stored_mail.parent
                if current_parent:
                    db_parent_message_id = current_parent.message_id
                else:
                    db_parent_message_id = None

                algorithm_parent_message_id = None
                if parent:
                    if parent.message:
                        # jwzthreading strips the <>, re-add them
                        algorithm_parent_message_id = unicode(
                            "<" + parent.message.message_id + ">")
                    elif debug:
                        print(
                            "Parent was a dummy container, we may need "
                            "to handle this case better, as we just "
                            "potentially lost sibbling relationships")

                if debug:
                    print("Current parent from database: " +
                          repr(db_parent_message_id))
                    print("Current parent from algorithm: " +
                          repr(algorithm_parent_message_id))
                    print("References: " +
                          repr(threading_message.references))

                if algorithm_parent_message_id != db_parent_message_id:
                    if current_parent is None or isinstance(
                            current_parent, Email):
                        if debug:
                            print("UPDATING PARENT for :" +
                                  repr(stored_mail.message_id))
                        if algorithm_parent_message_id:
                            new_parent = parent.message.message
                        else:
                            new_parent = None
                        if debug:
                            print(repr(new_parent))
                        stored_mail.set_parent(new_parent)
                    elif debug:
                        print(
                            "Skipped reparenting:  the current parent "
                            "isn't an email, the threading algorithm only "
                            "considers mails")
                update_threading(container.children,
                                 container,
                                 debug=debug)

        update_threading(threaded_emails.values(), debug=False)
Exemplo n.º 8
0
    def thread_mails(emails):
        """Thread ``emails`` with jwzthreading and update post parents.

        Each mail's ``full_message`` is parsed into a jwzthreading message,
        the messages are threaded and the threads are printed with
        ``jwzthreading.print_container``. The parent of each mail's ``post``
        is then set to the post of the parent chosen by the algorithm when
        it differs from the stored one, or when the stored parent's content
        has a different ``source_id`` than the mail.
        """
        print('Threading...')
        emails_for_threading = []
        for mail in emails:
            email_for_threading = jwzthreading.make_message(
                email.message_from_string(mail.full_message))
            # jwzthreading does not decode the subject itself, so store ours.
            email_for_threading.subject = mail.subject
            # Store our email object pointer instead of the raw message text.
            email_for_threading.message = mail
            emails_for_threading.append(email_for_threading)

        threaded_emails = jwzthreading.thread(emails_for_threading)

        # Output
        for _subject, container in sorted(threaded_emails.items()):
            jwzthreading.print_container(container, 0, True)

        def update_threading(threaded_emails, parent=None):
            """Recursively reparent the posts of the given containers.

            ``parent`` is the container handed down as parent candidate for
            every container in ``threaded_emails`` (``None`` for the roots).
            """
            for container in threaded_emails:
                if not container.message:
                    # Dummy container: its children keep the parent we got.
                    update_threading(container.children, parent)
                    continue

                stored_mail = container.message.message
                post = stored_mail.post
                current_parent = post.parent
                if current_parent:
                    db_parent_message_id = current_parent.content.message_id
                else:
                    db_parent_message_id = None

                if parent and parent.message:
                    # jwzthreading strips the <>, re-add them
                    algorithm_parent_message_id = unicode(
                        "<" + parent.message.message_id + ">")
                else:
                    # Either there is no parent, or the parent was a dummy
                    # container. The latter may need better handling: we
                    # just potentially lost sibling relationships.
                    algorithm_parent_message_id = None

                if algorithm_parent_message_id != db_parent_message_id:
                    # Don't reparent if the current parent isn't an email,
                    # the threading algorithm only considers mails.
                    if current_parent is None or isinstance(current_parent.content, Email):
                        new_parent = parent.message.message.post if algorithm_parent_message_id else None
                        post.set_parent(new_parent)

                if current_parent and current_parent.content.source_id != stored_mail.source_id:
                    # This is to correct past mistakes in the database,
                    # remove it once everyone ran it. benoitg 2013-11-20
                    print("UPDATING PARENT, BAD ORIGINAL SOURCE" + repr(current_parent.content.source_id) + " " + repr(stored_mail.source_id))
                    new_parent = parent.message.message.post if algorithm_parent_message_id else None
                    post.set_parent(new_parent)

                update_threading(container.children, container)

        update_threading(threaded_emails.values())
Exemplo n.º 9
0
    def thread_mails(emails):
        """Thread ``emails`` with jwzthreading and update stored parents.

        Each mail's ``imported_blob`` (decoded as ASCII when it is not
        already a ``native_str``) is parsed into a jwzthreading message. The
        threads are printed with ``jwzthreading.print_container``. A mail is
        then reparented to the algorithm's choice when that differs from the
        stored parent and the stored parent is ``None`` or an ``Email``.
        """
        emails_for_threading = []
        for mail in emails:
            blob = mail.imported_blob
            if not isinstance(blob, native_str):
                blob = blob.decode('ascii')
            email_for_threading = jwzthreading.make_message(
                email.message_from_string(blob))
            # jwzthreading does not decode the subject itself, so store ours.
            email_for_threading.subject = mail.subject.first_original().value
            # Store our email object pointer instead of the raw message text.
            email_for_threading.message = mail
            emails_for_threading.append(email_for_threading)

        threaded_emails = jwzthreading.thread(emails_for_threading)

        # Output
        for _subject, container in sorted(threaded_emails.items()):
            jwzthreading.print_container(container, 0, True)

        def update_threading(threaded_emails, parent=None, debug=False):
            """Recursively reparent the mails of the given containers.

            ``parent`` is the container handed down as parent candidate for
            the containers in ``threaded_emails`` (``None`` for the roots).
            ``debug`` is only forwarded to the recursive calls; tracing goes
            through ``log``.
            """
            # NOTE(review): the log calls below pass arguments separately,
            # which assumes ``log`` is a stdlib-style logger -- confirm.
            log.debug("\n\nEntering update_threading() for %d mails:",
                      len(threaded_emails))
            for container in threaded_emails:
                message_string = "%s %s %d " % (
                    container.message.subject, container.message.message_id,
                    container.message.message.id
                ) if container.message else "null "
                log.debug(
                    "Processing: %s container: %s parent: %s children :%s",
                    message_string, container, container.parent,
                    container.children)

                if not container.message:
                    log.debug(
                        "Current message ID: None, was a dummy container")
                    # Children of a dummy keep the parent handed to us.
                    update_threading(container.children, parent, debug=debug)
                    continue

                current_parent = container.message.message.parent
                if current_parent:
                    db_parent_message_id = current_parent.message_id
                else:
                    db_parent_message_id = None

                algorithm_parent_message_id = None
                if parent:
                    if parent.message:
                        # jwzthreading strips the <>, re-add them
                        algorithm_parent_message_id = u"<" + parent.message.message_id + u">"
                    else:
                        # warning() replaces the deprecated warn() alias.
                        log.warning(
                            "Parent was a dummy container, we may need "
                            "to handle this case better, as we just "
                            "potentially lost sibling relationships")

                log.debug("Current parent from database: %r",
                          db_parent_message_id)
                log.debug("Current parent from algorithm: %r",
                          algorithm_parent_message_id)
                log.debug("References: %r", container.message.references)

                if algorithm_parent_message_id != db_parent_message_id:
                    if current_parent is None or isinstance(
                            current_parent, Email):
                        log.debug("UPDATING PARENT for :%r",
                                  container.message.message.message_id)
                        new_parent = parent.message.message if algorithm_parent_message_id else None
                        log.debug("%r", new_parent)
                        container.message.message.set_parent(new_parent)
                    else:
                        log.debug(
                            "Skipped reparenting:  the current parent "
                            "isn't an email, the threading algorithm only "
                            "considers mails")
                update_threading(container.children,
                                 container,
                                 debug=debug)

        update_threading(list(threaded_emails.values()), debug=False)
Exemplo n.º 10
0
            # for text in to_text_list(ctr.message.message):
            #     # print h2t.html2text(text.replace('\r\n', '\n').replace('=\n', ''))
            #     print text
        # if raw_input().strip() == 'q': return
        for c in ctr.children:
            print_container(c, depth + 1, tiddler_name)

    print('Reading input file...')

    if False:
        mbox_path = '/Volumes/ramdisk/ccmt6.mbox'
        # with open(mbox_path, 'rb') as ifile:
        # mbox = mailbox.UnixMailbox(ifile)
        mbox = mailbox.mbox(mbox_path)
        mlist = list(mbox)
        msglist = [make_message(m) for m in mlist]
    else:
        eml_path = '/tmp/ccmt-1'
        mlist = [
            mailbox.mboxMessage(open(eml).read())
            for eml in glob(pjoin(eml_path, '*.eml'))
        ]
        msglist = [make_message(m) for m in mlist]

    print('Threading...')
    subject_table = thread(msglist)

    # Output
    L = subject_table.items()
    L.sort()
    for subj, container in L: