from difflib import Differ

# NOTE: the Text helpers (splitIntoWords, splitIntoSentences, calculateHash,
# computeAvgWordFreq), the Word and Sentence structures, the revisions
# dictionary, the sentences_ht hash table and the WORD_DENSITY / WORD_ID
# globals are expected to be defined elsewhere in this module.


def analyseWordsInSentences(unmatched_sentences_curr, unmatched_sentences_prev,
                            revision_curr, possible_vandalism):
    matched_words_prev = []
    unmatched_words_prev = []

    # Split sentences into words.
    text_prev = []
    for sentence_prev in unmatched_sentences_prev:
        for word_prev in sentence_prev.words:
            if (not word_prev.matched):
                text_prev.append(word_prev.value)
                unmatched_words_prev.append(word_prev)

    text_curr = []
    for sentence_curr in unmatched_sentences_curr:
        splitted = Text.splitIntoWords(sentence_curr.value)
        text_curr.extend(splitted)
        sentence_curr.splitted.extend(splitted)

    # Edit consists of removing sentences, not adding new content.
    if (len(text_curr) == 0):
        return (matched_words_prev, False)

    # SPAM detection.
    if (possible_vandalism):
        density = Text.computeAvgWordFreq(text_curr, revision_curr.wikipedia_id)
        if (density > WORD_DENSITY):
            return (matched_words_prev, possible_vandalism)
        else:
            possible_vandalism = False

    # Nothing left to match against: every current word is new.
    if (len(text_prev) == 0):
        for sentence_curr in unmatched_sentences_curr:
            for word in sentence_curr.splitted:
                word_curr = Word()
                word_curr.author_id = revision_curr.contributor_id
                word_curr.author_name = revision_curr.contributor_name
                word_curr.revision = revision_curr.wikipedia_id
                word_curr.value = word
                sentence_curr.words.append(word_curr)
        return (matched_words_prev, possible_vandalism)

    d = Differ()
    diff = list(d.compare(text_prev, text_curr))
    for sentence_curr in unmatched_sentences_curr:
        for word in sentence_curr.splitted:
            curr_matched = False
            pos = 0
            while (pos < len(diff)):
                word_diff = diff[pos]
                if (word == word_diff[2:]):
                    if (word_diff[0] == ' '):
                        # Unchanged word: re-attach the previous Word object.
                        for word_prev in unmatched_words_prev:
                            if ((not word_prev.matched) and
                                    (word_prev.value == word)):
                                word_prev.matched = True
                                curr_matched = True
                                sentence_curr.words.append(word_prev)
                                matched_words_prev.append(word_prev)
                                diff[pos] = ''
                                pos = len(diff) + 1  # exit the while loop
                                break
                    elif (word_diff[0] == '-'):
                        # Word removed relative to the previous revision.
                        for word_prev in unmatched_words_prev:
                            if ((not word_prev.matched) and
                                    (word_prev.value == word)):
                                word_prev.matched = True
                                matched_words_prev.append(word_prev)
                                diff[pos] = ''
                                break
                    elif (word_diff[0] == '+'):
                        # Word added in the current revision.
                        curr_matched = True
                        word_curr = Word()
                        word_curr.value = word
                        word_curr.author_id = revision_curr.contributor_id
                        word_curr.author_name = revision_curr.contributor_name
                        word_curr.revision = revision_curr.wikipedia_id
                        sentence_curr.words.append(word_curr)
                        diff[pos] = ''
                        pos = len(diff) + 1  # exit the while loop
                pos = pos + 1
            if (not curr_matched):
                # No usable diff entry: treat the word as newly added.
                word_curr = Word()
                word_curr.value = word
                word_curr.author_id = revision_curr.contributor_id
                word_curr.author_name = revision_curr.contributor_name
                word_curr.revision = revision_curr.wikipedia_id
                sentence_curr.words.append(word_curr)

    return (matched_words_prev, possible_vandalism)
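
# Illustrative sketch only (not part of the algorithm above): it shows the
# difflib.Differ entries the matching loop consumes. Every entry carries a
# two-character prefix -- '  ' for unchanged, '- ' for deleted, '+ ' for
# added -- which is why the loop dispatches on word_diff[0] and strips the
# prefix with word_diff[2:]. The word lists here are made up for the demo.
def _demoDifferOutput():
    from difflib import Differ
    text_prev = ['the', 'quick', 'brown', 'fox']
    text_curr = ['the', 'slow', 'brown', 'fox', 'jumps']
    diff = list(Differ().compare(text_prev, text_curr))
    # diff == ['  the', '- quick', '+ slow', '  brown', '  fox', '+ jumps']
    return diff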
def analyseWordsInSentences(unmatched_sentences_curr, unmatched_sentences_prev,
                            revision_curr, possible_vandalism, relation):
    matched_words_prev = []
    unmatched_words_prev = []
    global WORD_ID

    # Split sentences into words.
    text_prev = []
    for sentence_prev in unmatched_sentences_prev:
        for word_prev in sentence_prev.words:
            if (not word_prev.matched):
                text_prev.append(word_prev.value)
                unmatched_words_prev.append(word_prev)

    text_curr = []
    for sentence_curr in unmatched_sentences_curr:
        splitted = Text.splitIntoWords(sentence_curr.value)
        text_curr.extend(splitted)
        sentence_curr.splitted.extend(splitted)

    # Edit consists of removing sentences, not adding new content.
    if (len(text_curr) == 0):
        return (matched_words_prev, False)

    # SPAM detection.
    if (possible_vandalism):
        density = Text.computeAvgWordFreq(text_curr, revision_curr.wikipedia_id)
        if (density > WORD_DENSITY):
            return (matched_words_prev, possible_vandalism)
        else:
            possible_vandalism = False

    # Nothing left to match against: every current word is new.
    if (len(text_prev) == 0):
        for sentence_curr in unmatched_sentences_curr:
            for word in sentence_curr.splitted:
                word_curr = Word()
                word_curr.internal_id = WORD_ID
                word_curr.author_id = revision_curr.contributor_id
                word_curr.author_name = revision_curr.contributor_name
                word_curr.revision = revision_curr.wikipedia_id
                word_curr.value = word
                sentence_curr.words.append(word_curr)
                word_curr.used.append(revision_curr.wikipedia_id)
                relation.added = relation.added + 1
                WORD_ID = WORD_ID + 1
        return (matched_words_prev, possible_vandalism)

    d = Differ()
    diff = list(d.compare(text_prev, text_curr))
    for sentence_curr in unmatched_sentences_curr:
        for word in sentence_curr.splitted:
            curr_matched = False
            pos = 0
            while (pos < len(diff)):
                word_diff = diff[pos]
                if (word == word_diff[2:]):
                    if (word_diff[0] == ' '):
                        # Unchanged word: re-attach the previous Word object.
                        for word_prev in unmatched_words_prev:
                            if ((not word_prev.matched) and
                                    (word_prev.value == word)):
                                word_prev.used.append(revision_curr.wikipedia_id)
                                word_prev.matched = True
                                curr_matched = True
                                sentence_curr.words.append(word_prev)
                                matched_words_prev.append(word_prev)
                                diff[pos] = ''
                                pos = len(diff) + 1  # exit the while loop
                                #if (word_prev.revision in relation.reintroduced.keys()):
                                #    relation.reintroduced.update({word_prev.revision : relation.reintroduced[word_prev.revision] + 1 })
                                #else:
                                #    relation.reintroduced.update({word_prev.revision : 1 })
                                break
                    elif (word_diff[0] == '-'):
                        # Word removed: record the deletion and attribute it to
                        # the revision that originally introduced the word.
                        for word_prev in unmatched_words_prev:
                            if ((not word_prev.matched) and
                                    (word_prev.value == word)):
                                word_prev.matched = True
                                matched_words_prev.append(word_prev)
                                diff[pos] = ''
                                word_prev.deleted.append(revision_curr.wikipedia_id)
                                if (revisions[word_prev.revision].contributor_name !=
                                        revision_curr.contributor_name):
                                    if (word_prev.revision in relation.deleted.keys()):
                                        relation.deleted.update({word_prev.revision:
                                            relation.deleted[word_prev.revision] + 1})
                                    else:
                                        relation.deleted.update({word_prev.revision: 1})
                                else:
                                    if (word_prev.revision in relation.self_deleted.keys()):
                                        relation.self_deleted.update({word_prev.revision:
                                            relation.self_deleted[word_prev.revision] + 1})
                                    else:
                                        relation.self_deleted.update({word_prev.revision: 1})
                                break
                    elif (word_diff[0] == '+'):
                        # Word added in the current revision.
                        curr_matched = True
                        word_curr = Word()
                        word_curr.internal_id = WORD_ID
                        word_curr.value = word
                        word_curr.author_id = revision_curr.contributor_id
                        word_curr.author_name = revision_curr.contributor_name
                        word_curr.revision = revision_curr.wikipedia_id
                        word_curr.used.append(revision_curr.wikipedia_id)
                        sentence_curr.words.append(word_curr)
                        relation.added = relation.added + 1
                        WORD_ID = WORD_ID + 1
                        diff[pos] = ''
                        pos = len(diff) + 1  # exit the while loop
                pos = pos + 1
            if (not curr_matched):
                # No usable diff entry: treat the word as newly added.
                word_curr = Word()
                word_curr.internal_id = WORD_ID
                word_curr.value = word
                word_curr.author_id = revision_curr.contributor_id
                word_curr.author_name = revision_curr.contributor_name
                word_curr.revision = revision_curr.wikipedia_id
                word_curr.used.append(revision_curr.wikipedia_id)
                sentence_curr.words.append(word_curr)
                relation.added = relation.added + 1
                WORD_ID = WORD_ID + 1

    return (matched_words_prev, possible_vandalism)
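
# A minimal sketch of the per-word record the functions above assume. The
# field names come straight from the code, but this class is a hypothetical
# stand-in for the project's real Word structure, which lives in its own
# module. A Relation would similarly carry an added counter plus the
# deleted/self_deleted/revert/self_revert/reintroduced/self_reintroduced
# dictionaries keyed by revision id.
class _WordSketch(object):
    def __init__(self):
        self.internal_id = 0      # value of the global WORD_ID at creation
        self.value = ''           # the token itself
        self.author_id = None     # contributor id of the adding revision
        self.author_name = None   # contributor name of the adding revision
        self.revision = None      # wikipedia_id of the adding revision
        self.matched = False      # set during diff matching
        self.used = []            # revisions in which the word appears
        self.deleted = []         # revisions that deleted the word
        self.freq = []            # revisions that reintroduced the word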
def analyseSentencesInParagraphs(unmatched_paragraphs_curr,
                                 unmatched_paragraphs_prev, revision_curr):
    # Containers for unmatched and matched sentences.
    unmatched_sentences_curr = []
    unmatched_sentences_prev = []
    matched_sentences_prev = []
    total_sentences = 0

    # Iterate over the unmatched paragraphs of the current revision.
    for paragraph_curr in unmatched_paragraphs_curr:
        # Split the current paragraph into sentences.
        sentences = Text.splitIntoSentences(paragraph_curr.value)
        # Iterate over the sentences of the current paragraph.
        for sentence in sentences:
            # Create the Sentence structure.
            sentence = sentence.strip()
            sentence = ' '.join(Text.splitIntoWords(sentence))
            hash_curr = Text.calculateHash(sentence)
            matched_curr = False
            total_sentences = total_sentences + 1

            # Iterate over the unmatched paragraphs from the previous revision.
            for paragraph_prev in unmatched_paragraphs_prev:
                if (hash_curr in paragraph_prev.sentences.keys()):
                    for sentence_prev in paragraph_prev.sentences[hash_curr]:
                        if (not sentence_prev.matched):
                            matched_one = False
                            matched_all = True
                            for word_prev in sentence_prev.words:
                                if (word_prev.matched):
                                    matched_one = True
                                else:
                                    matched_all = False
                            if (not matched_one):
                                sentence_prev.matched = True
                                matched_curr = True
                                matched_sentences_prev.append(sentence_prev)
                                # TODO: CHECK this
                                for word_prev in sentence_prev.words:
                                    word_prev.matched = True
                                # Add the sentence information to the paragraph.
                                if (hash_curr in paragraph_curr.sentences.keys()):
                                    paragraph_curr.sentences[hash_curr].append(sentence_prev)
                                    paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                                else:
                                    paragraph_curr.sentences.update({sentence_prev.hash_value: [sentence_prev]})
                                    paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                                break
                            elif (matched_all):
                                sentence_prev.matched = True
                                matched_sentences_prev.append(sentence_prev)
                if (matched_curr):
                    break

            # Iterate over the hash table of sentences from old revisions.
            if ((not matched_curr) and (hash_curr in sentences_ht.keys())):
                for sentence_prev in sentences_ht[hash_curr]:
                    if (not sentence_prev.matched):
                        matched_one = False
                        matched_all = True
                        for word_prev in sentence_prev.words:
                            if (word_prev.matched):
                                matched_one = True
                            else:
                                matched_all = False
                        if (not matched_one):
                            sentence_prev.matched = True
                            matched_curr = True
                            matched_sentences_prev.append(sentence_prev)
                            # TODO: CHECK this
                            for word_prev in sentence_prev.words:
                                word_prev.matched = True
                            # Add the sentence information to the paragraph.
                            if (hash_curr in paragraph_curr.sentences.keys()):
                                paragraph_curr.sentences[hash_curr].append(sentence_prev)
                                paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                            else:
                                paragraph_curr.sentences.update({sentence_prev.hash_value: [sentence_prev]})
                                paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                            break
                        elif (matched_all):
                            sentence_prev.matched = True
                            matched_sentences_prev.append(sentence_prev)

            # If the sentence did not match, then include it in the container
            # of unmatched sentences for further analysis.
            if (not matched_curr):
                sentence_curr = Sentence()
                sentence_curr.value = sentence
                sentence_curr.hash_value = hash_curr
                paragraph_curr.ordered_sentences.append(sentence_curr.hash_value)
                if (sentence_curr.hash_value in paragraph_curr.sentences.keys()):
                    paragraph_curr.sentences[sentence_curr.hash_value].append(sentence_curr)
                else:
                    paragraph_curr.sentences.update({sentence_curr.hash_value: [sentence_curr]})
                unmatched_sentences_curr.append(sentence_curr)

    # Identify the unmatched sentences in the previous paragraph revision.
    for paragraph_prev in unmatched_paragraphs_prev:
        for sentence_prev_hash in paragraph_prev.ordered_sentences:
            for sentence_prev in paragraph_prev.sentences[sentence_prev_hash]:
                if (not sentence_prev.matched):
                    unmatched_sentences_prev.append(sentence_prev)
                    sentence_prev.matched = True
                    matched_sentences_prev.append(sentence_prev)

    return (unmatched_sentences_curr, unmatched_sentences_prev,
            matched_sentences_prev, total_sentences)
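
# Illustrative sketch only: sentences above are matched by hashing their
# normalised text (stripped, re-tokenised, re-joined with single spaces).
# Text.calculateHash is assumed here to behave like the MD5 digest shown,
# and plain str.split stands in for Text.splitIntoWords; the real helpers
# live in the Text module.
def _demoSentenceHash():
    import hashlib
    sentence = '  The   quick brown fox. '
    normalised = ' '.join(sentence.split())
    return hashlib.md5(normalised.encode('utf-8')).hexdigest()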
def analyseSentencesInParagraphs(unmatched_paragraphs_curr,
                                 unmatched_paragraphs_prev, revision_curr,
                                 revision_prev, relation):
    # Containers for unmatched and matched sentences.
    unmatched_sentences_curr = []
    unmatched_sentences_prev = []
    matched_sentences_prev = []
    total_sentences = 0

    # Iterate over the unmatched paragraphs of the current revision.
    for paragraph_curr in unmatched_paragraphs_curr:
        # Split the current paragraph into sentences.
        sentences = Text.splitIntoSentences(paragraph_curr.value)
        # Iterate over the sentences of the current paragraph.
        for sentence in sentences:
            # Create the Sentence structure.
            sentence = sentence.strip()
            sentence = ' '.join(Text.splitIntoWords(sentence))
            hash_curr = Text.calculateHash(sentence)
            matched_curr = False
            total_sentences = total_sentences + 1

            # Iterate over the unmatched paragraphs from the previous revision.
            for paragraph_prev in unmatched_paragraphs_prev:
                if (hash_curr in paragraph_prev.sentences.keys()):
                    for sentence_prev in paragraph_prev.sentences[hash_curr]:
                        if (not sentence_prev.matched):
                            matched_one = False
                            matched_all = True
                            for word_prev in sentence_prev.words:
                                if (word_prev.matched):
                                    matched_one = True
                                else:
                                    matched_all = False
                            if (not matched_one):
                                sentence_prev.matched = True
                                matched_curr = True
                                matched_sentences_prev.append(sentence_prev)
                                # TODO: CHECK this
                                for word_prev in sentence_prev.words:
                                    word_prev.matched = True
                                    word_prev.used.append(revision_curr.wikipedia_id)
                                    #if (word_prev.revision in relation.reintroduced.keys()):
                                    #    relation.reintroduced.update({word_prev.revision : relation.reintroduced[word_prev.revision] + 1 })
                                    #else:
                                    #    relation.reintroduced.update({word_prev.revision : 1 })
                                # Add the sentence information to the paragraph.
                                if (hash_curr in paragraph_curr.sentences.keys()):
                                    paragraph_curr.sentences[hash_curr].append(sentence_prev)
                                    paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                                else:
                                    paragraph_curr.sentences.update({sentence_prev.hash_value: [sentence_prev]})
                                    paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                                break
                            elif (matched_all):
                                sentence_prev.matched = True
                                matched_sentences_prev.append(sentence_prev)
                if (matched_curr):
                    break

            # Iterate over the hash table of sentences from old revisions.
            if ((not matched_curr) and (hash_curr in sentences_ht.keys())):
                for sentence_prev in sentences_ht[hash_curr]:
                    if (not sentence_prev.matched):
                        matched_one = False
                        matched_all = True
                        for word_prev in sentence_prev.words:
                            if (word_prev.matched):
                                matched_one = True
                            else:
                                matched_all = False
                        if (not matched_one):
                            sentence_prev.matched = True
                            matched_curr = True
                            matched_sentences_prev.append(sentence_prev)
                            # TODO: CHECK this
                            for word_prev in sentence_prev.words:
                                word_prev.matched = True
                                word_prev.used.append(revision_curr.wikipedia_id)
                                if (revision_prev.wikipedia_id not in word_prev.used):
                                    word_prev.freq.append(revision_curr.wikipedia_id)

                                # Revert: reintroducing something that somebody
                                # else deleted.
                                if (revision_prev.wikipedia_id not in word_prev.used):
                                    for elem in word_prev.deleted:
                                        #if (revision_curr.wikipedia_id == 11):
                                        #    print "Revert in 11", word_prev.value, word_prev.deleted, relation.revert
                                        if (elem in revisions.keys()):
                                            if (revisions[elem].contributor_name !=
                                                    revision_curr.contributor_name):
                                                if (elem in relation.revert.keys()):
                                                    relation.revert.update({elem: relation.revert[elem] + 1})
                                                else:
                                                    relation.revert.update({elem: 1})
                                            else:
                                                if (elem in relation.self_revert.keys()):
                                                    relation.self_revert.update({elem: relation.self_revert[elem] + 1})
                                                else:
                                                    relation.self_revert.update({elem: 1})
                                #print "relation.revert", word_prev.value, word_prev.deleted, relation.revert, revision_curr.wikipedia_id
                                if (revision_prev.wikipedia_id not in word_prev.used):
                                    # NOTE: 'elem' below is whatever value the
                                    # loop over word_prev.deleted left behind;
                                    # if that list was empty, this raises a
                                    # NameError.
                                    if (elem in revisions.keys()):
                                        if (revisions[word_prev.revision].contributor_name !=
                                                revision_curr.contributor_name):
                                            if (word_prev.revision in relation.reintroduced.keys()):
                                                relation.reintroduced.update({word_prev.revision:
                                                    relation.reintroduced[word_prev.revision] + 1})
                                            else:
                                                relation.reintroduced.update({word_prev.revision: 1})
                                        else:
                                            if (word_prev.revision in relation.self_reintroduced.keys()):
                                                relation.self_reintroduced.update({word_prev.revision:
                                                    relation.self_reintroduced[word_prev.revision] + 1})
                                            else:
                                                relation.self_reintroduced.update({word_prev.revision: 1})
                            # Add the sentence information to the paragraph.
                            if (hash_curr in paragraph_curr.sentences.keys()):
                                paragraph_curr.sentences[hash_curr].append(sentence_prev)
                                paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                            else:
                                paragraph_curr.sentences.update({sentence_prev.hash_value: [sentence_prev]})
                                paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                            break
                        elif (matched_all):
                            sentence_prev.matched = True
                            matched_sentences_prev.append(sentence_prev)

            # If the sentence did not match, then include it in the container
            # of unmatched sentences for further analysis.
            if (not matched_curr):
                sentence_curr = Sentence()
                sentence_curr.value = sentence
                sentence_curr.hash_value = hash_curr
                paragraph_curr.ordered_sentences.append(sentence_curr.hash_value)
                if (sentence_curr.hash_value in paragraph_curr.sentences.keys()):
                    paragraph_curr.sentences[sentence_curr.hash_value].append(sentence_curr)
                else:
                    paragraph_curr.sentences.update({sentence_curr.hash_value: [sentence_curr]})
                unmatched_sentences_curr.append(sentence_curr)

    # Identify the unmatched sentences in the previous paragraph revision.
    for paragraph_prev in unmatched_paragraphs_prev:
        for sentence_prev_hash in paragraph_prev.ordered_sentences:
            for sentence_prev in paragraph_prev.sentences[sentence_prev_hash]:
                if (not sentence_prev.matched):
                    unmatched_sentences_prev.append(sentence_prev)
                    sentence_prev.matched = True
                    matched_sentences_prev.append(sentence_prev)

    return (unmatched_sentences_curr, unmatched_sentences_prev,
            matched_sentences_prev, total_sentences)
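
# Design note (illustrative helper, not used above): every relation.* update
# in the two relation-aware functions follows the same increment-or-initialise
# pattern; dict.get collapses the four-line if/else into a single line with
# identical behaviour, e.g. _bumpCounter(relation.deleted, word_prev.revision).
def _bumpCounter(counter, key):
    counter[key] = counter.get(key, 0) + 1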