from difflib import Differ

# NOTE: the Text helpers (splitIntoWords, splitIntoSentences, calculateHash,
# computeAvgWordFreq), the Word and Sentence structures, the revisions
# dictionary, the sentences_ht hash table and the WORD_DENSITY / WORD_ID
# globals are expected to be defined elsewhere in this module.


def analyseWordsInSentences(unmatched_sentences_curr, unmatched_sentences_prev,
                            revision_curr, possible_vandalism):
    matched_words_prev = []
    unmatched_words_prev = []

    # Split sentences into words.
    text_prev = []
    for sentence_prev in unmatched_sentences_prev:
        for word_prev in sentence_prev.words:
            if (not word_prev.matched):
                text_prev.append(word_prev.value)
                unmatched_words_prev.append(word_prev)

    text_curr = []
    for sentence_curr in unmatched_sentences_curr:
        splitted = Text.splitIntoWords(sentence_curr.value)
        text_curr.extend(splitted)
        sentence_curr.splitted.extend(splitted)

    # Edit consists of removing sentences, not adding new content.
    if (len(text_curr) == 0):
        return (matched_words_prev, False)

    # SPAM detection.
    if (possible_vandalism):
        density = Text.computeAvgWordFreq(text_curr, revision_curr.wikipedia_id)
        if (density > WORD_DENSITY):
            return (matched_words_prev, possible_vandalism)
        else:
            possible_vandalism = False

    # Nothing left to match against: every current word is new.
    if (len(text_prev) == 0):
        for sentence_curr in unmatched_sentences_curr:
            for word in sentence_curr.splitted:
                word_curr = Word()
                word_curr.author_id = revision_curr.contributor_id
                word_curr.author_name = revision_curr.contributor_name
                word_curr.revision = revision_curr.wikipedia_id
                word_curr.value = word
                sentence_curr.words.append(word_curr)
        return (matched_words_prev, possible_vandalism)

    d = Differ()
    diff = list(d.compare(text_prev, text_curr))
    for sentence_curr in unmatched_sentences_curr:
        for word in sentence_curr.splitted:
            curr_matched = False
            pos = 0
            while (pos < len(diff)):
                word_diff = diff[pos]
                if (word == word_diff[2:]):
                    if (word_diff[0] == ' '):
                        # Unchanged word: re-attach the previous Word object.
                        for word_prev in unmatched_words_prev:
                            if ((not word_prev.matched) and
                                    (word_prev.value == word)):
                                word_prev.matched = True
                                curr_matched = True
                                sentence_curr.words.append(word_prev)
                                matched_words_prev.append(word_prev)
                                diff[pos] = ''
                                pos = len(diff) + 1  # exit the while loop
                                break
                    elif (word_diff[0] == '-'):
                        # Word removed relative to the previous revision.
                        for word_prev in unmatched_words_prev:
                            if ((not word_prev.matched) and
                                    (word_prev.value == word)):
                                word_prev.matched = True
                                matched_words_prev.append(word_prev)
                                diff[pos] = ''
                                break
                    elif (word_diff[0] == '+'):
                        # Word added in the current revision.
                        curr_matched = True
                        word_curr = Word()
                        word_curr.value = word
                        word_curr.author_id = revision_curr.contributor_id
                        word_curr.author_name = revision_curr.contributor_name
                        word_curr.revision = revision_curr.wikipedia_id
                        sentence_curr.words.append(word_curr)
                        diff[pos] = ''
                        pos = len(diff) + 1  # exit the while loop
                pos = pos + 1
            if (not curr_matched):
                # No usable diff entry: treat the word as newly added.
                word_curr = Word()
                word_curr.value = word
                word_curr.author_id = revision_curr.contributor_id
                word_curr.author_name = revision_curr.contributor_name
                word_curr.revision = revision_curr.wikipedia_id
                sentence_curr.words.append(word_curr)

    return (matched_words_prev, possible_vandalism)
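
# Illustrative sketch only (not part of the algorithm above): it shows the
# difflib.Differ entries the matching loop consumes. Every entry carries a
# two-character prefix -- '  ' for unchanged, '- ' for deleted, '+ ' for
# added -- which is why the loop dispatches on word_diff[0] and strips the
# prefix with word_diff[2:]. The word lists here are made up for the demo.
def _demoDifferOutput():
    from difflib import Differ
    text_prev = ['the', 'quick', 'brown', 'fox']
    text_curr = ['the', 'slow', 'brown', 'fox', 'jumps']
    diff = list(Differ().compare(text_prev, text_curr))
    # diff == ['  the', '- quick', '+ slow', '  brown', '  fox', '+ jumps']
    return diff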
def analyseWordsInSentences(unmatched_sentences_curr, unmatched_sentences_prev,
                            revision_curr, possible_vandalism, relation):
    matched_words_prev = []
    unmatched_words_prev = []
    global WORD_ID

    # Split sentences into words.
    text_prev = []
    for sentence_prev in unmatched_sentences_prev:
        for word_prev in sentence_prev.words:
            if (not word_prev.matched):
                text_prev.append(word_prev.value)
                unmatched_words_prev.append(word_prev)

    text_curr = []
    for sentence_curr in unmatched_sentences_curr:
        splitted = Text.splitIntoWords(sentence_curr.value)
        text_curr.extend(splitted)
        sentence_curr.splitted.extend(splitted)

    # Edit consists of removing sentences, not adding new content.
    if (len(text_curr) == 0):
        return (matched_words_prev, False)

    # SPAM detection.
    if (possible_vandalism):
        density = Text.computeAvgWordFreq(text_curr, revision_curr.wikipedia_id)
        if (density > WORD_DENSITY):
            return (matched_words_prev, possible_vandalism)
        else:
            possible_vandalism = False

    # Nothing left to match against: every current word is new.
    if (len(text_prev) == 0):
        for sentence_curr in unmatched_sentences_curr:
            for word in sentence_curr.splitted:
                word_curr = Word()
                word_curr.internal_id = WORD_ID
                word_curr.author_id = revision_curr.contributor_id
                word_curr.author_name = revision_curr.contributor_name
                word_curr.revision = revision_curr.wikipedia_id
                word_curr.value = word
                sentence_curr.words.append(word_curr)
                word_curr.used.append(revision_curr.wikipedia_id)
                relation.added = relation.added + 1
                WORD_ID = WORD_ID + 1
        return (matched_words_prev, possible_vandalism)

    d = Differ()
    diff = list(d.compare(text_prev, text_curr))
    for sentence_curr in unmatched_sentences_curr:
        for word in sentence_curr.splitted:
            curr_matched = False
            pos = 0
            while (pos < len(diff)):
                word_diff = diff[pos]
                if (word == word_diff[2:]):
                    if (word_diff[0] == ' '):
                        # Unchanged word: re-attach the previous Word object.
                        for word_prev in unmatched_words_prev:
                            if ((not word_prev.matched) and
                                    (word_prev.value == word)):
                                word_prev.used.append(revision_curr.wikipedia_id)
                                word_prev.matched = True
                                curr_matched = True
                                sentence_curr.words.append(word_prev)
                                matched_words_prev.append(word_prev)
                                diff[pos] = ''
                                pos = len(diff) + 1  # exit the while loop
                                #if (word_prev.revision in relation.reintroduced.keys()):
                                #    relation.reintroduced.update({word_prev.revision : relation.reintroduced[word_prev.revision] + 1 })
                                #else:
                                #    relation.reintroduced.update({word_prev.revision : 1 })
                                break
                    elif (word_diff[0] == '-'):
                        # Word removed: record the deletion and attribute it to
                        # the revision that originally introduced the word.
                        for word_prev in unmatched_words_prev:
                            if ((not word_prev.matched) and
                                    (word_prev.value == word)):
                                word_prev.matched = True
                                matched_words_prev.append(word_prev)
                                diff[pos] = ''
                                word_prev.deleted.append(revision_curr.wikipedia_id)
                                if (revisions[word_prev.revision].contributor_name !=
                                        revision_curr.contributor_name):
                                    if (word_prev.revision in relation.deleted.keys()):
                                        relation.deleted.update({word_prev.revision:
                                            relation.deleted[word_prev.revision] + 1})
                                    else:
                                        relation.deleted.update({word_prev.revision: 1})
                                else:
                                    if (word_prev.revision in relation.self_deleted.keys()):
                                        relation.self_deleted.update({word_prev.revision:
                                            relation.self_deleted[word_prev.revision] + 1})
                                    else:
                                        relation.self_deleted.update({word_prev.revision: 1})
                                break
                    elif (word_diff[0] == '+'):
                        # Word added in the current revision.
                        curr_matched = True
                        word_curr = Word()
                        word_curr.internal_id = WORD_ID
                        word_curr.value = word
                        word_curr.author_id = revision_curr.contributor_id
                        word_curr.author_name = revision_curr.contributor_name
                        word_curr.revision = revision_curr.wikipedia_id
                        word_curr.used.append(revision_curr.wikipedia_id)
                        sentence_curr.words.append(word_curr)
                        relation.added = relation.added + 1
                        WORD_ID = WORD_ID + 1
                        diff[pos] = ''
                        pos = len(diff) + 1  # exit the while loop
                pos = pos + 1
            if (not curr_matched):
                # No usable diff entry: treat the word as newly added.
                word_curr = Word()
                word_curr.internal_id = WORD_ID
                word_curr.value = word
                word_curr.author_id = revision_curr.contributor_id
                word_curr.author_name = revision_curr.contributor_name
                word_curr.revision = revision_curr.wikipedia_id
                word_curr.used.append(revision_curr.wikipedia_id)
                sentence_curr.words.append(word_curr)
                relation.added = relation.added + 1
                WORD_ID = WORD_ID + 1

    return (matched_words_prev, possible_vandalism)
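
# A minimal sketch of the per-word record the functions above assume. The
# field names come straight from the code, but this class is a hypothetical
# stand-in for the project's real Word structure, which lives in its own
# module. A Relation would similarly carry an added counter plus the
# deleted/self_deleted/revert/self_revert/reintroduced/self_reintroduced
# dictionaries keyed by revision id.
class _WordSketch(object):
    def __init__(self):
        self.internal_id = 0      # value of the global WORD_ID at creation
        self.value = ''           # the token itself
        self.author_id = None     # contributor id of the adding revision
        self.author_name = None   # contributor name of the adding revision
        self.revision = None      # wikipedia_id of the adding revision
        self.matched = False      # set during diff matching
        self.used = []            # revisions in which the word appears
        self.deleted = []         # revisions that deleted the word
        self.freq = []            # revisions that reintroduced the word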
def analyseSentencesInParagraphs(unmatched_paragraphs_curr,
                                 unmatched_paragraphs_prev, revision_curr):
    # Containers for unmatched and matched sentences.
    unmatched_sentences_curr = []
    unmatched_sentences_prev = []
    matched_sentences_prev = []
    total_sentences = 0

    # Iterate over the unmatched paragraphs of the current revision.
    for paragraph_curr in unmatched_paragraphs_curr:
        # Split the current paragraph into sentences.
        sentences = Text.splitIntoSentences(paragraph_curr.value)
        # Iterate over the sentences of the current paragraph.
        for sentence in sentences:
            # Create the Sentence structure.
            sentence = sentence.strip()
            sentence = ' '.join(Text.splitIntoWords(sentence))
            hash_curr = Text.calculateHash(sentence)
            matched_curr = False
            total_sentences = total_sentences + 1

            # Iterate over the unmatched paragraphs from the previous revision.
            for paragraph_prev in unmatched_paragraphs_prev:
                if (hash_curr in paragraph_prev.sentences.keys()):
                    for sentence_prev in paragraph_prev.sentences[hash_curr]:
                        if (not sentence_prev.matched):
                            matched_one = False
                            matched_all = True
                            for word_prev in sentence_prev.words:
                                if (word_prev.matched):
                                    matched_one = True
                                else:
                                    matched_all = False
                            if (not matched_one):
                                sentence_prev.matched = True
                                matched_curr = True
                                matched_sentences_prev.append(sentence_prev)
                                # TODO: CHECK this
                                for word_prev in sentence_prev.words:
                                    word_prev.matched = True
                                # Add the sentence information to the paragraph.
                                if (hash_curr in paragraph_curr.sentences.keys()):
                                    paragraph_curr.sentences[hash_curr].append(sentence_prev)
                                    paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                                else:
                                    paragraph_curr.sentences.update({sentence_prev.hash_value: [sentence_prev]})
                                    paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                                break
                            elif (matched_all):
                                sentence_prev.matched = True
                                matched_sentences_prev.append(sentence_prev)
                if (matched_curr):
                    break

            # Iterate over the hash table of sentences from old revisions.
            if ((not matched_curr) and (hash_curr in sentences_ht.keys())):
                for sentence_prev in sentences_ht[hash_curr]:
                    if (not sentence_prev.matched):
                        matched_one = False
                        matched_all = True
                        for word_prev in sentence_prev.words:
                            if (word_prev.matched):
                                matched_one = True
                            else:
                                matched_all = False
                        if (not matched_one):
                            sentence_prev.matched = True
                            matched_curr = True
                            matched_sentences_prev.append(sentence_prev)
                            # TODO: CHECK this
                            for word_prev in sentence_prev.words:
                                word_prev.matched = True
                            # Add the sentence information to the paragraph.
                            if (hash_curr in paragraph_curr.sentences.keys()):
                                paragraph_curr.sentences[hash_curr].append(sentence_prev)
                                paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                            else:
                                paragraph_curr.sentences.update({sentence_prev.hash_value: [sentence_prev]})
                                paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                            break
                        elif (matched_all):
                            sentence_prev.matched = True
                            matched_sentences_prev.append(sentence_prev)

            # If the sentence did not match, then include it in the container
            # of unmatched sentences for further analysis.
            if (not matched_curr):
                sentence_curr = Sentence()
                sentence_curr.value = sentence
                sentence_curr.hash_value = hash_curr
                paragraph_curr.ordered_sentences.append(sentence_curr.hash_value)
                if (sentence_curr.hash_value in paragraph_curr.sentences.keys()):
                    paragraph_curr.sentences[sentence_curr.hash_value].append(sentence_curr)
                else:
                    paragraph_curr.sentences.update({sentence_curr.hash_value: [sentence_curr]})
                unmatched_sentences_curr.append(sentence_curr)

    # Identify the unmatched sentences in the previous paragraph revision.
    for paragraph_prev in unmatched_paragraphs_prev:
        for sentence_prev_hash in paragraph_prev.ordered_sentences:
            for sentence_prev in paragraph_prev.sentences[sentence_prev_hash]:
                if (not sentence_prev.matched):
                    unmatched_sentences_prev.append(sentence_prev)
                    sentence_prev.matched = True
                    matched_sentences_prev.append(sentence_prev)

    return (unmatched_sentences_curr, unmatched_sentences_prev,
            matched_sentences_prev, total_sentences)
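
# Illustrative sketch only: sentences above are matched by hashing their
# normalised text (stripped, re-tokenised, re-joined with single spaces).
# Text.calculateHash is assumed here to behave like the MD5 digest shown,
# and plain str.split stands in for Text.splitIntoWords; the real helpers
# live in the Text module.
def _demoSentenceHash():
    import hashlib
    sentence = '  The   quick brown fox. '
    normalised = ' '.join(sentence.split())
    return hashlib.md5(normalised.encode('utf-8')).hexdigest()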
def analyseSentencesInParagraphs(unmatched_paragraphs_curr,
                                 unmatched_paragraphs_prev, revision_curr,
                                 revision_prev, relation):
    # Containers for unmatched and matched sentences.
    unmatched_sentences_curr = []
    unmatched_sentences_prev = []
    matched_sentences_prev = []
    total_sentences = 0

    # Iterate over the unmatched paragraphs of the current revision.
    for paragraph_curr in unmatched_paragraphs_curr:
        # Split the current paragraph into sentences.
        sentences = Text.splitIntoSentences(paragraph_curr.value)
        # Iterate over the sentences of the current paragraph.
        for sentence in sentences:
            # Create the Sentence structure.
            sentence = sentence.strip()
            sentence = ' '.join(Text.splitIntoWords(sentence))
            hash_curr = Text.calculateHash(sentence)
            matched_curr = False
            total_sentences = total_sentences + 1

            # Iterate over the unmatched paragraphs from the previous revision.
            for paragraph_prev in unmatched_paragraphs_prev:
                if (hash_curr in paragraph_prev.sentences.keys()):
                    for sentence_prev in paragraph_prev.sentences[hash_curr]:
                        if (not sentence_prev.matched):
                            matched_one = False
                            matched_all = True
                            for word_prev in sentence_prev.words:
                                if (word_prev.matched):
                                    matched_one = True
                                else:
                                    matched_all = False
                            if (not matched_one):
                                sentence_prev.matched = True
                                matched_curr = True
                                matched_sentences_prev.append(sentence_prev)
                                # TODO: CHECK this
                                for word_prev in sentence_prev.words:
                                    word_prev.matched = True
                                    word_prev.used.append(revision_curr.wikipedia_id)
                                    #if (word_prev.revision in relation.reintroduced.keys()):
                                    #    relation.reintroduced.update({word_prev.revision : relation.reintroduced[word_prev.revision] + 1 })
                                    #else:
                                    #    relation.reintroduced.update({word_prev.revision : 1 })
                                # Add the sentence information to the paragraph.
                                if (hash_curr in paragraph_curr.sentences.keys()):
                                    paragraph_curr.sentences[hash_curr].append(sentence_prev)
                                    paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                                else:
                                    paragraph_curr.sentences.update({sentence_prev.hash_value: [sentence_prev]})
                                    paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                                break
                            elif (matched_all):
                                sentence_prev.matched = True
                                matched_sentences_prev.append(sentence_prev)
                if (matched_curr):
                    break

            # Iterate over the hash table of sentences from old revisions.
            if ((not matched_curr) and (hash_curr in sentences_ht.keys())):
                for sentence_prev in sentences_ht[hash_curr]:
                    if (not sentence_prev.matched):
                        matched_one = False
                        matched_all = True
                        for word_prev in sentence_prev.words:
                            if (word_prev.matched):
                                matched_one = True
                            else:
                                matched_all = False
                        if (not matched_one):
                            sentence_prev.matched = True
                            matched_curr = True
                            matched_sentences_prev.append(sentence_prev)
                            # TODO: CHECK this
                            for word_prev in sentence_prev.words:
                                word_prev.matched = True
                                word_prev.used.append(revision_curr.wikipedia_id)
                                if (revision_prev.wikipedia_id not in word_prev.used):
                                    word_prev.freq.append(revision_curr.wikipedia_id)

                                # Revert: reintroducing something that somebody
                                # else deleted.
                                if (revision_prev.wikipedia_id not in word_prev.used):
                                    for elem in word_prev.deleted:
                                        #if (revision_curr.wikipedia_id == 11):
                                        #    print "Revert in 11", word_prev.value, word_prev.deleted, relation.revert
                                        if (elem in revisions.keys()):
                                            if (revisions[elem].contributor_name !=
                                                    revision_curr.contributor_name):
                                                if (elem in relation.revert.keys()):
                                                    relation.revert.update({elem: relation.revert[elem] + 1})
                                                else:
                                                    relation.revert.update({elem: 1})
                                            else:
                                                if (elem in relation.self_revert.keys()):
                                                    relation.self_revert.update({elem: relation.self_revert[elem] + 1})
                                                else:
                                                    relation.self_revert.update({elem: 1})
                                #print "relation.revert", word_prev.value, word_prev.deleted, relation.revert, revision_curr.wikipedia_id
                                if (revision_prev.wikipedia_id not in word_prev.used):
                                    # NOTE: 'elem' below is whatever value the
                                    # loop over word_prev.deleted left behind;
                                    # if that list was empty, this raises a
                                    # NameError.
                                    if (elem in revisions.keys()):
                                        if (revisions[word_prev.revision].contributor_name !=
                                                revision_curr.contributor_name):
                                            if (word_prev.revision in relation.reintroduced.keys()):
                                                relation.reintroduced.update({word_prev.revision:
                                                    relation.reintroduced[word_prev.revision] + 1})
                                            else:
                                                relation.reintroduced.update({word_prev.revision: 1})
                                        else:
                                            if (word_prev.revision in relation.self_reintroduced.keys()):
                                                relation.self_reintroduced.update({word_prev.revision:
                                                    relation.self_reintroduced[word_prev.revision] + 1})
                                            else:
                                                relation.self_reintroduced.update({word_prev.revision: 1})
                            # Add the sentence information to the paragraph.
                            if (hash_curr in paragraph_curr.sentences.keys()):
                                paragraph_curr.sentences[hash_curr].append(sentence_prev)
                                paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                            else:
                                paragraph_curr.sentences.update({sentence_prev.hash_value: [sentence_prev]})
                                paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                            break
                        elif (matched_all):
                            sentence_prev.matched = True
                            matched_sentences_prev.append(sentence_prev)

            # If the sentence did not match, then include it in the container
            # of unmatched sentences for further analysis.
            if (not matched_curr):
                sentence_curr = Sentence()
                sentence_curr.value = sentence
                sentence_curr.hash_value = hash_curr
                paragraph_curr.ordered_sentences.append(sentence_curr.hash_value)
                if (sentence_curr.hash_value in paragraph_curr.sentences.keys()):
                    paragraph_curr.sentences[sentence_curr.hash_value].append(sentence_curr)
                else:
                    paragraph_curr.sentences.update({sentence_curr.hash_value: [sentence_curr]})
                unmatched_sentences_curr.append(sentence_curr)

    # Identify the unmatched sentences in the previous paragraph revision.
    for paragraph_prev in unmatched_paragraphs_prev:
        for sentence_prev_hash in paragraph_prev.ordered_sentences:
            for sentence_prev in paragraph_prev.sentences[sentence_prev_hash]:
                if (not sentence_prev.matched):
                    unmatched_sentences_prev.append(sentence_prev)
                    sentence_prev.matched = True
                    matched_sentences_prev.append(sentence_prev)

    return (unmatched_sentences_curr, unmatched_sentences_prev,
            matched_sentences_prev, total_sentences)
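
# Design note (illustrative helper, not used above): every relation.* update
# in the two relation-aware functions follows the same increment-or-initialise
# pattern; dict.get collapses the four-line if/else into a single line with
# identical behaviour, e.g. _bumpCounter(relation.deleted, word_prev.revision).
def _bumpCounter(counter, key):
    counter[key] = counter.get(key, 0) + 1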