def analyseParagraphsInRevision(revision_curr, revision_prev, text_curr):
    """Match the paragraphs of the current revision text against earlier
    revisions.

    Each paragraph of *text_curr* is hashed and matched first against the
    immediately previous revision, then against the global hash table of
    paragraphs from older revisions (``paragraphs_ht``).  A matched paragraph
    (including all of its sentences and words) is flagged as matched and
    re-linked into *revision_curr*; unmatched paragraphs are returned so that
    they can be analysed at sentence level.

    Relies on module-level globals ``Text``, ``Paragraph`` and
    ``paragraphs_ht``.  Mutates *revision_curr* (``paragraphs``,
    ``ordered_paragraphs``) and the ``matched`` flags of reused paragraphs.

    Returns a tuple ``(unmatched_paragraphs_curr, unmatched_paragraphs_prev,
    matched_paragraphs_prev)``.
    """
    unmatched_paragraphs_curr = []
    unmatched_paragraphs_prev = []
    matched_paragraphs_prev = []

    def attach(par):
        # Link a paragraph object into the current revision, keeping the
        # hash -> list map and the ordered hash list in sync.  (Replaces the
        # duplicated if/else insert code of the original.)
        revision_curr.paragraphs.setdefault(par.hash_value, []).append(par)
        revision_curr.ordered_paragraphs.append(par.hash_value)

    def mark_matched(par):
        # Flag a previously seen paragraph and all its sentences and words.
        par.matched = True
        for sentence_list in par.sentences.values():
            for sentence_prev in sentence_list:
                sentence_prev.matched = True
                for word_prev in sentence_prev.words:
                    word_prev.matched = True

    # Split the text of the current revision into paragraphs and try to
    # match each one by hash value.
    for paragraph in Text.splitIntoParagraphs(text_curr):
        paragraph = paragraph.strip()
        hash_curr = Text.calculateHash(paragraph)
        matched_curr = False

        # 1) Try the immediately previous revision first.
        if hash_curr in revision_prev.ordered_paragraphs:
            for paragraph_prev in revision_prev.paragraphs[hash_curr]:
                if not paragraph_prev.matched:
                    matched_curr = True
                    mark_matched(paragraph_prev)
                    matched_paragraphs_prev.append(paragraph_prev)
                    attach(paragraph_prev)
                    break

        # 2) Fall back to the hash table of paragraphs of older revisions.
        if (not matched_curr) and (hash_curr in paragraphs_ht):
            for paragraph_prev in paragraphs_ht[hash_curr]:
                if not paragraph_prev.matched:
                    matched_curr = True
                    mark_matched(paragraph_prev)
                    matched_paragraphs_prev.append(paragraph_prev)
                    attach(paragraph_prev)
                    break

        # 3) No match anywhere: build a fresh Paragraph for deeper
        # (sentence-level) analysis.
        if not matched_curr:
            paragraph_curr = Paragraph()
            # Reuse the hash computed above instead of hashing a second time
            # (the original recomputed Text.calculateHash here).
            paragraph_curr.hash_value = hash_curr
            paragraph_curr.value = paragraph
            attach(paragraph_curr)
            unmatched_paragraphs_curr.append(paragraph_curr)

    # Paragraphs of the previous revision that stayed unmatched also need
    # sentence-level analysis.
    for paragraph_prev_hash in revision_prev.ordered_paragraphs:
        for paragraph_prev in revision_prev.paragraphs[paragraph_prev_hash]:
            if not paragraph_prev.matched:
                unmatched_paragraphs_prev.append(paragraph_prev)
    return (unmatched_paragraphs_curr, unmatched_paragraphs_prev,
            matched_paragraphs_prev)
def analyseParagraphsInRevision(revision_curr, revision_prev, text_curr, relation):
    # Purpose: split the current revision's text into paragraphs and match each
    # one (by hash value) first against the previous revision, then against the
    # global hash table of older paragraphs, while recording authorship
    # relations (word usage, reverts, reintroductions) onto `relation`.
    # Relies on module-level globals: Text, Paragraph, paragraphs_ht, revisions.
    # Returns (unmatched_paragraphs_curr, unmatched_paragraphs_prev,
    # matched_paragraphs_prev) for further sentence-level analysis.
    # Containers for unmatched and matched paragraphs.
    unmatched_paragraphs_curr = []
    unmatched_paragraphs_prev = []
    matched_paragraphs_prev = []
    # Split the text of the current revision into paragraphs.
    paragraphs = Text.splitIntoParagraphs(text_curr)
    # Iterate over the paragraphs of the current version.
    for paragraph in paragraphs:
        # Build Paragraph structure and calculate hash value.
        paragraph = paragraph.strip()
        hash_curr = Text.calculateHash(paragraph)
        matched_curr = False
        # If the paragraph is in the previous revision,
        # update the authorship information and mark both paragraphs as matched (also in HT).
        if (hash_curr in revision_prev.ordered_paragraphs):
            for paragraph_prev in revision_prev.paragraphs[hash_curr]:
                if (not paragraph_prev.matched):
                    matched_curr = True
                    paragraph_prev.matched = True
                    matched_paragraphs_prev.append(paragraph_prev)
                    # TODO: added this (CHECK).
                    # Propagate the match down to every sentence and word of
                    # the reused paragraph; record that each word is used
                    # again in the current revision.
                    for hash_sentence_prev in paragraph_prev.sentences.keys():
                        for sentence_prev in paragraph_prev.sentences[hash_sentence_prev]:
                            sentence_prev.matched = True
                            for word_prev in sentence_prev.words:
                                word_prev.matched = True
                                word_prev.used.append(revision_curr.wikipedia_id)
                                #if (word_prev.revision in relation.reintroduced.keys()):
                                #    relation.reintroduced.update({word_prev.revision : relation.reintroduced[word_prev.revision] + 1 })
                                #else:
                                #    relation.reintroduced.update({word_prev.revision : 1 })
                    # Add paragraph to current revision.
                    if (hash_curr in revision_curr.paragraphs.keys()):
                        revision_curr.paragraphs[paragraph_prev.hash_value].append(paragraph_prev)
                        revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)
                    else:
                        revision_curr.paragraphs.update({paragraph_prev.hash_value : [paragraph_prev]})
                        revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)
                    break
        # If the paragraph is not in the previous revision, but it is in an older revision,
        # update the authorship information and mark both paragraphs as matched.
        if ((not matched_curr) and (hash_curr in paragraphs_ht)):
            for paragraph_prev in paragraphs_ht[hash_curr]:
                if (not paragraph_prev.matched):
                    matched_curr = True
                    paragraph_prev.matched = True
                    matched_paragraphs_prev.append(paragraph_prev)
                    # TODO: added this (CHECK).
                    for hash_sentence_prev in paragraph_prev.sentences.keys():
                        for sentence_prev in paragraph_prev.sentences[hash_sentence_prev]:
                            sentence_prev.matched = True
                            for word_prev in sentence_prev.words:
                                word_prev.matched = True
                                word_prev.used.append(revision_curr.wikipedia_id)
                                # Word resurfaces after being absent from the
                                # previous revision: count a new occurrence.
                                # (The check is against revision_prev's id, so
                                # the append just above does not affect it.)
                                if (revision_prev.wikipedia_id not in word_prev.used):
                                    word_prev.freq.append(revision_curr.wikipedia_id)
                                # Revert: reintroducing something that somebody else deleted,
                                # (and was not used in the previous revision)
                                if (revision_prev.wikipedia_id not in word_prev.used):
                                    #if (revision_curr.wikipedia_id == 11):
                                    #    print "Revert in 11", word_prev.value, word_prev.deleted, relation.revert
                                    for elem in word_prev.deleted:
                                        if (elem in revisions.keys()):
                                            if (revisions[elem].contributor_name != revision_curr.contributor_name):
                                                if (elem in relation.revert.keys()):
                                                    relation.revert.update({elem : relation.revert[elem] + 1})
                                                else:
                                                    relation.revert.update({elem : 1})
                                            else:
                                                # Same contributor deleted and restored it: self-revert.
                                                if (elem in relation.self_revert.keys()):
                                                    relation.self_revert.update({elem : relation.self_revert[elem] + 1})
                                                else:
                                                    relation.self_revert.update({elem : 1})
                                # Reintroduction: credit the revision that originally added the word.
                                # NOTE(review): this guard tests `elem` (left
                                # over from the loop above, possibly unbound if
                                # word_prev.deleted was empty) but then indexes
                                # revisions[word_prev.revision] -- the guard
                                # presumably should be on word_prev.revision;
                                # verify intent before relying on this branch.
                                if (revision_prev.wikipedia_id not in word_prev.used):
                                    if (elem in revisions.keys()):
                                        if (revisions[word_prev.revision].contributor_name != revision_curr.contributor_name):
                                            if (word_prev.revision in relation.reintroduced.keys()):
                                                relation.reintroduced.update({word_prev.revision : relation.reintroduced[word_prev.revision] + 1 })
                                            else:
                                                relation.reintroduced.update({word_prev.revision : 1 })
                                        else:
                                            if (word_prev.revision in relation.self_reintroduced.keys()):
                                                relation.self_reintroduced.update({word_prev.revision : relation.self_reintroduced[word_prev.revision] + 1})
                                            else:
                                                relation.self_reintroduced.update({word_prev.revision : 1})
                    # Add paragraph to current revision.
                    if (hash_curr in revision_curr.paragraphs.keys()):
                        revision_curr.paragraphs[paragraph_prev.hash_value].append(paragraph_prev)
                        revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)
                    else:
                        revision_curr.paragraphs.update({paragraph_prev.hash_value : [paragraph_prev]})
                        revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)
                    break
        # If the paragraph did not match with previous revisions,
        # add to container of unmatched paragraphs for further analysis.
        if (not matched_curr):
            paragraph_curr = Paragraph()
            paragraph_curr.hash_value = Text.calculateHash(paragraph)
            paragraph_curr.value = paragraph
            revision_curr.ordered_paragraphs.append(paragraph_curr.hash_value)
            if (paragraph_curr.hash_value in revision_curr.paragraphs.keys()):
                revision_curr.paragraphs[paragraph_curr.hash_value].append(paragraph_curr)
            else:
                revision_curr.paragraphs.update({paragraph_curr.hash_value : [paragraph_curr]})
            unmatched_paragraphs_curr.append(paragraph_curr)
    # Identify unmatched paragraphs in previous revision for further analysis.
    for paragraph_prev_hash in revision_prev.ordered_paragraphs:
        for paragraph_prev in revision_prev.paragraphs[paragraph_prev_hash]:
            if (not paragraph_prev.matched):
                unmatched_paragraphs_prev.append(paragraph_prev)
    return (unmatched_paragraphs_curr, unmatched_paragraphs_prev, matched_paragraphs_prev)
def analyseParagraphsInRevision(revision_curr, revision_prev, text_curr, relation):
    """Split the current revision's text into paragraphs, match each one by
    hash against earlier revisions, and record authorship relations.

    Matching order per paragraph:
      1. the immediately previous revision (*revision_prev*);
      2. the global hash table of paragraphs of older revisions
         (``paragraphs_ht``).
    A matched paragraph (and all of its sentences/words) is flagged as matched
    and re-linked into *revision_curr*.  For words that resurface from an
    older revision, the revert / self_revert and reintroduced /
    self_reintroduced counters on *relation* are updated.  Paragraphs with no
    match are returned for sentence-level analysis.

    Relies on module-level globals ``Text``, ``Paragraph``, ``paragraphs_ht``
    and ``revisions``.

    Returns a tuple ``(unmatched_paragraphs_curr, unmatched_paragraphs_prev,
    matched_paragraphs_prev)``.
    """
    unmatched_paragraphs_curr = []
    unmatched_paragraphs_prev = []
    matched_paragraphs_prev = []

    def bump(counter, key):
        # Increment counter[key], creating the entry on first use.
        counter[key] = counter.get(key, 0) + 1

    def attach(par):
        # Link a paragraph object into the current revision, keeping the
        # hash -> list map and the ordered hash list in sync.
        revision_curr.paragraphs.setdefault(par.hash_value, []).append(par)
        revision_curr.ordered_paragraphs.append(par.hash_value)

    # Split the text of the current revision into paragraphs.
    paragraphs = Text.splitIntoParagraphs(text_curr)
    for paragraph in paragraphs:
        paragraph = paragraph.strip()
        hash_curr = Text.calculateHash(paragraph)
        matched_curr = False

        # 1) The paragraph already existed in the previous revision.
        if hash_curr in revision_prev.ordered_paragraphs:
            for paragraph_prev in revision_prev.paragraphs[hash_curr]:
                if not paragraph_prev.matched:
                    matched_curr = True
                    paragraph_prev.matched = True
                    matched_paragraphs_prev.append(paragraph_prev)
                    # Propagate the match to sentences and words; every word
                    # is used again in the current revision.
                    for sentence_list in paragraph_prev.sentences.values():
                        for sentence_prev in sentence_list:
                            sentence_prev.matched = True
                            for word_prev in sentence_prev.words:
                                word_prev.matched = True
                                word_prev.used.append(revision_curr.wikipedia_id)
                    attach(paragraph_prev)
                    break

        # 2) Not in the previous revision, but present in an older one.
        if (not matched_curr) and (hash_curr in paragraphs_ht):
            for paragraph_prev in paragraphs_ht[hash_curr]:
                if not paragraph_prev.matched:
                    matched_curr = True
                    paragraph_prev.matched = True
                    matched_paragraphs_prev.append(paragraph_prev)
                    for sentence_list in paragraph_prev.sentences.values():
                        for sentence_prev in sentence_list:
                            sentence_prev.matched = True
                            for word_prev in sentence_prev.words:
                                word_prev.matched = True
                                # The word was absent from the previous
                                # revision iff prev's id is missing from its
                                # usage list; evaluate once before recording
                                # the current usage (the original re-tested
                                # the same condition three times).
                                resurfaced = (revision_prev.wikipedia_id
                                              not in word_prev.used)
                                word_prev.used.append(revision_curr.wikipedia_id)
                                if resurfaced:
                                    word_prev.freq.append(revision_curr.wikipedia_id)
                                    # Revert: the current author restores text
                                    # that earlier revisions had deleted.
                                    for elem in word_prev.deleted:
                                        if elem in revisions:
                                            if (revisions[elem].contributor_name !=
                                                    revision_curr.contributor_name):
                                                bump(relation.revert, elem)
                                            else:
                                                bump(relation.self_revert, elem)
                                    # Reintroduction: credit the revision that
                                    # originally added the word.
                                    # BUG FIX: the original guarded this with
                                    # `elem in revisions.keys()` -- a stale
                                    # (possibly unbound) variable left over
                                    # from the loop above -- while indexing
                                    # revisions[word_prev.revision].  Guard the
                                    # key actually used.
                                    if word_prev.revision in revisions:
                                        if (revisions[word_prev.revision].contributor_name !=
                                                revision_curr.contributor_name):
                                            bump(relation.reintroduced,
                                                 word_prev.revision)
                                        else:
                                            bump(relation.self_reintroduced,
                                                 word_prev.revision)
                    attach(paragraph_prev)
                    break

        # 3) No match anywhere: create a fresh Paragraph for sentence-level
        # analysis.
        if not matched_curr:
            paragraph_curr = Paragraph()
            # Reuse the hash computed above (the original re-hashed here).
            paragraph_curr.hash_value = hash_curr
            paragraph_curr.value = paragraph
            attach(paragraph_curr)
            unmatched_paragraphs_curr.append(paragraph_curr)

    # Paragraphs of the previous revision that found no partner must also be
    # analysed at sentence level.
    for paragraph_prev_hash in revision_prev.ordered_paragraphs:
        for paragraph_prev in revision_prev.paragraphs[paragraph_prev_hash]:
            if not paragraph_prev.matched:
                unmatched_paragraphs_prev.append(paragraph_prev)
    return (unmatched_paragraphs_curr, unmatched_paragraphs_prev,
            matched_paragraphs_prev)
def analyseParagraphsInRevision(revision_curr, revision_prev, text_curr):
    """Hash-match the paragraphs of *text_curr* against earlier revisions.

    Every paragraph is tried first against *revision_prev* and then against
    the module-level hash table ``paragraphs_ht`` of older paragraphs.  A hit
    flags the old paragraph (with all its sentences and words) as matched and
    re-attaches it to *revision_curr*; a miss produces a fresh ``Paragraph``.

    Returns ``(unmatched_paragraphs_curr, unmatched_paragraphs_prev,
    matched_paragraphs_prev)`` for the subsequent sentence-level diff.
    """
    unmatched_paragraphs_curr = []
    unmatched_paragraphs_prev = []
    matched_paragraphs_prev = []

    for raw in Text.splitIntoParagraphs(text_curr):
        stripped = raw.strip()
        hash_curr = Text.calculateHash(stripped)
        matched_curr = False

        # Candidate pools, tried in order: the previous revision first, then
        # the hash table of all older revisions.
        pools = []
        if hash_curr in revision_prev.ordered_paragraphs:
            pools.append(revision_prev.paragraphs[hash_curr])
        if hash_curr in paragraphs_ht:
            pools.append(paragraphs_ht[hash_curr])

        for pool in pools:
            for candidate in pool:
                if candidate.matched:
                    continue
                matched_curr = True
                candidate.matched = True
                matched_paragraphs_prev.append(candidate)
                # TODO: added this (CHECK).
                # Mark every sentence and word of the reused paragraph.
                for sentences in candidate.sentences.values():
                    for sentence in sentences:
                        sentence.matched = True
                        for word in sentence.words:
                            word.matched = True
                # Re-attach the reused paragraph to the current revision.
                bucket = revision_curr.paragraphs
                if candidate.hash_value in bucket:
                    bucket[candidate.hash_value].append(candidate)
                else:
                    bucket[candidate.hash_value] = [candidate]
                revision_curr.ordered_paragraphs.append(candidate.hash_value)
                break
            if matched_curr:
                break

        # No match in any earlier revision: keep the paragraph for the
        # deeper, sentence-level analysis.
        if not matched_curr:
            fresh = Paragraph()
            fresh.hash_value = Text.calculateHash(stripped)
            fresh.value = stripped
            revision_curr.ordered_paragraphs.append(fresh.hash_value)
            if fresh.hash_value in revision_curr.paragraphs:
                revision_curr.paragraphs[fresh.hash_value].append(fresh)
            else:
                revision_curr.paragraphs[fresh.hash_value] = [fresh]
            unmatched_paragraphs_curr.append(fresh)

    # Collect the paragraphs of the previous revision that found no partner.
    for prev_hash in revision_prev.ordered_paragraphs:
        for leftover in revision_prev.paragraphs[prev_hash]:
            if not leftover.matched:
                unmatched_paragraphs_prev.append(leftover)

    return (unmatched_paragraphs_curr, unmatched_paragraphs_prev,
            matched_paragraphs_prev)