Example #1
File: diff.py  Project: he7d3r/revscoring
def process_operations(a, b):
    start = time.time()
    operations = [op for op in segment_matcher.diff(a, b)]
    logger.debug("diff() of {0} and {1} tokens took {2} seconds."
                 .format(len(a), len(b), time.time() - start))

    return operations, a, b
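For context, here is a minimal driver sketch for this helper (not part of the source project). It assumes the deltas package provides segment_matcher and the wikitext_split tokenizer, as the other examples in this listing suggest, and that the excerpt's module-level time and logger names are set up as shown; the sample strings are illustrative.

import logging
import time

from deltas import segment_matcher
from deltas.tokenizers import wikitext_split

logger = logging.getLogger(__name__)  # the excerpt logs its timing through this

# Tokenize two revisions and diff them.
a = wikitext_split.tokenize("This is the old revision.")
b = wikitext_split.tokenize("This is the new revision!")
operations, a, b = process_operations(a, b)
for op in operations:
    print(op.name, op.a1, op.a2, op.b1, op.b2)  # equal/insert/delete spans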
Example #2
def process_operations(parent_revision_text, revision_text):
    parent_revision_text = parent_revision_text or ''
    revision_text = revision_text or ''

    a = wikitext_split.tokenize(parent_revision_text)
    b = wikitext_split.tokenize(revision_text)

    return [op for op in segment_matcher.diff(a, b)], a, b
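The `or ''` guards make this variant safe for a page's first revision, where the parent text is None. A small sketch of that case; the printed values are expectations, not verified output:

# First revision of a page: the None parent becomes '', which should
# tokenize to no tokens, so every operation ought to be an insert.
operations, a, b = process_operations(None, "Brand new article text.")
print(len(a), len(b))                  # expected: 0 vs. the new token count
print([op.name for op in operations])  # expected: ['insert']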
Example #3
def diff(self, last, current):
    if isinstance(last, str):
        last = self.tokenizer.tokenize(last)
    if isinstance(current, str):
        current = self.tokenizer.tokenize(current)

    deltas_ops = segment_matcher.diff(
        last,
        current,
        segmenter=self.segmenter
    )
    operations = [Operation.from_delta_op(op, last, current)
                  for op in deltas_ops]

    char_diff = len("".join(current)) - len("".join(last))
    byte_diff = len(bytes("".join(current), 'utf-8', 'replace')) - \
                len(bytes("".join(last), 'utf-8', 'replace'))

    return Delta(char_diff, byte_diff, operations)
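A hypothetical consumer of the returned value, assuming Delta exposes its three constructor arguments under the same names (char_diff, byte_diff, operations), which this listing does not confirm:

# differ stands in for an instance of the class above, with .tokenizer
# and .segmenter already configured.
delta = differ.diff("old sentence.", "new sentence!")  # strings are tokenized internally
print(delta.char_diff, delta.byte_diff)  # net growth in characters and bytes
for operation in delta.operations:
    print(operation)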
Example #4
import sys
sys.path.insert(0, ".")  # make the local checkout importable before importing deltas

from deltas import sequence_matcher, segment_matcher
from deltas.segmenters import ParagraphsSentencesAndWhitespace


a = ["This", "is", "a", "sentence", ".", "  ",
     "This", "isn't", "a", "sentence", "."]
b = ["This", "isn't", "a", "sentence", ".",
     "  ", "This", "is", "a", "sentence", "."]

print("Comparing:")
print(" - A: {0}".format(a))
print(" - B: {0}".format(b))

print("\n")

print("Longest common substring:")
for operation in sequence_matcher.diff(a, b):
    print("--> " + str(operation))

print("\n")
print("Segment matcher:")
for operation in segment_matcher.diff(
        a, b, segmenter=ParagraphsSentencesAndWhitespace()):
    print("--> " + str(operation))
Example #5
def segment_random():
    start = time.time()
    for _ in range(25):
        operations = list(segment_matcher.diff(random1_tokens, random2_tokens))
    print("\trandom: {0}".format((time.time() - start)/25))
Example #6
def segment_common():
    start = time.time()
    for _ in range(25):
        operations = list(segment_matcher.diff(common1_tokens, common2_tokens))
    print("\tcommon: {0}".format((time.time() - start)/25))
Example #7
def compare_rev_regexes(current, prev, revision_diff_bool, revert):
    # Find the difference between the new revision and the old revision.

    # NO CHANGE
    #print(revision_diff_bool)
    if revision_diff_bool == 0:
        diffs = '{{EMPTYBABY}}'

    # Edit is a revert; we don't do anything.
    elif revert:
        diffs = '{{EMPTYBABY}}'

    # THERE WAS SOME CHANGE
    else:
        diffs = []

        current = tokenize_prep(current)
        prev = tokenize_prep(prev)

        # deltas
        current_t = text_split.tokenize(current)
        prev_t = text_split.tokenize(prev)
        operations = segment_matcher.diff(prev_t, current_t)

        # structures to keep track of delta-changes
        op_names = []
        op_names_noequal = []
        op_changes = []
        op_changes_noequal = []

        # for each delta change in this revision
        for op in operations:
            # e.g. for an insert: p = "", c = "WPNPOV, WPNPOV"
            c = "".join(current_t[op.b1:op.b2]).strip()
            p = "".join(prev_t[op.a1:op.a2]).strip()

            # skip segments that are just the separator comma on both sides
            if p == "," and c == ",":
                continue

            # not empty, but strip a leading/trailing comma while leaving
            # internal commas intact
            if len(c) > 1:
                if c[0] == ",":
                    c = c[1:].strip()
                if c[-1] == ",":
                    c = c[:-1].strip()
            if len(p) > 1:
                if p[0] == ",":
                    p = p[1:].strip()
                if p[-1] == ",":
                    p = p[:-1].strip()

            op_changes.append(c)
            op_names.append(op.name)
            if op.name != "equal":
                # if what gets appended is '', we know that a delete has occurred
                op_changes_noequal.append(c)
                op_names_noequal.append(op.name)
        
        #print("Number of delta operations: {}".format(len(op_names)))
        
        # now we are processing cases of diff going through the operations
        # there is just one insert OR delete somewhere
        if len(op_names_noequal) == 1 and op_names_noequal[0] == "insert":
            diffs.append(op_changes_noequal[0])

        elif len(op_names_noequal) == 1 and op_names_noequal[0] == "delete":
            pass  # a lone delete adds nothing to the diff
        
        # there are just multiple inserts (no deletes)
        elif "delete" not in op_names_noequal and "insert" in op_names_noequal:
            for change in op_changes_noequal:
                diffs.append(change)
        
        # there are just a bunch of deletes (no inserts); continue on 
        elif "insert" not in op_names_noequal and "delete" in op_names_noequal:
            pass

        # something more complicated is afoot: inserts AND deletes
        else:
            # comparing the regexes in current and prev as collections
            intersection = collections.Counter(prev.split(", ")) & collections.Counter(current.split(", "))
            union = collections.Counter(prev.split(", ")) | collections.Counter(current.split(", "))
            opn_counts = collections.Counter(op_names_noequal)

            new_in_current = collections.Counter(current.split(", ")) - collections.Counter(prev.split(", "))
            
            # prev and current are completely different (no 'equal' segments and
            # no shared regexes): this involves multiple deletes AND inserts
            if "equal" not in op_names and intersection == collections.Counter():
                for new in current.split(", "):
                    diffs.append(new)

            # prev and current have the same contents in a different order; we assume
            # the page was re-arranged and there is nothing to add. (The deleted text
            # could in principle be re-added as something new, but that seems unlikely
            # within a single edit, and we have to make a design choice here.)
            elif intersection == union:
                pass

            # there is some overlap in content of policy invocations of prev and current revisions
            # we must figure out the meaningful differences
            else:
                # op_names are the names of each delta op, in order e.g. ['insert','delete','equal']
                # op_names_noequal are only the inserts/deletes e.g. ['insert', 'delete']
                # op_changes are the CURRENT strings for the given segment delta, in order of op_names e.g. ['WPNPOV','','WikipediaRun, Wikipedia Run']
                # opn_counts tell us how many of each operation exist in the delta
                # new_in_current is a collection of the regexes that are new in current (not in prev)

                # one insert, one or multiple deletes
                # we only care about the inserts
                if opn_counts["insert"] == 1 and opn_counts["delete"] >= 1:
                    temp = [op_changes[i] for i in range(0,len(op_names)) if op_names == "insert"]
                    for t in  temp:
                        diffs.append(t)

                # multiple inserts, one or multiple delete
                # we only care about the inserts that didn't exist before
                # we need to make sure that the insert isn't simply something that existed before
                elif opn_counts["insert"] > 1 and opn_counts["delete"] >= 1:
                    #temp = [op_changes[i] for i in range(0,len(op_names)) if op_names == "insert"]
                    #for t in  temp:
                    #    diffs.append(t)

                    #print("new in current: {}".format(new_in_current))

                    temp = []
                    for item in new_in_current:
                        #print("item: {}".format(item))
                        #print(new_in_current[item])
                        for i in range(0,new_in_current[item]):
                            temp.append(item)
                    diffs = diffs + temp

                # cases that I can't think of; just add what exists in the new, but not the old
                else:
                    temp = []
                    for item in new_in_current:
                        for i in range(new_in_current[item]):
                            temp.append(item)
                    diffs = diffs + temp

    # make the diff list into a string

    if diffs == "{{EMPTYBABY}}":
        diff_string = ""
    else:
        diff_string = reverse_tokenize_prep(", ".join(diffs))

    #print("diff_string: {}".format(diffs))
    assert ("{{EMPTYBABY}}" not in diff_string), "EMPTYBABY placeholder detected!"
    return diff_string
Example #8
def create_diff_for_revision(prev_rev_id, rev_id, db_filepath):
    # first, retrieve the text from the database
    try:
        db = get_db(db_filepath)
        
        cursor = db.execute("SELECT rev_id, content FROM revisionText WHERE rev_id = ?", (prev_rev_id,))
        result = cursor.fetchall()
        if len(result) > 1:
            # This should never happen if the database is properly constructed...
            raise ValueError("WARNING: Duplicated rev_id in database, check integrity.")
        if len(result) == 0:
            raise ValueError(f"Failed to find rev_id {rev_id} in database.")
        result = result[0]
        prev_content = result['content']
        
        cursor = db.execute("SELECT rev_id, content FROM revisionText WHERE rev_id = ?", (rev_id,))
        result = cursor.fetchall()
        if len(result) > 1:
            # This should never happen if the database is properly constructed...
            raise ValueError("WARNING: Duplicated rev_id in database, check integrity.")
        if len(result) == 0:
            raise ValueError(f"Failed to find rev_id {rev_id} in database.")
        result = result[0]
        curr_content = result['content']
    finally:
        db.close()
    
    # second, tokenize the texts
    prev_tokens = wikitext_split.tokenize(prev_content)
    curr_tokens = wikitext_split.tokenize(curr_content)
    
    # third, identify segments that were inserted and removed, tracking the tokens that were added and subtracted
    all_removed_tokens = []
    all_inserted_tokens = []
    delete_count = 0
    insert_count = 0
    for segment in segment_matcher.diff(prev_tokens, curr_tokens):
        if segment.name == 'equal':
            continue
        elif segment.name == 'delete':
            removed_tokens = prev_tokens[segment.a1:segment.a2]
            removed_tokens.insert(0, 'REMOVAL_START')
            removed_tokens.append('REMOVAL_END')
            all_removed_tokens.extend(removed_tokens)
            delete_count += 1
        elif segment.name == 'insert':
            inserted_tokens = curr_tokens[segment.b1:segment.b2]
            inserted_tokens.insert(0, 'INSERTION_START')
            inserted_tokens.append('INSERTION_END')
            all_inserted_tokens.extend(inserted_tokens)
            insert_count += 1
        else:
            raise ValueError('Substitutions are not implemented by the segment matcher.')
    content_token_count = len(curr_tokens)
    #if len(curr_tokens) >= 100000: # TODO consider avoiding writing out very long articles
    return {'prev_rev_id': prev_rev_id, 
            'rev_id': rev_id,
            'delete_count': delete_count,
            'insert_count': insert_count,
            'content_token_count': content_token_count,
            'content_tokens': curr_tokens,
            'removed_tokens': all_removed_tokens, 
            'inserted_tokens': all_inserted_tokens}
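A sketch of driving the function above; the revision ids and database path are placeholders, and get_db is assumed to return a connection whose rows support ['content'] access (e.g. sqlite3 with a row factory):

# Hypothetical call: compare two consecutive revisions stored in the
# revisionText table, then summarize what changed.
diff = create_diff_for_revision(12344, 12345, "revisions.sqlite")
print(diff['insert_count'], "insertions,", diff['delete_count'], "deletions")
print("removed:", "".join(diff['removed_tokens'])[:200])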
Example #9
File: diff.py  Project: aetilley/revscoring
def process_operations(a, b):
    return [op for op in segment_matcher.diff(a, b)], a, b