import logging
import time

from deltas import segment_matcher

logger = logging.getLogger(__name__)


def process_operations(a, b):
    start = time.time()
    operations = list(segment_matcher.diff(a, b))
    logger.debug("diff() of {0} and {1} tokens took {2} seconds."
                 .format(len(a), len(b), time.time() - start))
    return operations, a, b
from deltas import segment_matcher
from deltas.tokenizers import wikitext_split


def process_operations(parent_revision_text, revision_text):
    # treat missing revision text (e.g. deleted revisions) as empty
    parent_revision_text = parent_revision_text or ''
    revision_text = revision_text or ''
    a = wikitext_split.tokenize(parent_revision_text)
    b = wikitext_split.tokenize(revision_text)
    return list(segment_matcher.diff(a, b)), a, b
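The operations returned by segment_matcher.diff() carry token ranges (a1/a2 into the old sequence, b1/b2 into the new one), so they can be mapped back to text. A small sketch continuing from the snippet above; the render_ops helper is illustrative, not part of deltas:

def render_ops(ops, a, b):
    # print each operation alongside the text it covers on both sides
    for op in ops:
        print(op.name,
              repr("".join(a[op.a1:op.a2])), "->",
              repr("".join(b[op.b1:op.b2])))

ops, a, b = process_operations("Hello world.", "Hello there, world.")
render_ops(ops, a, b)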
# a method of a wrapper class that carries a tokenizer and a segmenter
def diff(self, last, current):
    # accept either raw text or pre-tokenized sequences
    if isinstance(last, str):
        last = self.tokenizer.tokenize(last)
    if isinstance(current, str):
        current = self.tokenizer.tokenize(current)
    deltas_ops = segment_matcher.diff(last, current, segmenter=self.segmenter)
    operations = [Operation.from_delta_op(op, last, current)
                  for op in deltas_ops]
    # net change in size, in characters and in UTF-8 bytes
    char_diff = len("".join(current)) - len("".join(last))
    byte_diff = (len(bytes("".join(current), 'utf-8', 'replace')) -
                 len(bytes("".join(last), 'utf-8', 'replace')))
    return Delta(char_diff, byte_diff, operations)
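Operation and Delta above are project-specific containers, not part of the deltas library. A minimal sketch of what they might look like, purely illustrative and not the project's actual definitions:

from dataclasses import dataclass
from typing import List


@dataclass
class Operation:
    # hypothetical container mirroring a deltas operation's fields
    name: str
    a1: int
    a2: int
    b1: int
    b2: int

    @classmethod
    def from_delta_op(cls, op, last, current):
        return cls(op.name, op.a1, op.a2, op.b1, op.b2)


@dataclass
class Delta:
    char_diff: int
    byte_diff: int
    operations: List[Operation]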
import sys
sys.path.insert(0, ".")  # prefer the local checkout of deltas

from deltas import sequence_matcher, segment_matcher
from deltas.segmenters import ParagraphsSentencesAndWhitespace

a = ["This", "is", "a", "sentence", ".", " ",
     "This", "isn't", "a", "sentence", "."]
b = ["This", "isn't", "a", "sentence", ".", " ",
     "This", "is", "a", "sentence", "."]

print("Comparing:")
print(" - A: {0}".format(a))
print(" - B: {0}".format(b))
print("\n")

print("Longest common substring:")
for operation in sequence_matcher.diff(a, b):
    print("--> " + str(operation))
print("\n")

print("Segment matcher:")
for operation in segment_matcher.diff(
        a, b, segmenter=ParagraphsSentencesAndWhitespace()):
    print("--> " + str(operation))
import time

from deltas import segment_matcher


def segment_random():
    # average diff time over 25 runs on two randomly generated token sequences
    start = time.time()
    for _ in range(25):
        operations = list(segment_matcher.diff(random1_tokens, random2_tokens))
    print("\trandom: {0}".format((time.time() - start) / 25))


def segment_common():
    # average diff time over 25 runs on two token sequences with shared content
    start = time.time()
    for _ in range(25):
        operations = list(segment_matcher.diff(common1_tokens, common2_tokens))
    print("\tcommon: {0}".format((time.time() - start) / 25))
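Both benchmarks reference module-level token lists that are not shown here. One hypothetical way to build such fixtures; the sizes and construction are assumptions, not the original setup:

import random
import string

def random_tokens(n):
    # n pseudo-word tokens of five lowercase letters each
    return ["".join(random.choices(string.ascii_lowercase, k=5))
            for _ in range(n)]

random1_tokens = random_tokens(5000)
random2_tokens = random_tokens(5000)
# mostly shared content with a differing tail
common1_tokens = random1_tokens[:4500] + random_tokens(500)
common2_tokens = random1_tokens[:4500] + random_tokens(500)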
import collections

from deltas import segment_matcher
from deltas.tokenizers import text_split

# tokenize_prep() and reverse_tokenize_prep() are project helpers defined elsewhere


def compare_rev_regexes(current, prev, revision_diff_bool, revert):
    # find the difference between the new rev and the old rev
    # NO CHANGE
    if revision_diff_bool == 0:
        diffs = '{{EMPTYBABY}}'
    # edit is a revert, we don't do anything...
    elif revert:
        diffs = '{{EMPTYBABY}}'
    # THERE WAS SOME CHANGE
    else:
        diffs = []
        current = tokenize_prep(current)
        prev = tokenize_prep(prev)
        # deltas
        current_t = text_split.tokenize(current)
        prev_t = text_split.tokenize(prev)
        operations = segment_matcher.diff(prev_t, current_t)
        # structures to keep track of delta changes
        op_names = []
        op_names_noequal = []
        op_changes = []
        op_changes_noequal = []
        # for each delta change in this revision
        for op in operations:
            # e.g. insert: p = "", c = "WPNPOV, WPNPOV"
            c = "".join(current_t[op.b1:op.b2]).strip()
            p = "".join(prev_t[op.a1:op.a2]).strip()
            # no interest in empties, the equal [] --> [] case
            if p == "," and c == ",":
                continue
            # not empty, but strip leading/trailing commas while leaving internal commas in
            if len(c) > 1:
                if c[0] == ",":
                    c = c[1:].strip()
                if c[-1] == ",":
                    c = c[:-1].strip()
            if len(p) > 1:
                if p[0] == ",":
                    p = p[1:].strip()
                if p[-1] == ",":
                    p = p[:-1].strip()
            op_changes.append(c)
            op_names.append(op.name)
            if op.name != "equal":
                # if what gets appended is '', we know that a delete has occurred
                op_changes_noequal.append(c)
                op_names_noequal.append(op.name)

        # now process the cases of the diff, going through the operations
        # there is just one insert OR delete somewhere
        if len(op_names_noequal) == 1 and op_names_noequal[0] == "insert":
            diffs.append(op_changes_noequal[0])
        elif len(op_names_noequal) == 1 and op_names_noequal[0] == "delete":
            pass
        # there are just multiple inserts (no deletes)
        elif "delete" not in op_names_noequal and "insert" in op_names_noequal:
            for change in op_changes_noequal:
                diffs.append(change)
        # there are just a bunch of deletes (no inserts); continue on
        elif "insert" not in op_names_noequal and "delete" in op_names_noequal:
            pass
        # something more complicated is afoot: inserts AND deletes
        else:
            # comparing the regexes in current and prev as collections
            intersection = (collections.Counter(prev.split(", ")) &
                            collections.Counter(current.split(", ")))
            union = (collections.Counter(prev.split(", ")) |
                     collections.Counter(current.split(", ")))
            opn_counts = collections.Counter(op_names_noequal)
            new_in_current = (collections.Counter(current.split(", ")) -
                              collections.Counter(prev.split(", ")))
            # prev and current are completely different: multiple deletes AND
            # inserts, and 'equal' is not in op_names
            if "equal" not in op_names and intersection == collections.Counter():
                for new in current.split(", "):
                    diffs.append(new)
            # prev and current have the same contents in a different order; we
            # assume the page has been rearranged and there is nothing to add.
            # The same text could be deleted and re-added as new, but that seems
            # unlikely within one edit, and we have to make a design choice here.
            elif intersection == union:
                pass
            # there is some overlap between the policy invocations of the prev
            # and current revisions; figure out the meaningful differences
            else:
                # op_names are the names of each delta op, in order,
                #   e.g. ['insert', 'delete', 'equal']
                # op_names_noequal are only the inserts/deletes,
                #   e.g. ['insert', 'delete']
                # op_changes are the CURRENT strings for each segment delta, in
                #   order of op_names, e.g. ['WPNPOV', '', 'WikipediaRun, Wikipedia Run']
                # opn_counts tells us how many of each operation exist in the delta
                # new_in_current is a Counter of the regexes that are new in
                #   current (not in prev)
                # one insert, one or multiple deletes: we only care about the insert
                if opn_counts["insert"] == 1 and opn_counts["delete"] >= 1:
                    temp = [op_changes[i] for i in range(len(op_names))
                            if op_names[i] == "insert"]
                    for t in temp:
                        diffs.append(t)
                # multiple inserts, one or multiple deletes: we only care about
                # the inserts that didn't simply exist before
                elif opn_counts["insert"] > 1 and opn_counts["delete"] >= 1:
                    temp = []
                    for item in new_in_current:
                        for i in range(new_in_current[item]):
                            temp.append(item)
                    diffs = diffs + temp
                # cases not anticipated: just add what exists in the new but not the old
                else:
                    temp = []
                    for item in new_in_current:
                        for i in range(new_in_current[item]):
                            temp.append(item)
                    diffs = diffs + temp

    # make the diff list into a string
    if diffs == "{{EMPTYBABY}}":
        diff_string = ""
    else:
        diff_string = reverse_tokenize_prep(", ".join(diffs))
    assert "{{EMPTYBABY}}" not in diff_string, "EMPTYBABY placeholder detected!"
    return diff_string
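A hypothetical smoke test for the function above; tokenize_prep and reverse_tokenize_prep are stubbed as identity functions here, which the real project helpers almost certainly are not:

def tokenize_prep(text):
    # stub: the real helper normalizes text before tokenization
    return text

def reverse_tokenize_prep(text):
    # stub: the real helper undoes tokenize_prep's normalization
    return text

# one policy invocation added, none removed: should yield roughly "WPVERIFY"
print(compare_rev_regexes("WPNPOV, WPVERIFY", "WPNPOV", 1, False))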
from deltas import segment_matcher
from deltas.tokenizers import wikitext_split


def create_diff_for_revision(prev_rev_id, rev_id, db_filepath):
    # first, retrieve the text of both revisions from the database
    db = get_db(db_filepath)
    try:
        cursor = db.execute(
            "SELECT rev_id, content FROM revisionText WHERE rev_id = ?",
            (prev_rev_id,))
        result = cursor.fetchall()
        if len(result) > 1:
            # this should never happen if the database is properly constructed
            raise ValueError("WARNING: Duplicated rev_id in database, check integrity.")
        if len(result) == 0:
            raise ValueError(f"Failed to find rev_id {prev_rev_id} in database.")
        prev_content = result[0]['content']

        cursor = db.execute(
            "SELECT rev_id, content FROM revisionText WHERE rev_id = ?",
            (rev_id,))
        result = cursor.fetchall()
        if len(result) > 1:
            raise ValueError("WARNING: Duplicated rev_id in database, check integrity.")
        if len(result) == 0:
            raise ValueError(f"Failed to find rev_id {rev_id} in database.")
        curr_content = result[0]['content']
    finally:
        db.close()

    # second, tokenize the texts
    prev_tokens = wikitext_split.tokenize(prev_content)
    curr_tokens = wikitext_split.tokenize(curr_content)

    # third, identify segments that were inserted and removed, tracking the
    # tokens that were added and subtracted
    all_removed_tokens = []
    all_inserted_tokens = []
    delete_count = 0
    insert_count = 0
    for segment in segment_matcher.diff(prev_tokens, curr_tokens):
        if segment.name == 'equal':
            continue
        elif segment.name == 'delete':
            removed_tokens = prev_tokens[segment.a1:segment.a2]
            removed_tokens.insert(0, 'REMOVAL_START')
            removed_tokens.append('REMOVAL_END')
            all_removed_tokens.extend(removed_tokens)
            delete_count += 1
        elif segment.name == 'insert':
            inserted_tokens = curr_tokens[segment.b1:segment.b2]
            inserted_tokens.insert(0, 'INSERTION_START')
            inserted_tokens.append('INSERTION_END')
            all_inserted_tokens.extend(inserted_tokens)
            insert_count += 1
        else:
            raise ValueError('Substitutions are not implemented by the segment matcher.')

    content_token_count = len(curr_tokens)
    # TODO consider avoiding writing out very long articles
    # (e.g. skip when len(curr_tokens) >= 100000)
    return {'prev_rev_id': prev_rev_id,
            'rev_id': rev_id,
            'delete_count': delete_count,
            'insert_count': insert_count,
            'content_token_count': content_token_count,
            'content_tokens': curr_tokens,
            'removed_tokens': all_removed_tokens,
            'inserted_tokens': all_inserted_tokens}
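The get_db helper is not shown in the source. Given the dict-style row access above (result[0]['content']), a plausible sketch would open SQLite with a Row factory; this is an assumption, not the project's actual helper:

import sqlite3

def get_db(db_filepath):
    # hypothetical helper: open the database with dict-like row access,
    # matching the result[0]['content'] lookups above
    db = sqlite3.connect(db_filepath)
    db.row_factory = sqlite3.Row
    return db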
from deltas import segment_matcher


def process_operations(a, b):
    return list(segment_matcher.diff(a, b)), a, b
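A quick illustration of calling this variant with pre-tokenized input; the token lists are made up for the example:

a = ["This", "is", "a", "sentence", "."]
b = ["This", "isn't", "a", "sentence", "."]
ops, a, b = process_operations(a, b)
for op in ops:
    print(op.name, a[op.a1:op.a2], "->", b[op.b1:op.b2])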