def count_num_ops_phrase(phrase, junk=None):
    """
    Counts the number of operations to apply for the given phrase.

    Args:
        phrase: The diff phrase to evaluate.
        junk: Passed on unchanged to util.ignore_phrase() and
            choose.apply_rearrange_phrase() (presumably the junk to ignore;
            verify against those helpers). Defaults to a fresh empty list.

    Returns:
        A tuple (all_num_ops, all_num_ops_abs) of two Counters. Both are
        empty if the phrase matches none of the handled phrase kinds.
    """
    # Use a fresh list per call instead of a shared mutable default.
    if junk is None:
        junk = []

    # Check for RearrangePhrase *before* phrase to ignore, because
    # RearrangePhrase has sub_phrases. The decision, if we have to ignore a
    # phrase should be made on the sub_phrases, not the rearrange phrase
    # itself (for example, a very long RearrangePhrase could contain a junk
    # word and hence, the whole phrase would be ignored).
    if isinstance(phrase, rearr.DiffRearrangePhrase):
        applied = choose.apply_rearrange_phrase(phrase, junk)
    elif util.ignore_phrase(phrase, junk):
        applied = choose.apply_ignored_phrase(phrase)
    elif isinstance(phrase, diff.DiffCommonPhrase):
        applied = choose.apply_common_phrase(phrase)
    elif isinstance(phrase, diff.DiffReplacePhrase):
        applied = choose.apply_replace_phrase(phrase)
    else:
        # Unknown kind of phrase: nothing to count.
        return Counter(), Counter()

    # Each apply_* helper returns (op_type, num_ops, num_ops_abs, vis); only
    # the two operation counts are of interest here.
    _, num_ops, num_ops_abs, _ = applied

    all_num_ops = Counter()
    all_num_ops_abs = Counter()
    all_num_ops.update(num_ops)
    all_num_ops_abs.update(num_ops_abs)
    return all_num_ops, all_num_ops_abs
def divide_phrases_per_para(diff_phrases, junk=None):
    """
    Divides the given diff phrases to get phrases per paragraph.

    Args:
        diff_phrases: An iterable of diff phrases.
        junk: Passed on unchanged to util.ignore_phrase() (presumably the
            junk to ignore; verify against that helper). Defaults to a fresh
            empty list.

    Returns:
        A list containing the ignored phrases as they are and, for every
        other phrase, the phrases returned by divide_phrase_per_para(), in
        the order of the input.
    """
    # Use a fresh list per call instead of a shared mutable default.
    if junk is None:
        junk = []

    result = []

    # Keep track of the previous actual word and of the previous target word.
    prev_word_actual = prev_word_target = None
    # Keep track of the previous actual word that has a counterpart in target
    # and of the previous target that has a counterpart in actual.
    prev_mated_actual = prev_mated_target = None

    # Iterate through the phrases and divide them.
    for phrase in diff_phrases:
        if util.ignore_phrase(phrase, junk):
            # Ignored phrases are passed through undivided and don't affect
            # the tracked previous words.
            result.append(phrase)
            continue

        (divided_phrases,
         prev_word_actual, prev_word_target,
         prev_mated_actual, prev_mated_target) = divide_phrase_per_para(
            phrase,
            prev_word_actual, prev_word_target,
            prev_mated_actual, prev_mated_target)
        result.extend(divided_phrases)

    return result
def visualize_phrase(phrase, junk=None):
    """
    Collects the visualization instructions for the given diff phrase.

    Args:
        phrase: The diff phrase to visualize.
        junk: Passed on unchanged to util.ignore_phrase() and
            choose.apply_rearrange_phrase() (presumably the junk to ignore;
            verify against those helpers). Defaults to a fresh empty list.

    Returns:
        A new list with the visualization entries returned by the matching
        choose.apply_* helper; an empty list if the phrase matches none of
        the handled phrase kinds.
    """
    # Use a fresh list per call instead of a shared mutable default.
    if junk is None:
        junk = []

    # Check for RearrangePhrase *before* phrase to ignore, because
    # RearrangePhrase has sub_phrases. The decision, if we have to ignore a
    # phrase should be made on the sub_phrases, not the rearrange phrase
    # itself (for example, a very long RearrangePhrase could contain a junk
    # word and hence, the whole phrase would be ignored).
    if isinstance(phrase, rearr.DiffRearrangePhrase):
        applied = choose.apply_rearrange_phrase(phrase, junk)
    elif util.ignore_phrase(phrase, junk):
        applied = choose.apply_ignored_phrase(phrase)
    elif isinstance(phrase, diff.DiffCommonPhrase):
        applied = choose.apply_common_phrase(phrase)
    elif isinstance(phrase, diff.DiffReplacePhrase):
        applied = choose.apply_replace_phrase(phrase)
    else:
        # Unknown kind of phrase: nothing to visualize.
        return []

    # Each apply_* helper returns (op_type, num_ops, num_ops_abs, vis); only
    # the visualization is of interest here.
    _, _, _, vis = applied

    visualizations = []
    visualizations.extend(vis)
    return visualizations
def _unwrap_doc_word(diff_word):
    """
    Returns the DocWord behind the given DiffWord, or None if any level of
    the hierarchy DiffWord > ParaWord > DocWord is missing.
    """
    if diff_word is None:
        return None
    para_word = diff_word.wrapped
    if para_word is None:
        return None
    return para_word.wrapped


def visualize_diff_phrases(evaluation_result, junk=None):
    """
    Annotates the diff phrases of the given evaluation result and visualizes
    the result.

    Every phrase in evaluation_result["phrases"] gets the attributes 'ignore',
    'op_type' (if the phrase is of a handled kind), 'tex_line_num_start',
    'tex_column_num_start', 'tex_line_num_end' and 'tex_column_num_end'.
    The position attributes are -1 if the phrase has no target words or if
    the DocWords of its first and last target word can't be resolved.
    Afterwards, visualize() is called with the evaluation result.

    Args:
        evaluation_result: A dict-like object; its "phrases" entry holds the
            diff phrases. Nothing is done if there is no such entry.
        junk: Passed on unchanged to util.ignore_phrase() and
            choose.apply_rearrange_phrase() (presumably the junk to ignore;
            verify against those helpers). Defaults to a fresh empty list.

    Returns:
        None.
    """
    # Use a fresh list per call instead of a shared mutable default.
    if junk is None:
        junk = []

    diff_phrases = evaluation_result.get("phrases", None)
    if diff_phrases is None:
        return

    for phrase in diff_phrases:
        # Decide if we apply the phrase by word operations or by paragraph
        # operations.
        phrase.ignore = False
        if isinstance(phrase, rearr.DiffRearrangePhrase):
            op_type, _, _, _ = choose.apply_rearrange_phrase(phrase, junk)
            phrase.op_type = op_type
        elif util.ignore_phrase(phrase, junk):
            op_type, _, _, _ = choose.apply_ignored_phrase(phrase)
            phrase.op_type = op_type
            phrase.ignore = True
        elif isinstance(phrase, diff.DiffCommonPhrase):
            # NOTE(review): Other dispatchers in this file use
            # choose.apply_common_phrase() for common phrases. Confirm that
            # apply_ignored_phrase() is intended here.
            op_type, _, _, _ = choose.apply_ignored_phrase(phrase)
            phrase.op_type = op_type
        elif isinstance(phrase, diff.DiffReplacePhrase):
            op_type, _, _, _ = choose.apply_replace_phrase(phrase)
            phrase.op_type = op_type

        # Obtain the start- and end line and column numbers in tex file.
        # -1 denotes an unknown position.
        tex_line_num_start = tex_line_num_end = -1
        tex_column_num_start = tex_column_num_end = -1

        words_target = phrase.words_target
        if len(words_target) > 0:
            # Line and column numbers are placed in DocWords.
            # The hierarchy is as follows: DiffWord > ParaWord > DocWord
            first_doc_word_target = _unwrap_doc_word(words_target[0])
            last_doc_word_target = _unwrap_doc_word(words_target[-1])

            # Only take the position if both ends could be resolved.
            # Otherwise keep -1 so that the phrase still gets all position
            # attributes instead of being left half-annotated.
            if (first_doc_word_target is not None
                    and last_doc_word_target is not None):
                tex_line_num_start = first_doc_word_target.line_num
                tex_column_num_start = first_doc_word_target.column_num
                tex_line_num_end = last_doc_word_target.line_num
                tex_column_num_end = last_doc_word_target.column_num

        phrase.tex_line_num_start = tex_line_num_start
        phrase.tex_column_num_start = tex_column_num_start
        phrase.tex_line_num_end = tex_line_num_end
        phrase.tex_column_num_end = tex_column_num_end

    visualize(evaluation_result)
def para_diff(actual, target, junk=None):
    """
    Finds the differences between the two given lists of paragraphs.

    Args:
        actual: The actual paragraphs, as (nested) list of words.
        target: The target paragraphs, as (nested) list of words.
        junk: Passed on unchanged to rearr.rearrange() and
            util.ignore_phrase() (presumably the junk to ignore; verify
            against those helpers). Defaults to a fresh empty list.

    Returns:
        A list with one entry per item of the rearrange result: Commons,
        Rearranges or Replaces objects for diff.Common, rearr.Rearrange and
        diff.Replace items; any other item is passed through as it is.
    """
    # Use a fresh list per call instead of a shared mutable default.
    if junk is None:
        junk = []

    # 'actual' and 'target' may be arbitrarily nested list of words.
    # In our case, both lists are once nested list of words of paragraphs.
    # example: 'actual_paras' = [['words', 'of', 'first', 'paragraph'], [...]]

    # Flatten the list of words to be able to do a word-based diff.
    actual_flatten = flatten(actual)
    target_flatten = flatten(target)

    # 'actual_flatten' and 'target_flatten' are now flat lists of tuples. Each
    # tuple (<word>, <flat_pos>, <pos_stack>) consists of:
    #   <word>      : The word
    #   <flat_pos>  : The pos of word in flat representation of original list.
    #   <pos_stack> : The position stack as list. The i-th element denotes the
    #                 position of the word in the original list at level i.
    # example: flatten([['foo', 'bar'], ['baz']])
    #          = [('foo', 0, [0, 0]), ('bar', 1, [0, 1]), ('baz', 2, [1, 0])]

    # Do a word-based diff on 'actual_flatten' and 'target_flatten'.
    # The result is a list of diff.Common and diff.Replace objects denoting
    # the operations to perform to transform actual_flatten into
    # target_flatten. Both objects contain the related elements in
    # 'actual_flatten' and 'target_flatten'.
    #
    # examples:
    #
    # (= [('foo', 0, [0, 0]), ('bar', 1, [0, 1]), ('baz', 2, [1, 0])],
    #    [('foo', 0, [0, 0]), ('bar', 1, [0, 1]), ('baz', 2, [1, 0])])
    # denotes a common object including the related elements in
    # actual_flatten and the elements in target_flatten. It implies that
    # "foo bar baz" occurs in both lists.
    #
    # (/ [('foo', 0, [0, 0]), ('bar', 1, [0, 1]), ('baz', 2, [1, 0])],
    #    [('doo', 0, [0, 0])])
    # denotes a replace object including the related elements in
    # actual_flatten and the elements in target_flatten. It implies that
    # "foo bar baz" in 'actual' is replaced by "doo" in 'target'.
    #
    # One of the element lists in diff.Replace objects may be empty, denoting
    # either an insertion or a deletion.
    diff_result = diff.diff(actual_flatten, target_flatten)

    # There could be phrases that occur in both, 'actual_flatten' and
    # 'target_flatten' but their order doesn't correspond. Try to identify and
    # rearrange such phrases.
    rearrange_result = rearr.rearrange(diff_result, junk)

    # The rearrange result is now a flat list of diff.Common, diff.Replace and
    # rearr.Rearrange objects and doesn't meet any paragraph structures.
    # So we need to split (and merge) the objects now to get operations per
    # paragraph.
    para_result = []

    # Keep track of the previous actual item and the previous target item to
    # be able to decide where to split the objects.
    prev_item_actual = None
    prev_item_target = None

    for item in rearrange_result:
        if isinstance(item, diff.Common):
            item = Commons(item, prev_item_actual, prev_item_target)
        elif isinstance(item, rearr.Rearrange):
            item = Rearranges(item, prev_item_actual, prev_item_target)
        elif isinstance(item, diff.Replace):
            item = Replaces(item, prev_item_actual, prev_item_target)

        # TODO: Obtain the previous actual item and the previous target item.
        if item and item.phrases:
            # The previous actual item is the last actual item of the last
            # phrase that is neither a Delete nor ignored.
            actual_phrases = [
                p for p in item.phrases
                if not isinstance(p, Delete)
                and not util.ignore_phrase(p, junk)
            ]
            if actual_phrases:
                last_phrase = actual_phrases[-1]
                if last_phrase.items_actual:
                    prev_item_actual = last_phrase.items_actual[-1]

            # The previous target item is the last target item of the last
            # phrase that is neither an Insert nor ignored.
            target_phrases = [
                p for p in item.phrases
                if not isinstance(p, Insert)
                and not util.ignore_phrase(p, junk)
            ]
            if target_phrases:
                last_phrase = target_phrases[-1]
                if last_phrase.items_target:
                    prev_item_target = last_phrase.items_target[-1]

        para_result.append(item)

    # NOTE: The result is returned unmerged; a merge(para_result) step is
    # currently disabled.
    return para_result
def apply_para_ops_rearrange_phrase(phrase, junk=None):
    """
    Simulates the given rearrange phrase by applying para ops.

    Args:
        phrase: The rearrange phrase. Must have at least one target word,
            because the position of the visualization is taken from
            phrase.words_target[0].
        junk: Passed on unchanged to util.ignore_phrase() (presumably the
            junk to ignore; verify against that helper). Defaults to a fresh
            empty list.

    Returns:
        A tuple (num_ops, num_ops_abs, vis, force_para_ops), where num_ops
        and num_ops_abs are the operation counts returned by
        apply_split_merge(), vis is a one-element list [(text, pos)] with the
        related visualization and force_para_ops is a bool (see below).
    """
    # Use a fresh list per call instead of a shared mutable default.
    if junk is None:
        junk = []

    # Start with one rearrange and add one for each para break between two
    # consecutive actual words (the first word is paired with None).
    num_para_rearranges = 1
    if phrase.num_words_actual() > 0:
        prev_word_actual = None
        for word_actual in phrase.words_actual:
            if is_para_break(prev_word_actual, word_actual):
                num_para_rearranges += 1
            prev_word_actual = word_actual

    # Count number of operations.
    num_ops = Counter({"num_para_rearranges": num_para_rearranges})
    num_ops_abs = Counter(
        {"num_para_rearranges_abs": phrase.num_words_actual()})

    # Add the operations and the visualizations of the sub phrases.
    vis_parts = []
    for sub_phrase in phrase.sub_phrases:
        if util.ignore_phrase(sub_phrase, junk):
            applied = apply_ignored_phrase(sub_phrase)
        elif isinstance(sub_phrase, diff.DiffCommonPhrase):
            applied = apply_common_phrase(sub_phrase)
        elif isinstance(sub_phrase, diff.DiffReplacePhrase):
            applied = apply_replace_phrase(sub_phrase)
        else:
            # Unknown kind of sub phrase: contributes nothing.
            continue

        # Each apply_* helper returns (op_type, num_ops, num_ops_abs, vis);
        # the op type is not needed here.
        _, sub_num_ops, sub_num_ops_abs, sub_vis = applied
        num_ops.update(sub_num_ops)
        num_ops_abs.update(sub_num_ops_abs)
        vis_parts.extend(sub_vis)

    # Create visualization.
    text = "".join(v[0] for v in vis_parts)

    # Define the position.
    pos = phrase.words_target[0].pos

    first_word_actual, last_word_actual = None, None
    if phrase.num_words_actual() > 0:
        first_word_actual = phrase.words_actual[0]
        last_word_actual = phrase.words_actual[-1]

    is_para_break_actual_before = is_paragraph_break_before(first_word_actual)
    is_para_break_actual_after = is_paragraph_break_after(last_word_actual)

    # Force a split only if there is definitely no para break (False). The
    # identity checks below also distinguish None from False.
    force_split_before = is_para_break_actual_before is False
    force_split_after = is_para_break_actual_after is False

    # TODO
    force_para_ops = (
        is_para_break_actual_before is None
        or is_para_break_actual_before is True
    ) and (
        is_para_break_actual_after is None
        or is_para_break_actual_after is True
    )

    # Apply split/merge operations.
    num_ops, num_ops_abs, vis = apply_split_merge(
        phrase, num_ops, num_ops_abs, text,
        force_split_before, force_split_after)
    vis = blue_bg(vis)

    return num_ops, num_ops_abs, [(vis, pos)], force_para_ops
def ignore_diff_item(item, junk):
    """
    Decides whether the given diff item has to be ignored. The item may be a
    DiffPhrase, a SplitParagraph or a MergeParagraph.

    Only DiffPhrases can be ignored: for them, the decision is delegated to
    util.ignore_phrase(); for any other item the result is False.
    """
    if not isinstance(item, diff.DiffPhrase):
        # Only phrases are checked for junk.
        return False
    return util.ignore_phrase(item, junk)