예제 #1
0
def count_num_ops_phrase(phrase, junk=[]):
    """ Tallies the operations needed to apply the given phrase.

    Returns a pair of Counters: the operation counts and the absolute
    operation counts reported by the handler that matches the phrase. Both
    Counters are empty if the phrase matches none of the handled kinds.
    """

    totals, totals_abs = Counter(), Counter()

    # The rearrange test has to come first: a DiffRearrangePhrase carries
    # sub_phrases and the decision whether to ignore something must be taken
    # per sub phrase. Otherwise a single junk word inside a long rearrange
    # phrase would cause the whole phrase to be ignored.
    if isinstance(phrase, rearr.DiffRearrangePhrase):
        handler, args = choose.apply_rearrange_phrase, (phrase, junk)
    elif util.ignore_phrase(phrase, junk):
        handler, args = choose.apply_ignored_phrase, (phrase,)
    elif isinstance(phrase, diff.DiffCommonPhrase):
        handler, args = choose.apply_common_phrase, (phrase,)
    elif isinstance(phrase, diff.DiffReplacePhrase):
        handler, args = choose.apply_replace_phrase, (phrase,)
    else:
        # Unknown kind of phrase: nothing to count.
        return totals, totals_abs

    # Every handler yields (op_type, num_ops, num_ops_abs, vis); only the two
    # counters are of interest here.
    _, ops, ops_abs, _ = handler(*args)
    totals.update(ops)
    totals_abs.update(ops_abs)

    return totals, totals_abs
예제 #2
0
def divide_phrases_per_para(diff_phrases, junk=[]):
    """ Splits the given diff phrases such that we obtain phrases per
    paragraph. Phrases to ignore are passed through unchanged. Returns the
    resulting flat list of phrases. """

    # State threaded through divide_phrase_per_para from phrase to phrase:
    # the last actual/target word seen so far ...
    last_actual, last_target = None, None
    # ... and the last actual/target word that has a counterpart on the
    # other side.
    last_mated_actual, last_mated_target = None, None

    per_para = []
    for current in diff_phrases:
        if util.ignore_phrase(current, junk):
            # Ignored phrases are neither divided nor do they touch the state.
            per_para.append(current)
        else:
            outcome = divide_phrase_per_para(
                current,
                last_actual, last_target,
                last_mated_actual, last_mated_target)
            (pieces,
             last_actual, last_target,
             last_mated_actual, last_mated_target) = outcome
            per_para.extend(pieces)

    return per_para
예제 #3
0
def visualize_phrase(phrase, junk=[]):
    """ Collects the visualization instructions for the given diff phrase.

    Returns the list of visualization entries reported by the handler that
    matches the phrase, or an empty list if no handler matches.
    """

    # The rearrange test has to come first: a DiffRearrangePhrase carries
    # sub_phrases and the decision whether to ignore something must be taken
    # per sub phrase. Otherwise a single junk word inside a long rearrange
    # phrase would cause the whole phrase to be ignored.
    if isinstance(phrase, rearr.DiffRearrangePhrase):
        handler, args = choose.apply_rearrange_phrase, (phrase, junk)
    elif util.ignore_phrase(phrase, junk):
        handler, args = choose.apply_ignored_phrase, (phrase,)
    elif isinstance(phrase, diff.DiffCommonPhrase):
        handler, args = choose.apply_common_phrase, (phrase,)
    elif isinstance(phrase, diff.DiffReplacePhrase):
        handler, args = choose.apply_replace_phrase, (phrase,)
    else:
        # Unknown kind of phrase: nothing to visualize.
        return []

    # Every handler yields (op_type, num_ops, num_ops_abs, vis); only the
    # visualization part is of interest here.
    _, _, _, vis = handler(*args)

    return list(vis)
예제 #4
0
def _doc_word_of(diff_word):
    """ Unwraps the given DiffWord down to its DocWord.

    The hierarchy is DiffWord > ParaWord > DocWord, each level being reachable
    via the 'wrapped' attribute. Returns None if the diff word itself or any
    level below it is None.
    """
    para_word = diff_word.wrapped if diff_word is not None else None
    return para_word.wrapped if para_word is not None else None


def visualize_diff_phrases(evaluation_result, junk=None):
    """ Annotates the diff phrases of the given evaluation result and
    visualizes the result.

    Reads the phrases from evaluation_result["phrases"]. If there is no such
    entry, the function returns without doing anything. Otherwise, each phrase
    is annotated with the following attributes:
      ignore               : True iff util.ignore_phrase accepts the phrase
                             (never True for rearrange phrases).
      op_type              : The op type reported by the matching handler in
                             'choose' (only set if a handler matches).
      tex_line_num_start,
      tex_column_num_start : Line/column number of the first target word.
      tex_line_num_end,
      tex_column_num_end   : Line/column number of the last target word.
    The four tex_* attributes are -1 if the position can't be determined (no
    target words, or a missing word on any wrapping level).

    Finally, visualize(evaluation_result) is called.

    Args:
      evaluation_result: Mapping that holds the diff phrases under "phrases".
      junk: Passed through to util.ignore_phrase and
            choose.apply_rearrange_phrase. Defaults to an empty list.
    """
    # Don't share a mutable default list between calls.
    junk = [] if junk is None else junk

    diff_phrases = evaluation_result.get("phrases", None)

    if diff_phrases is None:
        return

    for phrase in diff_phrases:
        # Decide if we apply the phrase by word operations or by paragraph
        # operations.
        phrase.ignore = False

        # Check for rearrange phrases *before* checking if the phrase is to
        # ignore, same as the other phrase dispatchers do.
        if isinstance(phrase, rearr.DiffRearrangePhrase):
            op_type, _, _, _ = choose.apply_rearrange_phrase(phrase, junk)
            phrase.op_type = op_type
        elif util.ignore_phrase(phrase, junk):
            op_type, _, _, _ = choose.apply_ignored_phrase(phrase)
            phrase.op_type = op_type
            phrase.ignore = True
        elif isinstance(phrase, diff.DiffCommonPhrase):
            # NOTE: Common phrases are handled by apply_common_phrase, in line
            # with count_num_ops_phrase and visualize_phrase (this branch
            # called apply_ignored_phrase before).
            op_type, _, _, _ = choose.apply_common_phrase(phrase)
            phrase.op_type = op_type
        elif isinstance(phrase, diff.DiffReplacePhrase):
            op_type, _, _, _ = choose.apply_replace_phrase(phrase)
            phrase.op_type = op_type

        # Obtain the start- and end line and column numbers in tex file.
        # -1 denotes "unknown".
        tex_line_num_start = -1
        tex_line_num_end = -1
        tex_column_num_start = -1
        tex_column_num_end = -1

        words_target = phrase.words_target

        if words_target:
            # Line and column numbers are placed in DocWords.
            first_doc_word_target = _doc_word_of(words_target[0])
            last_doc_word_target = _doc_word_of(words_target[-1])

            if (first_doc_word_target is not None
                    and last_doc_word_target is not None):
                tex_line_num_start = first_doc_word_target.line_num
                tex_column_num_start = first_doc_word_target.column_num
                tex_line_num_end = last_doc_word_target.line_num
                tex_column_num_end = last_doc_word_target.column_num

        # Always set the attributes (also if a word is missing), such that
        # each phrase consistently carries a position.
        phrase.tex_line_num_start = tex_line_num_start
        phrase.tex_column_num_start = tex_column_num_start
        phrase.tex_line_num_end = tex_line_num_end
        phrase.tex_column_num_end = tex_column_num_end

    visualize(evaluation_result)
예제 #5
0
def para_diff(actual, target, junk=[]):
    """ Computes the differences between the two given lists of paragraphs
    and returns them as a list of per-paragraph operations. """

    # 'actual' and 'target' may be arbitrarily nested lists of words. In our
    # case, each of them is a list of paragraphs, where each paragraph is a
    # list of words, e.g. [['words', 'of', 'first', 'paragraph'], [...]].
    #
    # To run a word-based diff, both lists are flattened first. Flattening
    # yields a flat list of tuples (<word>, <flat_pos>, <pos_stack>) with:
    #   <word>      : The word.
    #   <flat_pos>  : The position of the word in the flat representation.
    #   <pos_stack> : A list whose i-th element is the position of the word
    #                 in the original list at level i.
    # For example:
    #   flatten([['foo', 'bar'], ['baz']])
    #     = [('foo', 0, [0, 0]), ('bar', 1, [0, 1]), ('baz', 2, [1, 0])]
    flat_actual = flatten(actual)
    flat_target = flatten(target)

    # The word-based diff yields a list of diff.Common and diff.Replace
    # objects, describing how to transform the flat actual list into the flat
    # target list. Each object holds the affected elements of both lists:
    #
    #   (= [('foo', 0, [0, 0]), ('bar', 1, [0, 1])],
    #      [('foo', 0, [0, 0]), ('bar', 1, [0, 1])])
    #   is a common object: "foo bar" occurs in both lists.
    #
    #   (/ [('foo', 0, [0, 0]), ('bar', 1, [0, 1])],
    #      [('doo', 0, [0, 0])])
    #   is a replace object: "foo bar" in 'actual' is replaced by "doo" in
    #   'target'.
    #
    # One of the two element lists of a replace object may be empty, which
    # denotes either an insertion or a deletion.
    word_diff = diff.diff(flat_actual, flat_target)

    # Phrases may occur in both lists, but in different order. Try to
    # identify and rearrange such phrases.
    rearranged = rearr.rearrange(word_diff, junk)

    # 'rearranged' is a flat list of diff.Common, diff.Replace and
    # rearr.Rearrange objects that doesn't respect any paragraph structure.
    # Wrap each object to obtain operations per paragraph. The order of the
    # entries matters: the first matching type wins.
    wrappers = (
        (diff.Common, Commons),
        (rearr.Rearrange, Rearranges),
        (diff.Replace, Replaces),
    )

    per_para = []

    # The most recent actual item and the most recent target item. The
    # wrappers need them to decide where to split.
    last_actual = None
    last_target = None

    for entry in rearranged:
        for kind, wrap in wrappers:
            if isinstance(entry, kind):
                entry = wrap(entry, last_actual, last_target)
                break

        per_para.append(entry)

        if not (entry and entry.phrases):
            continue

        # Advance the most recent actual item: take the last phrase that is
        # neither a Delete nor to ignore and use its last actual item (if
        # any).
        with_actual = [
            p for p in entry.phrases
            if not (isinstance(p, Delete) or util.ignore_phrase(p, junk))
        ]
        if with_actual:
            tail = with_actual[-1]
            if tail.items_actual:
                last_actual = tail.items_actual[-1]

        # Same for the most recent target item, this time skipping Inserts.
        with_target = [
            p for p in entry.phrases
            if not (isinstance(p, Insert) or util.ignore_phrase(p, junk))
        ]
        if with_target:
            tail = with_target[-1]
            if tail.items_target:
                last_target = tail.items_target[-1]

    return per_para
예제 #6
0
def apply_para_ops_rearrange_phrase(phrase, junk=[]):
    """ Simulates the given rearrange phrase by applying para ops.

    Returns a tuple (num_ops, num_ops_abs, vis, force_para_ops), where
    num_ops and num_ops_abs are the operation counters as returned by
    apply_split_merge, vis is a list with a single (visualization, position)
    pair and force_para_ops is True iff neither the paragraph break check
    before the first actual word nor the one after the last actual word
    returned False.
    """

    # Start with a single rearrange and add one more for each paragraph
    # break reported between consecutive actual words (the first word is
    # compared to None).
    para_rearranges = 1
    if phrase.num_words_actual() > 0:
        preceding = None
        for word in phrase.words_actual:
            if is_para_break(preceding, word):
                para_rearranges += 1
            preceding = word

    # Count number of operations.
    num_ops = Counter({"num_para_rearranges": para_rearranges})
    num_ops_abs = Counter(
        {"num_para_rearranges_abs": phrase.num_words_actual()})

    # Simulate each sub phrase and accumulate its operations and
    # visualizations. The ignore check comes before the type checks.
    parts = []
    for sub in phrase.sub_phrases:
        if util.ignore_phrase(sub, junk):
            simulate = apply_ignored_phrase
        elif isinstance(sub, diff.DiffCommonPhrase):
            simulate = apply_common_phrase
        elif isinstance(sub, diff.DiffReplacePhrase):
            simulate = apply_replace_phrase
        else:
            continue

        _, sub_ops, sub_ops_abs, sub_vis = simulate(sub)
        num_ops.update(sub_ops)
        num_ops_abs.update(sub_ops_abs)
        parts.extend(sub_vis)

    # The text to visualize is the concatenation of the texts of all parts.
    text = "".join(part[0] for part in parts)

    # The visualization is placed at the position of the first target word.
    pos = phrase.words_target[0].pos

    if phrase.num_words_actual() > 0:
        first_actual = phrase.words_actual[0]
        last_actual = phrase.words_actual[-1]
    else:
        first_actual = last_actual = None

    # Both checks are tri-state: True, False or None.
    break_before = is_paragraph_break_before(first_actual)
    break_after = is_paragraph_break_after(last_actual)

    # Only an explicit False forces a split on the related side.
    force_split_before = break_before is False
    force_split_after = break_after is False

    # TODO
    before_permits = break_before is None or break_before is True
    after_permits = break_after is None or break_after is True
    force_para_ops = before_permits and after_permits

    # Apply split/merge operations.
    num_ops, num_ops_abs, vis = apply_split_merge(
        phrase, num_ops, num_ops_abs, text,
        force_split_before, force_split_after)

    return num_ops, num_ops_abs, [(blue_bg(vis), pos)], force_para_ops
예제 #7
0
def ignore_diff_item(item, junk):
    """ Decides whether the given diff item (a DiffPhrase, a SplitParagraph
    or a MergeParagraph) has to be ignored. """

    # Only phrases can be ignored at all.
    if not isinstance(item, diff.DiffPhrase):
        return False

    # For phrases, the decision is up to util.ignore_phrase.
    return util.ignore_phrase(item, junk)