def compare_trees(gold_tree, test_tree, out_dict, error_counts, classify):
    """Search for a repair path from test_tree to gold_tree and report it.

    The initial error count and the search statistics are written to
    out_dict['out'].  When greedy_search returns a path, the two trees are
    written to out_dict['test_trees'] and out_dict['gold_trees'], then one
    summary line is written for every step after the first and, if the path
    has more than one entry, every step is dumped in detail (classified
    type, info dict, coloured rendering of the step's tree).  For each step
    dumped in detail, the step's third element is appended to
    error_counts[<classified type>].  When no path is returned,
    "no path found" is written instead.  A blank line is always written to
    out_dict['err'] and out_dict['out'] at the end.
    """
    initial_errors = parse_errors.get_errors(test_tree, gold_tree)
    log = out_dict['out']
    print >> log, "{} Initial errors".format(len(initial_errors))

    stats, path = greedy_search(gold_tree, test_tree, classify)
    print >> log, "{} on fringe, {} iterations".format(*stats)

    if path is None:
        print >> log, "no path found"
    else:
        print >> out_dict['test_trees'], test_tree
        print >> out_dict['gold_trees'], gold_tree

        # One-line summary per step; the first entry is the starting tree.
        for _, info, fixed in path[1:]:
            print >> log, "{} Error:{}".format(
                str(fixed), info['classified_type'])

        # Detailed dump, only when the search actually took a step.
        if len(path) > 1:
            for step_tree, info, fixed in path:
                kind = info['classified_type']
                print >> log, "Step:{}".format(kind)
                error_counts[kind].append(fixed)
                print >> log, info
                coloured = render_tree.text_coloured_errors(
                    step_tree, gold=gold_tree)
                print >> log, coloured.strip()

    print >> out_dict['err'], ""
    print >> out_dict['out'], ""
def greedy_search(gold, test, classify):
    """Greedily transform a copy of `test` towards `gold`, one step at a time.

    Starting from a clone of `test`, each iteration asks `successors` for
    candidate trees and moves to the one that removes the most errors
    (candidates that increase the error count are skipped).  The search
    stops when the current tree has no errors relative to `gold`.

    Returns a pair `(stats, path)`:
      stats - the tuple (0, iterations)
      path  - list of (tree, info, change) steps, the first being the
              initial clone with info {'type': 'init'} and change 0; or
              None when the search gives up (more than 100 iterations, or
              no acceptable successor exists for the current tree).

    On success `classify(info, gold, test)` is called once per step, in
    order, before returning.

    Raises Exception if a successor tree fails check_consistency().
    """
    # Initialise with the test tree
    cur = (test.clone(), {'type': 'init'}, 0)

    iters = 0
    path = []
    while True:
        path.append(cur)
        if iters > 100:
            # Give up rather than loop forever
            return (0, iters), None

        # Check for victory
        ctree = cur[0]
        cerrors = parse_errors.ParseErrorSet(gold, ctree)
        if len(cerrors) == 0:
            break

        # Pick the successor that fixes the most errors
        best = None
        for _fixes, ntree, info in successors(ctree, cerrors, gold):
            if not ntree.check_consistency():
                raise Exception("Inconsistent tree! {}".format(ntree))
            nerrors = parse_errors.get_errors(ntree, gold)
            change = len(cerrors) - len(nerrors)
            if change < 0:
                continue
            if best is None or change > best[2]:
                best = (ntree, info, change)

        if best is None:
            # Dead end: errors remain but no successor is acceptable.
            # Report "no path" instead of appending None to the path and
            # failing on cur[0] in the next iteration.
            return (0, iters), None

        cur = best
        iters += 1

    for step in path:
        classify(step[1], gold, test)

    return (0, iters), path
def greedy_search(gold, test, classify):
    """Greedy hill-climbing from a clone of `test` towards `gold`.

    At every iteration the candidates produced by `successors` are scored by
    how many errors they remove; candidates that add errors are ignored and
    the first candidate with the largest reduction becomes the new current
    tree.  The loop ends once the current tree has an empty error set.

    Returns `((0, iterations), path)` where `path` is the list of
    (tree, info, change) tuples visited, beginning with
    (clone of test, {'type': 'init'}, 0).  `path` is None when the search is
    abandoned: either more than 100 iterations were used, or the current
    tree still has errors but no acceptable successor was produced.

    When a full path is found, `classify(info, gold, test)` is invoked for
    each step's info dict before the result is returned.

    Raises Exception when a successor tree fails check_consistency().
    """
    # Initialise with the test tree
    cur = (test.clone(), {'type': 'init'}, 0)

    iters = 0
    path = []
    while True:
        path.append(cur)
        if iters > 100:
            # Iteration cap reached; report failure
            return (0, iters), None

        # Check for victory
        ctree = cur[0]
        cerrors = parse_errors.Parse_Error_Set(gold, ctree)
        if len(cerrors) == 0:
            break

        # Choose the candidate with the greatest error reduction
        best = None
        for _fixes, ntree, info in successors(ctree, cerrors, gold):
            if not ntree.check_consistency():
                raise Exception("Inconsistent tree! {}".format(ntree))
            nerrors = parse_errors.get_errors(ntree, gold)
            change = len(cerrors) - len(nerrors)
            if change < 0:
                continue
            if best is None or change > best[2]:
                best = (ntree, info, change)

        if best is None:
            # Nothing usable came back from successors().  Previously None
            # was pushed onto the path and cur[0] raised a TypeError on the
            # next pass; signal "no path" to the caller instead.
            return (0, iters), None

        cur = best
        iters += 1

    for step in path:
        classify(step[1], gold, test)

    return (0, iters), path
def compare_trees(gold_tree, test_tree, out_dict, error_counts, classify):
    """Compare a test tree with its gold tree and log the repair steps.

    Writes to the streams held in out_dict:
      'out'        - initial error count, search statistics, then either the
                     per-step report or "no path found", then a blank line
      'test_trees' - the test tree (only when a path was found)
      'gold_trees' - the gold tree (only when a path was found)
      'err'        - a blank line

    When the path has more than one entry, the third element of every step
    is appended to error_counts under that step's 'classified_type'.
    """
    def emit(text):
        # The stream is looked up again for each message written.
        print >> out_dict['out'], text

    error_count = len(parse_errors.get_errors(test_tree, gold_tree))
    emit("{} Initial errors".format(error_count))

    iters, path = greedy_search(gold_tree, test_tree, classify)
    emit("{} on fringe, {} iterations".format(*iters))

    if path is not None:
        print >> out_dict['test_trees'], test_tree
        print >> out_dict['gold_trees'], gold_tree

        # Summary lines skip the first entry (the starting tree).
        for step in path[1:]:
            step_type = step[1]['classified_type']
            emit("{} Error:{}".format(str(step[2]), step_type))

        took_a_step = len(path) > 1
        if took_a_step:
            for step in path:
                step_info = step[1]
                emit("Step:{}".format(step_info['classified_type']))
                error_counts[step_info['classified_type']].append(step[2])
                emit(step_info)
                rendering = render_tree.text_coloured_errors(
                    step[0], gold=gold_tree)
                emit(rendering.strip())
    else:
        emit("no path found")

    print >> out_dict['err'], ""
    emit("")
def text_coloured_errors(tree, gold=None, depth=0, single_line=False,
                         missing=None, extra=None, compressed=True, POS=True):
    """Pretty print, with errors marked using colour.

    Arguments:
      tree        - node to render; the code reads its label, word, span,
                    parent and subtrees attributes.
      gold        - gold tree, used to compute the errors when 'missing' or
                    'extra' is not supplied.
      depth       - indentation level (number of tabs) for this node.
      single_line - when True no newlines/tabs are emitted.
      missing     - None, or a list of tuples:
                        (start, end, label, crossing-T/F)
      extra       - None, or a collection of nodes that are extra brackets.
      compressed  - when True, a subtree containing no errors is printed as
                    just its label and words (via text_words).
      POS         - on the initial call, passed through to
                    parse_errors.get_errors as its third argument; after the
                    errors are computed it holds the list of 'diff POS'
                    errors that is handed down the recursion.  If 'missing'
                    and 'extra' are given explicitly and POS is still a
                    boolean, POS marking is disabled.

    Returns the rendered string, or an error message string when neither a
    gold tree nor both error lists are available.

    Colours are ANSI escape sequences: missing brackets use code 36, extra
    brackets 31 and crossing brackets 33.
    """
    # TODO: Add the ability to compress the same parts consistently (even after
    # errors are no longer present). This would need to be span based as
    # structure could change.
    ans = ''
    if missing is None or extra is None:
        if gold is None:
            return "Error - no gold tree and no missing list for colour repr"
        # look at gold and work out what missing should be
        errors = parse_errors.get_errors(tree, gold, POS)
        extra = [e[3] for e in errors
                 if e[0] == 'extra' and e[3].word is None]
        extra = set(extra)
        missing = [(e[1][0], e[1][1], e[2], False) for e in errors
                   if e[0] == 'missing' and e[3].word is None]
        missing += [(e[1][0], e[1][1], e[2], True) for e in errors
                    if e[0] == 'crossing' and e[3].word is None]
        POS = [e for e in errors if e[0] == 'diff POS']
    elif isinstance(POS, bool):
        # The caller supplied the error lists directly, so POS was never
        # converted from a flag into a list of errors.  Iterating a bool
        # below would raise TypeError; treat it as "no POS errors".
        POS = None
    start_missing = "\033[01;36m"
    start_extra = "\033[01;31m"
    start_crossing = "\033[01;33m"
    end_colour = "\033[00m"

    if not single_line:
        ans += '\n' + depth * '\t'

    # start of this
    if tree in extra:
        ans += start_extra + '(' + tree.label + end_colour
    elif tree.word is not None and POS is not None:
        # A word-level node: if it has a POS error, print error[4] in the
        # missing colour followed by the tree's own label in the extra colour
        found = False
        for error in POS:
            if error[3] == tree:
                found = True
                ans += '(' + start_missing + error[4] + end_colour
                ans += ' ' + start_extra + tree.label + end_colour
                break
        if not found:
            ans += '(' + tree.label
    else:
        ans += '(' + tree.label

    # If we are compressing, check for correctness and then just print words
    sub_done = False
    if compressed and tree not in extra and tree.word is None:
        all_right = True
        for error in extra:
            if (tree.span[0] <= error.span[0] and
                    error.span[1] <= tree.span[1]):
                all_right = False
                break
        for error in missing:
            if error[3]:
                # Crossing bracket: an endpoint strictly inside this span
                if tree.span[0] < error[0] < tree.span[1]:
                    all_right = False
                    break
                if tree.span[0] < error[1] < tree.span[1]:
                    all_right = False
                    break
            elif tree.span[0] <= error[0] and error[1] <= tree.span[1]:
                all_right = False
                break
        if POS is not None:
            for error in POS:
                if (tree.span[0] <= error[1][0] and
                        error[1][1] <= tree.span[1]):
                    all_right = False
                    break
        if all_right:
            ans += ' ' + text_words(tree) + ')'
            sub_done = True

    # crossing brackets starting
    if tree.parent is None or tree.parent.subtrees[0] != tree:
        # these are marked as high as possible
        labels = []
        for error in missing:
            if error[0] == tree.span[0] and error[3]:
                labels.append((error[1], error[2]))
        labels.sort(reverse=True)
        if len(labels) > 0:
            to_add = start_crossing + ' '.join(
                ['(' + label[1] for label in labels]) + end_colour
            if sub_done:
                # The node is already fully printed, so insert the crossing
                # markers after the whitespace characters and before the
                # rest of the text
                nans = ''
                for char in ans:
                    if char in '\t\n':
                        nans += char
                clen = len(nans)
                nans += to_add
                nans += ' ' + ans[clen:]
                ans = nans
            else:
                ans += ' ' + to_add

    if not sub_done:
        # word
        if tree.word is not None:
            ans += ' ' + tree.word

        # subtrees
        below = []
        for subtree in tree.subtrees:
            text = text_coloured_errors(subtree, gold, depth + 1, single_line,
                                        missing, extra, compressed, POS)
            if single_line:
                text = ' ' + text
            below.append([subtree.span[0], subtree.span[1], text])
        # add missing brackets that surround subtrees
        for length in range(1, len(below)):
            for i in range(len(below)):
                j = i + length
                if i == 0 and j == len(below) - 1:
                    # Covers every child; skipped here
                    continue
                if j >= len(below):
                    break
                for error in missing:
                    if (below[i][0] == error[0] and
                            below[j][1] == error[1] and not error[3]):
                        # Leading newline/tab prefix of the first child
                        start = ''
                        for char in below[i][2]:
                            if char not in '\n\t':
                                break
                            start += char
                        # Push the covered children one level deeper
                        for k in range(i, j + 1):
                            below[k][2] = '\n\t'.join(below[k][2].split('\n'))
                        below[i][2] = (start + start_missing + '(' +
                                       error[2] + end_colour + below[i][2])
                        below[j][2] += start_missing + ')' + end_colour
        ans += ''.join([part[2] for part in below])

        # end of this
        if tree in extra:
            ans += start_extra + ')' + end_colour
        else:
            ans += ')'

    if tree.parent is None or tree.parent.subtrees[-1] != tree:
        # if there are crossing brackets that end here, mark that
        labels = []
        for error in missing:
            if error[1] == tree.span[1] and error[3]:
                labels.append((-error[0], error[2]))
        labels.sort()
        if len(labels) > 0:
            ans += ' ' + start_crossing + ' '.join(
                [label[1] + ')' for label in labels]) + end_colour

    # TODO: Change so that at the top level,
    # FRAG etc isn't printed outside of ROOT
    # Actually, just have a canonical ordering for unaries
    # (so that NPs end up under FRAGs)
    if tree.parent is None or len(tree.parent.subtrees) > 1:
        # check for missing brackets that go around this node
        for error in missing:
            if (error[0] == tree.span[0] and error[1] == tree.span[1] and
                    not error[3]):
                if tree not in extra:
                    # Put them on a new level
                    extra_text = ''
                    if not single_line:
                        ans = '\n\t'.join(ans.split('\n'))
                        extra_text = '\n' + depth * '\t'
                    extra_text += start_missing + '(' + error[2] + end_colour
                    if single_line:
                        ans = ' ' + ans
                    ans = extra_text + ans
                    ans += start_missing + ')' + end_colour
                else:
                    # Put them on the same line
                    start = 0
                    for char in ans:
                        if char not in '\n\t':
                            break
                        start += 1
                    pretext = ans[:start]
                    ans = ans[start:]
                    extra_text = (start_missing + '(' + error[2] +
                                  end_colour + ' ')
                    ans = pretext + extra_text + ans
                    ans += start_missing + ')' + end_colour
    return ans