def main(ptb_file, results_dir, annotations_dir):
    """Filter PTB trees by annotated head patterns and write the results.

    Reads every tree in ``ptb_file``, keeps those accepted by
    ``tree_is_good`` for the head annotations found under
    ``annotations_dir``, prints summary statistics, and writes two files in
    the current directory: ``good_trees.mrg`` (all accepted trees) and
    ``marked_example_trees.mrg`` (up to 100 randomly chosen accepted trees
    after ``mark_heads`` has been applied to them).

    Args:
        ptb_file: Path to the treebank file.  A new tree starts at every
            line whose first character is not a space.
        results_dir: Directory walked recursively for files whose name
            contains ``correct_``; one pattern per line.
        annotations_dir: Directory walked recursively for tab-separated
            ``index<TAB>pattern<TAB>head`` files.
    """
    # Patterns from the 'correct_' result files.  They are only consumed by
    # the alternative filter mentioned further down, but are still loaded so
    # that switching filters stays a one-line change.
    correct_patterns = set()
    for root, dirs, files in os.walk(results_dir):
        for f in files:
            if 'correct_' not in f:
                continue
            with open(os.path.join(root, f)) as pattern_file:
                for line in pattern_file:
                    # Drop the first and last character of the stripped line.
                    correct_patterns.add(line.strip()[1:-1])

    pattern_heads = dict()
    for root, dirs, files in os.walk(annotations_dir):
        for f in files:
            with open(os.path.join(root, f)) as annotation_file:
                for line in annotation_file:
                    if line.startswith('i'):
                        continue  # skip the header
                    index, pattern, head = line.strip().split('\t')
                    # 0-index the head
                    pattern_heads[pattern[1:-1]] = int(head) - 1

    trees = []
    text = ''
    skipped = 0
    with open(ptb_file) as ptb:
        for line in ptb:
            if text and line[0] != ' ':
                try:
                    trees.append(Tree.read(text))
                except AttributeError:
                    skipped += 1
                text = ''
            text += line
    if text:
        # The last buffered tree gets the same treatment as all the others:
        # text that cannot be parsed is counted as skipped, not fatal.
        try:
            trees.append(Tree.read(text))
        except AttributeError:
            skipped += 1

    # Alternative filter: pass correct_patterns instead of pattern_heads.
    good_trees = [tree for tree in trees
                  if tree_is_good(tree.root.children[0], pattern_heads)]

    # Each message is formatted into a single string so the output is the
    # same whether print is a statement or a function.
    print('Number of trees: %d' % len(trees))
    print('Number of good trees: %d' % len(good_trees))
    print('Skipped: %d' % skipped)

    # error_category_counts / error_counts are module-level tallies defined
    # outside this function (presumably filled in by tree_is_good -- confirm).
    cats = sorted(error_category_counts,
                  key=lambda x: error_category_counts[x])
    for cat in cats:
        print('%s: %d' % (cat, error_category_counts[cat]))
    print('')
    errors = sorted(error_counts, key=lambda x: error_counts[x],
                    reverse=True)
    for error in errors[:30]:
        print('%s: %d' % (error, error_counts[error]))

    with open('good_trees.mrg', 'w') as out:
        for tree in good_trees:
            out.write(tree.pretty())
            out.write('\n\n')

    # Write a random sample of the good trees with mark_heads applied.
    shuffle(good_trees)
    num_examples = 100
    with open('marked_example_trees.mrg', 'w') as out:
        for tree in good_trees[:num_examples]:
            mark_heads(tree.root.children[0], pattern_heads)
            out.write(tree.pretty())
            out.write('\n\n')
def main():
    """Store a head index on every annotation whose head is marked correct.

    For each ``Annotation`` with ``head_correct=True``, the line number of
    the first line containing ``ROOT`` in the expansion's ``supa_example``
    is looked up among the ``terminal_indices()`` of the top-level children
    of the tree parsed from ``penn_example``.  The 1-based position of the
    first child that contains it is saved as ``annotation.head_index``.
    The transaction is committed once all annotations have been processed.
    """
    annotations = Annotation.objects.select_related().filter(head_correct=True)
    for annotation in annotations:
        expansion = annotation.expansion

        root = None
        for i, line in enumerate(expansion.supa_example.split('\n')):
            if 'ROOT' not in line:
                continue
            if root is not None:
                # NOTE(review): this only skips the extra ROOT line; the
                # annotation itself is still processed using the first ROOT.
                # Confirm whether the whole annotation should be skipped.
                print('Two roots found; skipping')
                continue
            root = i
        if root is None:
            # Sometimes the SUPA is empty; testing on 3/20/2013 showed that
            # was the only time this happened
            continue

        tree = Tree.read(expansion.penn_example)
        head_index = None
        for i, child in enumerate(tree.root.children):
            if root in child.terminal_indices():
                head_index = i + 1  # stored 1-based
                break

        if head_index is not None:
            annotation.head_index = head_index
            annotation.save()
        else:
            print('Head index not found...')
    transaction.commit()
def simplify_trees(suite_file, outfile):
    """Convert every tree in ``suite_file`` and write them to ``outfile``.

    Trees in ``suite_file`` are terminated by a blank line.  Each one is
    parsed with ``Tree.read``, passed through ``convert_tree`` and written
    to ``outfile`` via ``pretty()``, followed by a blank line.

    Args:
        suite_file: Path of the input file with blank-line separated trees.
        outfile: Path of the file to (over)write with the converted trees.
    """
    new_trees = []
    tree_text = ''
    with open(suite_file) as suite:
        for line in suite:
            if line == '\n':
                new_trees.append(convert_tree(Tree.read(tree_text)))
                tree_text = ''
            tree_text += line
    # A final tree that is not followed by a blank line would otherwise be
    # dropped silently; whitespace-only leftovers are ignored.
    if tree_text.strip():
        new_trees.append(convert_tree(Tree.read(tree_text)))

    # The context manager guarantees the output is flushed and closed.
    with open(outfile, 'w') as out:
        for tree in new_trees:
            out.write(tree.pretty())
            out.write('\n\n')
def main(category, suites_dir):
    """Write the intermediate PTB trees for one category.

    Searches the current directory tree for a file whose name contains
    ``sierra_postop``, reads the blank-line terminated trees in it (a line
    consisting of ``null`` stands for a missing tree), strips the extra
    labels from each tree with ``clear_extra_labels`` and writes the result
    to ``../<suites_dir>/<category>/<category>_PTBtrees_intermediate.mrg``.
    A missing tree is written as a single empty line.

    Args:
        category: Category name; used for the output directory and file name.
        suites_dir: Name of the suites directory next to the current one.

    Raises:
        SystemExit: With code -1 if no ``sierra_postop`` file is found.
    """
    tree_file = None
    for root, dirs, files in os.walk('.'):
        for f in files:
            if 'sierra_postop' in f:
                # Keep the directory part: os.walk yields bare file names, so
                # a match inside a subdirectory could not be opened otherwise.
                tree_file = os.path.join(root, f)
    if not tree_file:
        print('Could not find sierra_postop file! Exiting...')
        # Same effect as exit(-1), without relying on the site module helper.
        raise SystemExit(-1)

    trees = []
    text = ''
    with open(tree_file) as tree_input:
        for line in tree_input:
            if line == 'null\n':
                print('Null found!')
                trees.append(None)
                continue
            if line == '\n':
                trees.append(Tree.read(text))
                text = ''
                continue
            text += line

    new_trees = []
    for tree in trees:
        # Clear off some of the extra processing that the SUPA pipeline adds
        if tree is None:
            new_trees.append('')  # placeholder, written as an empty line
            continue
        subtree = tree.root.children[1]
        clear_extra_labels(subtree)
        new_trees.append(subtree)

    outfile = '../' + suites_dir + '/' + category + '/' + category
    outfile += '_PTBtrees_intermediate.mrg'
    with open(outfile, 'w') as out:
        for tree in new_trees:
            if tree:
                out.write(tree.pretty())
                out.write('\n\n')
            else:
                out.write('\n')
def _ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 for a zero denominator."""
    if not denominator:
        return 0.0
    return float(numerator) / denominator


def main(annotation_file, category):
    """Score predicted heads against annotated heads for one category.

    The trees come from a file whose name contains ``sierra_postop``
    (searched for below the current directory), the per-pattern token counts
    from ``../test_suites_v2/<category>/<category>_tagAsParent_rules_grouped.txt``
    and the gold head positions from ``annotation_file``.  Tree *i* (1-based)
    is matched with line *i* of the count file and with the annotation whose
    index is *i*.

    Outputs:
        * one summary row appended to ``results/results.tsv``;
        * ``results/errors_<category>.tsv`` with pattern, predicted head and
          annotated head (both 1-based) for each wrongly headed pattern;
        * ``results/correct_<category>.tsv`` with the correctly headed
          patterns, one per line.

    Args:
        annotation_file: Tab-separated ``index<TAB>pattern<TAB>head`` file;
            lines starting with ``index`` are treated as the header.
        category: Category name used to locate the count file and to name
            the output files.

    Raises:
        SystemExit: With code -1 if no ``sierra_postop`` file is found or if
            the number of trees differs from the number of count lines.
    """
    outfile = 'results/results.tsv'

    tree_file = None
    for root, dirs, files in os.walk('.'):
        for f in files:
            if 'sierra_postop' in f:
                # Keep the directory part: os.walk yields bare file names, so
                # a match inside a subdirectory could not be opened otherwise.
                tree_file = os.path.join(root, f)
    if not tree_file:
        print('Could not find sierra_postop file! Exiting...')
        # Same effect as exit(-1), without relying on the site module helper.
        raise SystemExit(-1)

    # Maps the 1-based pattern index to the 0-based annotated head position.
    annotations = {}
    with open(annotation_file) as annotation_input:
        for line in annotation_input:
            if line.startswith('index'):
                continue
            index, _, head_index = line.strip().split('\t')
            annotations[int(index)] = int(head_index) - 1

    trees = []
    text = ''
    with open(tree_file) as tree_input:
        for line in tree_input:
            if line == 'null\n':
                trees.append(None)
                continue
            if line == '\n':
                trees.append(Tree.read(text))
                text = ''
                continue
            text += line

    count_file = '../test_suites_v2/%s/%s_tagAsParent_rules_grouped.txt' % (
            category, category)
    counts = []
    patterns = []
    with open(count_file) as count_input:
        for line in count_input:
            count, pattern, _ = line.split('\t')
            counts.append(int(count))
            # TODO: this could be better - like check the annotation file to
            # be sure that the patterns match
            patterns.append(pattern)

    if len(counts) != len(trees):
        print('Error! Incorrect alignment between trees and counts:')
        print('%d %d' % (len(counts), len(trees)))
        raise SystemExit(-1)

    # 'count' is token count, 'num' is type count
    total_count = 0
    count_annotated = 0
    count_correct = 0
    num_patterns = 0
    num_annotated = 0
    num_correct = 0
    errors = []
    correct = []
    for i, tree in enumerate(trees):
        pattern = patterns[i]
        num_patterns += 1
        total_count += counts[i]
        index = i + 1
        if index not in annotations:
            continue
        num_annotated += 1
        count_annotated += counts[i]
        # A missing tree still counts as annotated, but is scored neither as
        # correct nor as an error.
        if tree is None:
            continue
        # Clear off some of the extra processing that the SUPA pipeline adds
        root = tree.root.children[1]
        # Labels have the form '<category>__<head>'.
        head = root.label.split('__', 1)[1]
        head_index = -1

        # The labels are on trees that haven't had WH-movement undone - we
        # have to correct the annotations for that.  This isn't perfect, but
        # it will do for now.
        annotated_children = len(pattern.split()) - 1
        actual_children = len(root.children)
        is_conjpp = 'CONJPP' in [x.label.split('__')[0]
                                 for x in root.children]
        annotated_head = annotations[index]
        if (not is_conjpp and actual_children == annotated_children - 1
                and annotated_head != 0):
            annotated_head -= 1

        # Now to actually check to see what was labeled as the head
        for j, child in enumerate(root.children):
            child_head = child.label.split('__', 1)[1]
            if child_head == head:
                head_index = j + 1
                if j == annotated_head:
                    correct.append(pattern)
                    num_correct += 1
                    count_correct += counts[i]
                    break
        else:
            # No child carrying the parent's head sits at the annotated
            # position; report the last candidate found (-1 if none).
            errors.append((pattern, head_index, annotated_head + 1))

    # _ratio always divides as floats and yields 0.0 instead of raising when
    # there is nothing to divide by (e.g. no annotated patterns).
    percent_tested = _ratio(num_annotated, num_patterns)
    percent_correct = _ratio(num_correct, num_annotated)
    count_percent_annotated = _ratio(count_annotated, total_count)
    count_percent_correct = _ratio(count_correct, count_annotated)

    with open(outfile, 'a') as out:
        out.write('%s\t%d\t%d\t%.3f\t%d\t%.3f\t%d\t%d\t%.3f\t%d\t%.3f\n' % (
                category, num_patterns, num_annotated, percent_tested,
                num_correct, percent_correct, total_count, count_annotated,
                count_percent_annotated, count_correct,
                count_percent_correct))
    with open('results/errors_%s.tsv' % category, 'w') as error_file:
        error_file.write('pattern\tpredicted\tactual\n')
        for error in errors:
            error_file.write('%s\t%d\t%d\n' % error)
    with open('results/correct_%s.tsv' % category, 'w') as correct_file:
        for pattern in correct:
            correct_file.write('%s\n' % pattern)