def print_alignments(align_count, title, graph_pair1, graph_pair2, graphs,
                     nodes_list, out=stdout):
    # Print a titled section listing each aligned node pair in nodes_list,
    # showing the relation label according to both annotations plus the
    # source and target node details (id, label, token span, token string).
    #
    # @param align_count: running counter used to number printed alignments
    # @param title: section heading (printed only if nodes_list is non-empty)
    # @param graph_pair1, graph_pair2: the two aligned graph pairs to compare
    # @param graphs: the (identical) source/target graphs of both pairs
    # @param nodes_list: list of (source, target) node pairs to print
    # @param out: output stream (Python 2 file-like)
    # @return: the updated alignment counter
    if nodes_list:
        header(title, out, char="-")
    for nodes in nodes_list:
        align_count += 1
        rel1 = str(graph_pair1.get_align(nodes))
        rel2 = str(graph_pair2.get_align(nodes))
        # tricky because of implicit coercions,
        # see "Formatting Markers" http://www.python.org/dev/peps/pep-0100/
        print >> out, "#%d:" % align_count
        # NOTE(review): the source label is encoded to UTF-8 before
        # formatting while the target label (below) is not -- the whole
        # string is encoded afterwards anyway; confirm the extra encode
        # is intentional
        s = '(%s) %s [%s:%s]: "%s"' % (
            nodes.source,
            graphs.source.node[nodes.source]["label"].encode("utf-8"),
            graphs.source.node[nodes.source]["begin"],
            graphs.source.node[nodes.source]["end"],
            graphs.source.get_node_token_string(nodes.source))
        print >> out, s.encode("utf-8")
        # relation labels from both annotations, upper-cased for visibility
        print >> out, "<<<", rel1.upper(), "/", rel2.upper(), ">>>"
        s = '(%s) %s [%s:%s]: "%s"\n' % (
            nodes.target,
            graphs.target.node[nodes.target]["label"],
            graphs.target.node[nodes.target]["begin"],
            graphs.target.node[nodes.target]["end"],
            graphs.target.get_node_token_string(nodes.target))
        print >> out, s.encode("utf-8")
    return align_count
def print_alignments(align_count, title, graph_pair1, graph_pair2, graphs,
                     nodes_list, out=stdout):
    # Print a titled section listing each aligned node pair in nodes_list,
    # with the relation labels from both annotations and per-node details.
    # Returns the updated running alignment counter.
    #
    # NOTE(review): this file contains two identical definitions of
    # print_alignments; the later definition wins at import time --
    # consider removing one.
    if nodes_list:
        header(title, out, char="-")
    for nodes in nodes_list:
        align_count += 1
        rel1 = str(graph_pair1.get_align(nodes))
        rel2 = str(graph_pair2.get_align(nodes))
        # tricky because of implicit coercions,
        # see "Formatting Markers" http://www.python.org/dev/peps/pep-0100/
        print >>out, "#%d:" % align_count
        s = '(%s) %s [%s:%s]: "%s"' % (
            nodes.source,
            graphs.source.node[nodes.source]["label"].encode("utf-8"),
            graphs.source.node[nodes.source]["begin"],
            graphs.source.node[nodes.source]["end"],
            graphs.source.get_node_token_string(nodes.source))
        print >>out, s.encode("utf-8")
        print >>out, "<<<", rel1.upper(), "/", rel2.upper(), ">>>"
        s = '(%s) %s [%s:%s]: "%s"\n' % (
            nodes.target,
            graphs.target.node[nodes.target]["label"],
            graphs.target.node[nodes.target]["begin"],
            graphs.target.node[nodes.target]["end"],
            graphs.target.get_node_token_string(nodes.target))
        print >>out, s.encode("utf-8")
    return align_count
def write_alignment_per_relation(self, out=stdout):
    """
    Write the alignment evaluation for every relation separately,
    one section per relation label.
    """
    header("Alignment per relation", out)
    for relation in self.relations:
        evaluation = self[relation]
        evaluation.write(self.names, out=out, heading=relation.upper())
def print_comments(graph_pair, annot, out, encoding="utf-8"):
    """
    Print the annotator comment attached to a graph pair, if there is one.

    @param graph_pair: graph pair whose meta data may hold a <comment>
    @param annot: annotator name used in the section heading
    @param out: output stream (Python 2 file-like)
    @param encoding: encoding used when writing the comment text
    """
    try:
        comment = graph_pair.get_meta_data().find("comment").text
    except AttributeError:
        # no <comment> element at all (find() returned None)
        return
    # BUGFIX: .text is None for an empty <comment/> element, in which case
    # comment.strip() raised an AttributeError outside the try block above
    if comment and comment.strip():
        header("Comments by " + annot, out, char="-")
        print >> out, comment.encode(encoding), "\n"
def print_comments(graph_pair, annot, out, encoding="utf-8"):
    """
    Print the annotator comment attached to a graph pair, if there is one.

    @param graph_pair: graph pair whose meta data may hold a <comment>
    @param annot: annotator name used in the section heading
    @param out: output stream (Python 2 file-like)
    @param encoding: encoding used when writing the comment text
    """
    try:
        comment = graph_pair.get_meta_data().find("comment").text
    except AttributeError:
        # no <comment> element at all (find() returned None)
        return
    # BUGFIX: .text is None for an empty <comment/> element, in which case
    # comment.strip() raised an AttributeError outside the try block above
    if comment and comment.strip():
        header("Comments by " + annot, out, char="-")
        print >>out, comment.encode(encoding), "\n"
def weight(setting):
    """
    Weight predictions

    @param setting: Setting instance specifying the experimental setting
    """
    if not setting.weight:
        return
    log.info("\n" + header("WEIGHT STEP"))
    # keyword arguments shared by the development and validation runs
    common = dict(descriptor=setting.descriptor,
                  n=setting.n,
                  binary=setting.binary)
    if setting.develop:
        weight_files(setting.dev_inst_fns, setting.dev_clas_fns,
                     setting.weight_func, **common)
    if setting.validate:
        weight_files(setting.val_inst_fns, setting.val_clas_fns,
                     setting.weight_func, **common)
def extract(setting):
    """
    Extract features from corpus files, producing instance files and true
    corpus files.

    @param setting: Setting instance specifying the experimental setting
    """
    if not setting.extract:
        return
    log.info("\n" + header("EXTRACT STEP"))
    makedirs(setting.inst_dir)
    makedirs(setting.true_dir)
    # development first, then validation, exactly as before
    for enabled, part_fns in ((setting.develop, setting.dev_part_fns),
                              (setting.validate, setting.val_part_fns)):
        if enabled:
            extract_files(setting.extractor,
                          setting.graph_selector,
                          part_fns,
                          setting.make_inst_fns(part_fns),
                          setting.make_true_fns(part_fns),
                          binary=setting.binary)
def exp_init(setting):
    # Log the initial experimental setting plus a pretty-printed feature
    # description before any experiment step runs.
    log.info("\n" + header("INIT"))
    log.info("Setting at start:\n" + str(setting) + "\n")
    description = StringIO.StringIO()
    description.write("feature description:\n")
    setting.descriptor.pprint(description)
    log.info(description.getvalue())
def evaluate(setting):
    """
    Evaluate development data

    @param setting: Setting instance specifying the experimental setting
    """
    if not setting.evaluate:
        return
    log.info("\n" + header("EVALUATE STEP"))
    makedirs(setting.eval_dir)
    if setting.develop:
        setting.dev_eval = eval_files(setting.dev_true_fns,
                                      setting.dev_pred_fns,
                                      setting.dev_eval_fname,
                                      align_eval=setting.evaluator,
                                      n=setting.n)
    if setting.validate:
        setting.val_eval = eval_files(setting.val_true_fns,
                                      setting.val_pred_fns,
                                      setting.val_eval_fname,
                                      align_eval=setting.evaluator,
                                      n=setting.n)
def extract(setting):
    """
    Extract features from corpus files, producing instance files and true
    corpus files.

    @param setting: Setting instance specifying the experimental setting
    """
    if not setting.extract:
        return
    log.info("\n" + header("EXTRACT STEP"))
    makedirs(setting.inst_dir)
    makedirs(setting.true_dir)
    if setting.develop:
        dev_insts = setting.make_inst_fns(setting.dev_part_fns)
        dev_trues = setting.make_true_fns(setting.dev_part_fns)
        extract_files(setting.extractor, setting.graph_selector,
                      setting.dev_part_fns, dev_insts, dev_trues,
                      binary=setting.binary)
    if setting.validate:
        val_insts = setting.make_inst_fns(setting.val_part_fns)
        val_trues = setting.make_true_fns(setting.val_part_fns)
        extract_files(setting.extractor, setting.graph_selector,
                      setting.val_part_fns, val_insts, val_trues,
                      binary=setting.binary)
def merge(setting):
    """
    Merge data

    @param setting: Setting instance specifying the experimental setting
    """
    if not setting.merge:
        return
    log.info("\n" + header("MERGE STEP"))
    makedirs(setting.pred_dir)
    if setting.develop:
        dev_pred_fns = setting.make_pred_fns(setting.dev_true_fns)
        merge_files(setting.dev_inst_fns, setting.dev_true_fns, dev_pred_fns,
                    merger=setting.merger, descriptor=setting.descriptor,
                    n=setting.n, binary=setting.binary)
    if setting.validate:
        val_pred_fns = setting.make_pred_fns(setting.val_true_fns)
        merge_files(setting.val_inst_fns, setting.val_true_fns, val_pred_fns,
                    merger=setting.merger, descriptor=setting.descriptor,
                    n=setting.n, binary=setting.binary)
def sample(setting):
    """
    Sample training data

    @param setting: Setting instance specifying the experimental setting
    """
    if not setting.sample:
        return
    log.info("\n" + header("SAMPLE STEP"))
    makedirs(setting.samp_dir)
    # sample development instances, then validation instances
    for enabled, inst_fns in ((setting.develop, setting.dev_inst_fns),
                              (setting.validate, setting.val_inst_fns)):
        if enabled:
            sample_file(setting.class_fracts, inst_fns,
                        setting.make_samp_fns(inst_fns))
def create_parts(setting):
    """
    Create the parallel graph corpora constituting the data parts for
    development and validation

    @param setting: Setting instance specifying the experimental setting
    """
    if not setting.part:
        return
    log.info("\n" + header("PARTING STEP"))
    if setting.develop:
        create_part_files(setting.dev_parts,
                          base_dir=setting.corpus_dir,
                          part_dir=setting.part_dir,
                          max_size=setting.part_max_size)
    if setting.validate:
        create_part_files(setting.val_parts,
                          base_dir=setting.corpus_dir,
                          part_dir=setting.part_dir,
                          max_size=setting.part_max_size)
def match(setting):
    """
    Match data

    @param setting: Setting instance specifying the experimental setting
    """
    if not setting.match:
        return
    log.info("\n" + header("MATCH STEP"))
    # identical keyword arguments for both runs
    common = dict(descriptor=setting.descriptor,
                  n=setting.n,
                  binary=setting.binary)
    if setting.develop:
        match_files(setting.dev_inst_fns, setting.matcher, **common)
    if setting.validate:
        match_files(setting.val_inst_fns, setting.matcher, **common)
def classify(setting):
    """
    Classify corpus instances

    @param setting: Setting instance specifying the experimental setting
    """
    if not setting.classify:
        return
    log.info("\n" + header("CLASSIFY STEP"))
    makedirs(setting.clas_dir)
    # train on the sampled instances when sampling is enabled
    train_inst_fns = (setting.dev_samp_fns if setting.train_sample
                      else setting.dev_inst_fns)
    if setting.develop:
        # cross-validation over the development corpora
        classify_file_cv(train_inst_fns,
                         test_inst_fns=setting.dev_inst_fns,
                         out_fns=setting.make_out_fns(setting.dev_inst_fns),
                         log_fns=setting.make_log_fns(setting.dev_inst_fns),
                         descriptor=setting.descriptor,
                         timbl=setting.classifier,
                         options=setting.timbl_opts,
                         n=setting.n,
                         log=setting.timbl_log)
    if setting.validate:
        # single train/test run on the validation corpora
        classify_file(train_inst_fns,
                      setting.val_inst_fns,
                      out_fns=setting.make_out_fns(setting.val_inst_fns),
                      log_fn=setting.make_log_fname(setting.val_inst_fns[0]),
                      descriptor=setting.descriptor,
                      timbl=setting.classifier,
                      options=setting.timbl_opts,
                      log=setting.timbl_log)
def classify(setting):
    """
    Classify corpus instances

    @param setting: Setting instance specifying the experimental setting
    """
    if not setting.classify:
        return
    log.info("\n" + header("CLASSIFY STEP"))
    makedirs(setting.clas_dir)
    if setting.train_sample:
        # sampling enabled: train on sampled instance files
        training_fns = setting.dev_samp_fns
    else:
        training_fns = setting.dev_inst_fns
    if setting.develop:
        # cross-validation over the development corpora
        classify_file_cv(training_fns,
                         test_inst_fns=setting.dev_inst_fns,
                         out_fns=setting.make_out_fns(setting.dev_inst_fns),
                         log_fns=setting.make_log_fns(setting.dev_inst_fns),
                         descriptor=setting.descriptor,
                         timbl=setting.classifier,
                         options=setting.timbl_opts,
                         n=setting.n,
                         log=setting.timbl_log)
    if setting.validate:
        # single train/test run on the validation corpora
        first_val_fn = setting.val_inst_fns[0]
        classify_file(training_fns,
                      setting.val_inst_fns,
                      out_fns=setting.make_out_fns(setting.val_inst_fns),
                      log_fn=setting.make_log_fname(first_val_fn),
                      descriptor=setting.descriptor,
                      timbl=setting.classifier,
                      options=setting.timbl_opts,
                      log=setting.timbl_log)
def write_alignment_overall(self, out=stdout, percent=True):
    """
    write evaluation summary of alignment over all relations

    Writes two fixed-width tables: one with the raw counts per relation
    (plus a Sum row), one with the evaluation measures per relation
    (plus summary statistics per method/stat combination).

    @param out: output stream
    @param percent: if true, measure values are scaled to percentages
    """
    width = 14
    separator = 4 * width * "-" + "\n"
    header("Alignment over all relations", out)
    # --- counts table ---
    out.write("Relation:".ljust(width))
    for c in AlignCounts.count_keys:
        c = c.capitalize() + ":"
        out.write(c.rjust(width))
    out.write('\n' + separator)
    for rel, align_counts in zip(self.relations, self):
        out.write(rel.ljust(width))
        for c in AlignCounts.count_keys:
            s = str(self[rel].count_stats[c]["sum"])
            out.write(s.rjust(width))
        out.write('\n')
    out.write(separator)
    # totals over all relations
    out.write("Sum:".ljust(width))
    for k in AlignCounts.count_keys:
        s = str(self.count_stats[k]["sum"])
        out.write(s.rjust(width))
    out.write('\n\n\n')
    # --- measures table ---
    out.write("Relation:".ljust(width))
    for c in AlignCounts.measure_keys:
        c = c.capitalize() + ":"
        out.write(c.rjust(width))
    out.write('\n' + separator)
    for rel, align_counts in zip(self.relations, self):
        out.write(rel.ljust(width))
        for m in AlignCounts.measure_keys:
            # report the relation's micro mean here
            value = self[rel].measure_stats[m]["micro"]["mean"]
            if percent:
                value *= 100
            s = "%.2f" % value
            out.write(s.rjust(width))
        out.write('\n')
    out.write(separator)
    # summary statistics: one row per (method, stat) combination
    for method in self.measure_stat_methods:
        for stat in self.measure_stat_keys:
            s = method.capitalize() + " " + stat.capitalize() + ":"
            out.write(s.ljust(width))
            for m in AlignCounts.measure_keys:
                value = self.measure_stats[m][method][stat]
                if percent:
                    value *= 100
                s = "%.2f" % value
                out.write(s.rjust(width))
            out.write('\n')
    out.write('\n\n')
def write_alignment_only(self, out=stdout):
    """
    Write the evaluation of alignment only, i.e. regardless of the
    relation labels.

    @param out: output stream
    """
    # BUGFIX: corrected the typo "regardlless" in the printed heading
    header("Alignment only (regardless of relation)", out)
    self[AlignEval.any_rel].write(self.names, out=out)
def write_alignment_overall(self, out=stdout, percent=True):
    """
    write evaluation summary of alignment over all relations

    Writes two fixed-width tables: one with the raw counts per relation
    (plus a Sum row), one with the evaluation measures per relation
    (plus summary statistics per method/stat combination).

    NOTE(review): this file contains two near-identical definitions of
    write_alignment_overall; the later one wins at class-creation time.

    @param out: output stream
    @param percent: if true, measure values are scaled to percentages
    """
    width = 14
    separator = 4 * width * "-" + "\n"
    header("Alignment over all relations", out)
    # --- counts table ---
    out.write("Relation:".ljust(width))
    for c in AlignCounts.count_keys:
        c = c.capitalize() + ":"
        out.write(c.rjust(width))
    out.write('\n' + separator)
    for rel, align_counts in zip(self.relations, self):
        out.write(rel.ljust(width))
        for c in AlignCounts.count_keys:
            s = str(self[rel].count_stats[c]["sum"])
            out.write(s.rjust(width))
        out.write('\n')
    out.write(separator)
    # totals over all relations
    out.write("Sum:".ljust(width))
    for k in AlignCounts.count_keys:
        s = str(self.count_stats[k]["sum"])
        out.write(s.rjust(width))
    out.write('\n\n\n')
    # --- measures table ---
    out.write("Relation:".ljust(width))
    for c in AlignCounts.measure_keys:
        c = c.capitalize() + ":"
        out.write(c.rjust(width))
    out.write('\n' + separator)
    for rel, align_counts in zip(self.relations, self):
        out.write(rel.ljust(width))
        for m in AlignCounts.measure_keys:
            # report the relation's micro mean here
            value = self[rel].measure_stats[m]["micro"]["mean"]
            if percent:
                value *= 100
            s = "%.2f" % value
            out.write(s.rjust(width))
        out.write('\n')
    out.write(separator)
    # summary statistics: one row per (method, stat) combination
    for method in self.measure_stat_methods:
        for stat in self.measure_stat_keys:
            s = method.capitalize() + " " + stat.capitalize() + ":"
            out.write(s.ljust(width))
            for m in AlignCounts.measure_keys:
                value = self.measure_stats[m][method][stat]
                if percent:
                    value *= 100
                s = "%.2f" % value
                out.write(s.rjust(width))
            out.write('\n')
    out.write('\n\n')
def pgc_diff(corpus1, corpus2, corpus_name1="Corpus1", corpus_name2="Corpus2",
             annot1="Annot1", annot2="Annot2", words_only=False,
             show_comments=False, show_ident=False, relations=None,
             out=stdout):
    """
    reports the differences (and optionally the similarities) between the
    labeled alignments from two parallel graph corpora

    @param corpus1, corpus2: parallel graph corpora with identical graph
        pairs in identical order, differing only in their alignments
    @param corpus_name1, corpus_name2: corpus names for the report header
    @param annot1, annot2: annotator names for the report
    @param words_only: restrict the report to word (terminal) alignments
    @param show_comments: also print annotator comments per graph pair
    @param show_ident: also print alignments identical in both corpora
    @param relations: restrict to these relation labels (default: all
        relations of corpus1)
    @param out: output stream (Python 2 file-like)
    """
    assert len(corpus1) == len(corpus2)
    if not relations:
        relations = corpus1.get_relations()
    # counter for numbering the alignments when printing;
    # may be less than the actual number of alignments when identical
    # alignments are not printed (cf. show_ident option)
    align_count = 0
    # counter for numbering the graph pairs when printing
    pair_count = 0
    header("%s corpus: %s\n%s corpus: %s" % (annot1, corpus_name1,
                                             annot2, corpus_name2),
           width=120, char="#")
    for graph_pair1, graph_pair2 in zip(corpus1, corpus2):
        # assume that the corpora have the same graph pairs in the same
        # order, so the only difference is in the aligned nodes
        assert graph_pair1._banks == graph_pair2._banks
        assert graph_pair1._graphs_equal(graph_pair2)
        pair_count += 1
        ident = []
        rel_diff = []
        uniq1 = []
        uniq2 = []
        # recall that graphs are identical
        graphs = graph_pair1.get_graphs()
        for nodes, rel1 in graph_pair1.alignments_iter(relations=relations):
            # BUGFIX: the original called the non-existent method
            # "node_node_is_non_terminal" (duplicated "node_"), which
            # raised AttributeError whenever words_only was True
            if (words_only and
                graphs.source.node_is_non_terminal(nodes.source) and
                graphs.target.node_is_non_terminal(nodes.target)):
                continue
            rel2 = graph_pair2.get_align(nodes)
            if not rel2:
                uniq1.append(nodes)
            elif rel1 == rel2:
                ident.append(nodes)
            else:
                rel_diff.append(nodes)
        for nodes, rel2 in graph_pair2.alignments_iter(relations=relations):
            # NOTE(review): this filter tests node_is_terminal with "or",
            # the opposite sense of the first loop's non-terminal test --
            # confirm this asymmetry is intended
            if (words_only and
                (graphs.source.node_is_terminal(nodes.source) or
                 graphs.target.node_is_terminal(nodes.target))):
                continue
            if not graph_pair1.get_align(nodes):
                uniq2.append(nodes)
        #if not ( ident and rel_diff and uniq1 and uniq2 and show_comments ):
        #    continue
        header("Graph pair %d" % pair_count, width=120, char="=")
        print >>out, graphs.source.get_graph_token_string().encode("utf-8"), "\n"
        print >>out, graphs.target.get_graph_token_string().encode("utf-8"), "\n"
        if show_comments:
            print_comments(graph_pair1, annot1, out)
            print_comments(graph_pair2, annot2, out)
        if show_ident:
            ident.sort(cmp=cmp_nodes)
            align_count = print_alignments(align_count, "Identical",
                                           graph_pair1, graph_pair2,
                                           graphs, ident, out)
        rel_diff.sort(cmp=cmp_nodes)
        align_count = print_alignments(align_count, "Relation different",
                                       graph_pair1, graph_pair2, graphs,
                                       rel_diff, out)
        uniq1.sort(cmp=cmp_nodes)
        align_count = print_alignments(align_count, annot1 + " only",
                                       graph_pair1, graph_pair2, graphs,
                                       uniq1, out)
        uniq2.sort(cmp=cmp_nodes)
        align_count = print_alignments(align_count, annot2 + " only",
                                       graph_pair1, graph_pair2, graphs,
                                       uniq2, out)
def pgc_diff(corpus1, corpus2, corpus_name1="Corpus1", corpus_name2="Corpus2",
             annot1="Annot1", annot2="Annot2", words_only=False,
             show_comments=False, show_ident=False, relations=None,
             out=stdout):
    """
    reports the differences (and optionally the similarities) between the
    labeled alignments from two parallel graph corpora

    NOTE(review): this file contains two identical definitions of
    pgc_diff; the later one wins at import time -- consider removing one.

    @param corpus1, corpus2: parallel graph corpora with identical graph
        pairs in identical order, differing only in their alignments
    @param words_only: restrict the report to word (terminal) alignments
    @param show_comments: also print annotator comments per graph pair
    @param show_ident: also print alignments identical in both corpora
    @param relations: restrict to these relation labels (default: all
        relations of corpus1)
    @param out: output stream (Python 2 file-like)
    """
    assert len(corpus1) == len(corpus2)
    if not relations:
        relations = corpus1.get_relations()
    # counter for numbering the alignments when printing;
    # may be less than the actual number of alignments when identical
    # alignments are not printed (cf. show_ident option)
    align_count = 0
    # counter for numbering the graph pairs when printing
    pair_count = 0
    header("%s corpus: %s\n%s corpus: %s" % (annot1, corpus_name1,
                                             annot2, corpus_name2),
           width=120, char="#")
    for graph_pair1, graph_pair2 in zip(corpus1, corpus2):
        # assume that the corpora have the same graph pairs in the same
        # order, so the only difference is in the aligned nodes
        assert graph_pair1._banks == graph_pair2._banks
        assert graph_pair1._graphs_equal(graph_pair2)
        pair_count += 1
        ident = []
        rel_diff = []
        uniq1 = []
        uniq2 = []
        # recall that graphs are identical
        graphs = graph_pair1.get_graphs()
        for nodes, rel1 in graph_pair1.alignments_iter(relations=relations):
            # BUGFIX: the original called the non-existent method
            # "node_node_is_non_terminal" (duplicated "node_"), which
            # raised AttributeError whenever words_only was True
            if (words_only and
                graphs.source.node_is_non_terminal(nodes.source) and
                graphs.target.node_is_non_terminal(nodes.target)):
                continue
            rel2 = graph_pair2.get_align(nodes)
            if not rel2:
                uniq1.append(nodes)
            elif rel1 == rel2:
                ident.append(nodes)
            else:
                rel_diff.append(nodes)
        for nodes, rel2 in graph_pair2.alignments_iter(relations=relations):
            # NOTE(review): this filter tests node_is_terminal with "or",
            # the opposite sense of the first loop's non-terminal test --
            # confirm this asymmetry is intended
            if (words_only and
                (graphs.source.node_is_terminal(nodes.source) or
                 graphs.target.node_is_terminal(nodes.target))):
                continue
            if not graph_pair1.get_align(nodes):
                uniq2.append(nodes)
        #if not ( ident and rel_diff and uniq1 and uniq2 and show_comments ):
        #    continue
        header("Graph pair %d" % pair_count, width=120, char="=")
        print >> out, graphs.source.get_graph_token_string().encode(
            "utf-8"), "\n"
        print >> out, graphs.target.get_graph_token_string().encode(
            "utf-8"), "\n"
        if show_comments:
            print_comments(graph_pair1, annot1, out)
            print_comments(graph_pair2, annot2, out)
        if show_ident:
            ident.sort(cmp=cmp_nodes)
            align_count = print_alignments(align_count, "Identical",
                                           graph_pair1, graph_pair2,
                                           graphs, ident, out)
        rel_diff.sort(cmp=cmp_nodes)
        align_count = print_alignments(align_count, "Relation different",
                                       graph_pair1, graph_pair2, graphs,
                                       rel_diff, out)
        uniq1.sort(cmp=cmp_nodes)
        align_count = print_alignments(align_count, annot1 + " only",
                                       graph_pair1, graph_pair2, graphs,
                                       uniq1, out)
        uniq2.sort(cmp=cmp_nodes)
        align_count = print_alignments(align_count, annot2 + " only",
                                       graph_pair1, graph_pair2, graphs,
                                       uniq2, out)
def exp_dev_fast(setting):
    """
    perform a fast alignment experiment on development data

    Weighting, matching and merging takes place per test corpus without
    writing intermediary results to a file.

    @param setting: Setting instance specifying the experimental setting
    """
    # fast mode only supports development data
    assert setting.develop and not setting.validate
    exp_init(setting)
    create_parts(setting)
    # It's impossible to do extraction one corpus a time, because in order to
    # classify a test corpus you need instances for all the other training
    # corpora! Moreover, since Timbl classification is file-based, we need to
    # write the corpus instance files to disk. These files can be huge and
    # keeping all of them in memory seems to offer little benefit.
    extract(setting)
    sample(setting)
    # Timbl writes its output to a file, which then needs to be parsed in
    # order to insert the class predictions and weights into the corpus
    # instances. That means there is no advantage to doing classification
    # one corpus a time.
    classify(setting)
    log.info("\n" + header("WEIGHT/MATCH/MERGE STEP"))
    # reset evaluator
    if setting.evaluate:
        setting.evaluator.__init__()
    # process at most the first n (instance, classifier-output, true) triples
    scope = zip(setting.dev_inst_fns, setting.dev_clas_fns,
                setting.dev_true_fns)[:setting.n]
    for inst_fname, out_fname, true_fname in scope:
        log.info("reading corpus instances {0}".format(inst_fname))
        corpus_inst = CorpusInst()
        corpus_inst.loadtxt(inst_fname, setting.descriptor.dtype)
        if setting.weight:
            log.info("reading classifier output {0}".format(out_fname))
            timbl_out = parse_timbl_output(open(out_fname))
            log.info("weighting...")
            weight_corpus(corpus_inst, timbl_out, setting.weight_func)
        if setting.match:
            log.info("matching...")
            match_corpus(corpus_inst, setting.matcher)
        if setting.merge:
            log.info("reading true corpus {0}".format(true_fname))
            # graphs themselves are not needed for merging
            true_corpus = ParallelGraphCorpus(inf=true_fname,
                                              graph_loading=LOAD_NONE)
            log.info("merging...")
            pred_corpus = merge_corpus(corpus_inst, true_corpus,
                                       setting.merger)
            if setting.evaluate:
                # corpus name is the true filename's prefix up to "_"
                name = os.path.basename(true_fname).split("_")[0]
                setting.evaluator.add(true_corpus, pred_corpus, name)
    # run and save the accumulated evaluation after all corpora are done
    if setting.evaluate:
        log.info("evaluting...")
        setting.evaluator.run_eval()
        log.info("saving evaluation {0}".format(setting.dev_eval_fname))
        makedirs(setting.eval_dir)
        setting.evaluator.write(setting.dev_eval_fname)
    exp_exit(setting)
def exp_exit(setting):
    # Final bookkeeping for an experiment: persist the setting, produce
    # the feature-weight graphs, and log the setting's final state.
    log.info("\n%s" % header("EXIT"))
    pickle(setting)
    feat_weight_graphs(setting)
    log.info("Setting at end:\n%s\n" % setting)
def exp_dev_fast(setting):
    """
    perform a fast alignment experiment on development data

    Weighting, matching and merging takes place per test corpus without
    writing intermediary results to a file.

    NOTE(review): this file contains two identical definitions of
    exp_dev_fast; the later one wins at import time.

    @param setting: Setting instance specifying the experimental setting
    """
    # fast mode only supports development data
    assert setting.develop and not setting.validate
    exp_init(setting)
    create_parts(setting)
    # It's impossible to do extraction one corpus a time, because in order to
    # classify a test corpus you need instances for all the other training
    # corpora! Moreover, since Timbl classification is file-based, we need to
    # write the corpus instance files to disk. These files can be huge and
    # keeping all of them in memory seems to offer little benefit.
    extract(setting)
    sample(setting)
    # Timbl writes its output to a file, which then needs to be parsed in
    # order to insert the class predictions and weights into the corpus
    # instances. That means there is no advantage to doing classification
    # one corpus a time.
    classify(setting)
    log.info("\n" + header("WEIGHT/MATCH/MERGE STEP"))
    # reset evaluator
    if setting.evaluate:
        setting.evaluator.__init__()
    # process at most the first n (instance, classifier-output, true) triples
    scope = zip(setting.dev_inst_fns, setting.dev_clas_fns,
                setting.dev_true_fns)[:setting.n]
    for inst_fname, out_fname, true_fname in scope:
        log.info("reading corpus instances {0}".format(inst_fname))
        corpus_inst = CorpusInst()
        corpus_inst.loadtxt(inst_fname, setting.descriptor.dtype)
        if setting.weight:
            log.info("reading classifier output {0}".format(out_fname))
            timbl_out = parse_timbl_output(open(out_fname))
            log.info("weighting...")
            weight_corpus(corpus_inst, timbl_out, setting.weight_func)
        if setting.match:
            log.info("matching...")
            match_corpus(corpus_inst, setting.matcher)
        if setting.merge:
            log.info("reading true corpus {0}".format(true_fname))
            # graphs themselves are not needed for merging
            true_corpus = ParallelGraphCorpus(inf=true_fname,
                                              graph_loading=LOAD_NONE)
            log.info("merging...")
            pred_corpus = merge_corpus(corpus_inst, true_corpus,
                                       setting.merger)
            if setting.evaluate:
                # corpus name is the true filename's prefix up to "_"
                name = os.path.basename(true_fname).split("_")[0]
                setting.evaluator.add(true_corpus, pred_corpus, name)
    # run and save the accumulated evaluation after all corpora are done
    if setting.evaluate:
        log.info("evaluting...")
        setting.evaluator.run_eval()
        log.info("saving evaluation {0}".format(setting.dev_eval_fname))
        makedirs(setting.eval_dir)
        setting.evaluator.write(setting.dev_eval_fname)
    exp_exit(setting)