def test_classify(self):
    corpus_inst = CorpusInst()
    descriptor = create_setting().descriptor
    corpus_inst.loadtxt("exp/inst/dev001.inst", descriptor.dtype)
    graph_inst = corpus_inst[0]

    # clear relevant fields
    graph_inst["pred_relation"] = "None"
    graph_inst["pred_weight"] = 0.0

    # backup original for comparison later on
    pred_before = graph_inst["pred_relation"].copy()
    weight_before = graph_inst["pred_weight"].copy()

    classifier = TimblClassifier(descriptor, "exp/inst/dev001.inst")
    classifier.classify(graph_inst)

    # delete classifier to make sure that the server is killed,
    # even if the test fails
    del classifier

    # check that at least one prediction is different (i.e. not None)
    self.assertTrue(any(graph_inst["pred_relation"] != pred_before))
    # check that at least one weight is different (i.e. not 0.0)
    self.assertTrue(any(graph_inst["pred_weight"] != weight_before))
def weight_files(inst_fns, out_fns, weight_func=entropy_weight,
                 descriptor=None, n=None, binary=False):
    """
    Weight corpus instance files

    @param inst_fns: list of corpus instance filenames

    @param out_fns: list of filenames containing Timbl output

    @keyword weight_func: weighting function

    @keyword descriptor: a Descriptor instance, required if corpus
    instances are loaded in text format

    @keyword n: limit weighting to the first n files

    @keyword binary: corpus instances in binary rather than text format
    """
    for inst_fname, out_fname in zip(inst_fns, out_fns)[:n]:
        corpus_inst = CorpusInst()

        if binary:
            corpus_inst.loadbin(inst_fname)
        else:
            corpus_inst.loadtxt(inst_fname, descriptor.dtype)

        timbl_out = parse_timbl_output(open(out_fname))
        weight_corpus(corpus_inst, timbl_out, weight_func)

        log.info("saving weighted corpus instances to {0}".format(inst_fname))
        corpus_inst.save()
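# Hedged usage sketch for weight_files (not from the original source): the
# instance file path matches the tests above, but "exp/clas/dev001.out" is a
# hypothetical Timbl output path, and create_setting() is assumed to be
# importable here and to provide the matching Descriptor.
def example_weight_files():
    descriptor = create_setting().descriptor
    # weight only the first file pair; entropy_weight is the default
    weight_files(["exp/inst/dev001.inst"],
                 ["exp/clas/dev001.out"],
                 weight_func=entropy_weight,
                 descriptor=descriptor,
                 n=1)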
def test_extract_dev(self):
    st = create_setting()
    st.validate = False

    # create tmp dirs for extraction output
    st.inst_dir = tempfile.mkdtemp()
    st.true_dir = tempfile.mkdtemp()

    extract(st)

    # check number of files
    self.assertEqual(len(st.dev_true_fns), len(st.dev_part_fns))
    self.assertEqual(len(st.dev_inst_fns), len(st.dev_part_fns))

    # test loading a corpus file
    corpus = ParallelGraphCorpus(inf=st.dev_true_fns[0])

    # test loading an instances file
    inst = CorpusInst()
    inst.loadtxt(st.dev_inst_fns[0], st.descriptor.dtype)

    self.assertEqual(len(corpus), len(inst))

    clean_inst(st)
    clean_true(st)
def test_merge(self):
    corpus_inst = CorpusInst()
    dtype = create_setting().descriptor.dtype
    corpus_inst.loadtxt("exp/inst/dev001.inst", dtype)
    graph_inst = corpus_inst[0]

    pgc = ParallelGraphCorpus(inf="exp/true/dev001_true.pgc")
    graph_pair = pgc[0]

    gm = Merger()
    gm.merge(graph_inst, graph_pair)
def test_extract_with_pp_graph_hooks(self):
    """
    test extracting a feature with a preprocessing graph hook
    """
    st = create_setting()
    st.validate = False

    # create tmp dirs for extraction output
    st.inst_dir = tempfile.mkdtemp()
    st.true_dir = tempfile.mkdtemp()

    # a preprocessing function which inserts an attribute "x" with value
    # "y" on every node in the graphs
    def pp_hook1(graphs):
        for g in graphs:
            for attrs in g.node.values():
                attrs[u"x"] = u"y"

    # a feature function which relies on the pp_hook above
    def ff_x(nodes, graphs, **kwargs):
        return graphs.source.node[nodes.source][u"x"]

    # create a feature description
    f = Feat(ff_x, "S1", pp_graph_hooks=[pp_hook1])

    # add to features; descriptor and extractor are automatically derived
    st.features = (f,)

    extract(st)

    # check number of files
    self.assertEqual(len(st.dev_true_fns), len(st.dev_part_fns))
    self.assertEqual(len(st.dev_inst_fns), len(st.dev_part_fns))

    # test loading a corpus file
    corpus = ParallelGraphCorpus(inf=st.dev_true_fns[0])

    # test loading an instances file
    inst = CorpusInst()
    inst.loadtxt(st.dev_inst_fns[0], st.descriptor.dtype)

    self.assertEqual(len(corpus), len(inst))

    # check values produced by the preprocessing function
    self.assertTrue(all(inst[0]["x"] == "y"))

    clean_inst(st)
    clean_true(st)
def test_match(self):
    corpus_inst = CorpusInst()
    dtype = create_setting().descriptor.dtype
    corpus_inst.loadtxt("exp/inst/dev001.inst", dtype)
    graph_inst = corpus_inst[0]

    graph_inst["match_relation"] = "None"
    match_before = graph_inst["match_relation"].copy()

    gm = Matcher()
    gm.match(graph_inst)

    # check that at least one match is different (i.e. not None)
    self.assertTrue(any(graph_inst["match_relation"] != match_before))
def test_match_corpus(self):
    st = create_setting()

    corpus_inst = CorpusInst()
    inst_fname = st.dev_inst_fns[0]
    corpus_inst.loadtxt(inst_fname, st.descriptor.dtype)
    graph_inst = corpus_inst[0]

    # clear match_relation field
    graph_inst["match_relation"] = str(None)

    # backup original for comparison later on
    match_before = graph_inst["match_relation"].copy()

    match_corpus(corpus_inst, Matcher())

    # check that at least one relation is different (i.e. not None)
    self.assertTrue(any(graph_inst["match_relation"] != match_before))
def test_merge_corpus(self):
    st = create_setting()

    corpus_inst = CorpusInst()
    inst_fname = st.dev_inst_fns[0]
    corpus_inst.loadtxt(inst_fname, st.descriptor.dtype)

    true_fname = st.dev_true_fns[0]
    true_corpus = ParallelGraphCorpus(inf=true_fname,
                                      graph_loading=LOAD_NONE)

    pred_corpus = merge_corpus(corpus_inst, true_corpus, Merger())
    self.assertTrue(len(pred_corpus))

    for graph_inst, graph_pair in zip(corpus_inst, pred_corpus):
        for inst in graph_inst:
            rel = inst["match_relation"]

            if rel != str(None):
                nodes = Pair(inst["source_node"], inst["target_node"])
                self.assertEqual(graph_pair.get_align(nodes), rel)
def test_weight_corpus(self):
    st = create_setting()

    corpus_inst = CorpusInst()
    inst_fname = st.dev_inst_fns[0]
    corpus_inst.loadtxt(inst_fname, st.descriptor.dtype)
    graph_inst = corpus_inst[0]

    # clear predicted weights field
    graph_inst["pred_weight"] = 0.0

    # backup original for comparison later on
    weight_before = graph_inst["pred_weight"].copy()

    out_fname = st.dev_clas_fns[0]
    timbl_out = parse_timbl_output(open(out_fname))

    weight_corpus(corpus_inst, timbl_out, entropy_weight)

    # check that at least one weight is different (i.e. not 0.0)
    self.assertTrue(any(graph_inst["pred_weight"] != weight_before))
def merge_files(inst_fns, true_fns, pred_fns, merger=Merger(),
                descriptor=None, n=None, binary=False):
    """
    Merge corpus instance files

    @param inst_fns: list of corpus instance filenames

    @param true_fns: list of corpus filenames containing the true alignments

    @param pred_fns: list of predicted corpus filenames to be created

    @keyword merger: instance of Merger class for merging instances into a
    graph pair

    @keyword descriptor: a Descriptor instance, required if corpus
    instances are loaded in text format

    @keyword n: limit merging to the first n files

    @keyword binary: corpus instances in binary rather than text format
    """
    assert isinstance(merger, Merger)
    assert len(inst_fns) == len(true_fns) > 0

    for inst_fname, true_fname, pred_fname in zip(inst_fns, true_fns,
                                                  pred_fns)[:n]:
        corpus_inst = CorpusInst()

        if binary:
            corpus_inst.loadbin(inst_fname)
        else:
            corpus_inst.loadtxt(inst_fname, descriptor.dtype)

        true_corpus = ParallelGraphCorpus(inf=true_fname,
                                          graph_loading=LOAD_NONE)
        pred_corpus = merge_corpus(corpus_inst, true_corpus, merger)

        log.info("saving predicted corpus {0}".format(pred_fname))
        pred_corpus.write(pred_fname)
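# Hedged usage sketch for merge_files (not from the original source): the
# instance and true-corpus paths follow the tests above, while
# "exp/pred/dev001_pred.pgc" is a hypothetical output path; create_setting()
# is assumed to provide the matching Descriptor.
def example_merge_files():
    descriptor = create_setting().descriptor
    merge_files(["exp/inst/dev001.inst"],
                ["exp/true/dev001_true.pgc"],
                ["exp/pred/dev001_pred.pgc"],  # hypothetical output file
                merger=Merger(),
                descriptor=descriptor)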
def update_feats(features, org_inst_fns, org_descriptor, update_inst_fns,
                 update_descriptor):
    """
    A quick hack to facilitate second-stage classification. Updates the
    values of the specified features in the original corpus instances with
    those from the update corpus instances. Modifies the original files!

    @param features: tuple of Feat instances, which provides the names of
    the features to update

    @param org_inst_fns: list of original corpus instance filenames

    @param org_descriptor: feature description for original instances

    @param update_inst_fns: list of corpus instance filenames containing
    the updated features

    @param update_descriptor: feature description for update instances
    """
    assert len(org_inst_fns) == len(update_inst_fns)

    for org_fn, up_fn in zip(org_inst_fns, update_inst_fns):
        print "Updating %s with %s" % (org_fn, up_fn)

        org_corpus_inst = CorpusInst()
        org_corpus_inst.loadtxt(org_fn, org_descriptor.dtype)

        up_corpus_inst = CorpusInst()
        up_corpus_inst.loadtxt(up_fn, update_descriptor.dtype)

        assert len(org_corpus_inst) == len(up_corpus_inst)

        for org_inst, up_inst in zip(org_corpus_inst, up_corpus_inst):
            for feat in features:
                org_inst[feat.name] = up_inst[feat.name]

        org_corpus_inst.savetxt(org_fn)
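# Hedged usage sketch for update_feats (not from the original source): both
# settings come from create_setting() here purely for illustration; a real
# second-stage run would use a second setting with its own instance files,
# features and descriptor.
def example_update_feats():
    org_st = create_setting()  # assumed first-stage setting
    up_st = create_setting()   # assumed second-stage setting
    # copy the values of the first feature only
    update_feats(org_st.features[:1],
                 org_st.dev_inst_fns, org_st.descriptor,
                 up_st.dev_inst_fns, up_st.descriptor)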
def match_files(inst_fns, matcher, descriptor=None, n=None, binary=False):
    """
    Match corpus instance files

    @param inst_fns: list of corpus instance filenames

    @param matcher: a Matcher instance for matching source to target
    instances

    @keyword descriptor: a Descriptor instance, required if corpus
    instances are loaded in text format

    @keyword n: limit matching to the first n files

    @keyword binary: corpus instances in binary rather than text format
    """
    for inst_fname in inst_fns[:n]:
        corpus_inst = CorpusInst()

        if binary:
            corpus_inst.loadbin(inst_fname)
        else:
            corpus_inst.loadtxt(inst_fname, descriptor.dtype)

        match_corpus(corpus_inst, matcher)

        log.info("saving matched corpus instances to {0}".format(inst_fname))
        corpus_inst.save()
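# Hedged usage sketch for match_files (not from the original source),
# reusing the instance file from the tests above with a default Matcher;
# create_setting() is assumed to provide the matching Descriptor.
def example_match_files():
    descriptor = create_setting().descriptor
    match_files(["exp/inst/dev001.inst"],
                Matcher(),
                descriptor=descriptor,
                n=1)  # match only the first file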
def exp_dev_fast(setting):
    """
    perform a fast alignment experiment on development data

    Weighting, matching and merging take place per test corpus, without
    writing intermediary results to a file.
    """
    assert setting.develop and not setting.validate
    exp_init(setting)

    create_parts(setting)

    # It's impossible to do extraction one corpus at a time, because in
    # order to classify a test corpus you need instances for all the other
    # training corpora! Moreover, since Timbl classification is file-based,
    # we need to write the corpus instance files to disk. These files can
    # be huge, and keeping all of them in memory seems to offer little
    # benefit.
    extract(setting)
    sample(setting)

    # Timbl writes its output to a file, which then needs to be parsed in
    # order to insert the class predictions and weights into the corpus
    # instances. That means there is no advantage to doing classification
    # one corpus at a time.
    classify(setting)

    log.info("\n" + header("WEIGHT/MATCH/MERGE STEP"))

    # reset evaluator
    if setting.evaluate:
        setting.evaluator.__init__()

    scope = zip(setting.dev_inst_fns,
                setting.dev_clas_fns,
                setting.dev_true_fns)[:setting.n]

    for inst_fname, out_fname, true_fname in scope:
        log.info("reading corpus instances {0}".format(inst_fname))
        corpus_inst = CorpusInst()
        corpus_inst.loadtxt(inst_fname, setting.descriptor.dtype)

        if setting.weight:
            log.info("reading classifier output {0}".format(out_fname))
            timbl_out = parse_timbl_output(open(out_fname))
            log.info("weighting...")
            weight_corpus(corpus_inst, timbl_out, setting.weight_func)

        if setting.match:
            log.info("matching...")
            match_corpus(corpus_inst, setting.matcher)

        if setting.merge:
            log.info("reading true corpus {0}".format(true_fname))
            true_corpus = ParallelGraphCorpus(inf=true_fname,
                                              graph_loading=LOAD_NONE)
            log.info("merging...")
            pred_corpus = merge_corpus(corpus_inst, true_corpus,
                                       setting.merger)

        if setting.evaluate:
            name = os.path.basename(true_fname).split("_")[0]
            setting.evaluator.add(true_corpus, pred_corpus, name)

    if setting.evaluate:
        log.info("evaluating...")
        setting.evaluator.run_eval()
        log.info("saving evaluation {0}".format(setting.dev_eval_fname))
        makedirs(setting.eval_dir)
        setting.evaluator.write(setting.dev_eval_fname)

    exp_exit(setting)
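# Hedged usage sketch for exp_dev_fast (not from the original source): it
# assumes create_setting() returns a development setting with the weight,
# match, merge and evaluate steps enabled, as the assertions and flags in
# the function above suggest.
def example_exp_dev_fast():
    st = create_setting()
    st.validate = False
    st.n = 2  # limit the run to the first two corpus files
    exp_dev_fast(st)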