def weight_files(inst_fns, out_fns, weight_func=entropy_weight,
                 descriptor=None, n=None, binary=False):
    """
    Weight corpus instance files

    For every pair of corpus instance file and Timbl output file, the
    instances are loaded, the Timbl output is parsed, the weights are
    computed by weight_corpus, and the corpus instances are saved again.
    Files are paired positionally; pairing stops at the shorter list.

    @param inst_fns: list of corpus instance filenames

    @param out_fns: list of filenames containing Timbl output

    @keyword weight_func: weighting function, passed on to weight_corpus

    @keyword descriptor: a Descriptor instance, required if corpus instances
    are loaded in text format (its dtype attribute is passed to loadtxt)

    @keyword n: limit weighting to the first n files (None means all files)

    @keyword binary: if true, corpus instances are loaded with loadbin
    instead of loadtxt
    """
    # Materialise the pairs before slicing: on Python 3 zip() returns an
    # iterator that cannot be sliced, whereas a list slices on both 2 and 3.
    file_pairs = list(zip(inst_fns, out_fns))[:n]

    for inst_fname, out_fname in file_pairs:
        corpus_inst = CorpusInst()

        if binary:
            corpus_inst.loadbin(inst_fname)
        else:
            corpus_inst.loadtxt(inst_fname, descriptor.dtype)

        # Keep the weighting inside the "with" block, in case the parser
        # reads the Timbl output lazily; the file is closed right afterwards
        # instead of being left to the garbage collector.
        with open(out_fname) as out_file:
            timbl_out = parse_timbl_output(out_file)
            weight_corpus(corpus_inst, timbl_out, weight_func)

        log.info("saving weighted corpus instances to {0}".format(inst_fname))
        # NOTE(review): save() is called without a filename; presumably it
        # writes back to the file the instances were loaded from -- confirm
        # against CorpusInst.
        corpus_inst.save()
def test_weight_corpus(self):
    """
    Check that weight_corpus changes at least one predicted weight

    The predicted weights of the first graph's instances are zeroed, the
    corpus is weighted with entropy_weight using the first development
    classifier output, and at least one weight must then differ from zero.
    """
    st = create_setting()
    corpus_inst = CorpusInst()
    inst_fname = st.dev_inst_fns[0]
    corpus_inst.loadtxt(inst_fname, st.descriptor.dtype)
    graph_inst = corpus_inst[0]

    # clear predicted weights field
    graph_inst["pred_weight"] = 0.0
    # backup original for comparison later on
    weight_before = graph_inst["pred_weight"].copy()

    out_fname = st.dev_clas_fns[0]
    # Weight inside the "with" block, in case the parser reads lazily; the
    # classifier output file is closed as soon as weighting is done.
    with open(out_fname) as out_file:
        timbl_out = parse_timbl_output(out_file)
        weight_corpus(corpus_inst, timbl_out, entropy_weight)

    # check that at least one weight is different (i.e. not 0.0)
    self.assertTrue(any(graph_inst["pred_weight"] != weight_before))
def exp_dev_fast(setting):
    """
    perform a fast alignment experiment on development data

    Weighting, matching and merging takes place per test corpus without
    writing intermediary results to a file.

    @param setting: experiment setting; must have develop enabled and
    validate disabled. Its weight, match, merge and evaluate flags select
    which steps are run for each development corpus.
    """
    assert setting.develop and not setting.validate

    exp_init(setting)
    create_parts(setting)

    # It's impossible to do extraction one corpus a time, because in order to
    # classify a test corpus you need instances for all the other training
    # corpora! Moreover, since Timbl classification is file-based, we need to
    # write the corpus instance files to disk. These files can be huge and
    # keeping all of them in memory seems to offer little benefit.
    extract(setting)

    sample(setting)

    # Timbl writes its output to a file, which then needs to be parsed in
    # order to insert the class predictions and weights into the corpus
    # instances. That means there is no advantage to doing classification
    # one corpus a time.
    classify(setting)

    log.info("\n" + header("WEIGHT/MATCH/MERGE STEP"))

    # reset evaluator
    if setting.evaluate:
        setting.evaluator.__init__()

    # Materialise the triples before slicing: on Python 3 zip() returns an
    # iterator that cannot be sliced, whereas a list slices on both 2 and 3.
    scope = list(zip(setting.dev_inst_fns,
                     setting.dev_clas_fns,
                     setting.dev_true_fns))[:setting.n]

    for inst_fname, out_fname, true_fname in scope:
        log.info("reading corpus instances {0}".format(inst_fname))
        corpus_inst = CorpusInst()
        corpus_inst.loadtxt(inst_fname, setting.descriptor.dtype)

        if setting.weight:
            log.info("reading classifier output {0}".format(out_fname))
            # Weight inside the "with" block, in case the parser reads
            # lazily; the file is closed before the next corpus is handled.
            with open(out_fname) as out_file:
                timbl_out = parse_timbl_output(out_file)
                log.info("weighting...")
                weight_corpus(corpus_inst, timbl_out, setting.weight_func)

        if setting.match:
            log.info("matching...")
            match_corpus(corpus_inst, setting.matcher)

        if setting.merge:
            log.info("reading true corpus {0}".format(true_fname))
            true_corpus = ParallelGraphCorpus(inf=true_fname,
                                              graph_loading=LOAD_NONE)
            log.info("merging...")
            pred_corpus = merge_corpus(corpus_inst, true_corpus,
                                       setting.merger)

        if setting.evaluate:
            # NOTE(review): true_corpus and pred_corpus are only bound by the
            # merge step above; evaluating with merge disabled would fail on
            # an unbound name -- confirm evaluate always implies merge.
            # The evaluation name is the part of the true corpus file's base
            # name before the first underscore.
            name = os.path.basename(true_fname).split("_")[0]
            setting.evaluator.add(true_corpus, pred_corpus, name)

    if setting.evaluate:
        log.info("evaluting...")
        setting.evaluator.run_eval()
        log.info("saving evaluation {0}".format(setting.dev_eval_fname))
        makedirs(setting.eval_dir)
        setting.evaluator.write(setting.dev_eval_fname)

    exp_exit(setting)
def exp_dev_fast(setting):
    """
    perform a fast alignment experiment on development data

    Weighting, matching and merging takes place per test corpus without
    writing intermediary results to a file.

    @param setting: experiment setting; must have develop enabled and
    validate disabled. Its weight, match, merge and evaluate flags select
    which steps are run for each development corpus.
    """
    assert setting.develop and not setting.validate

    exp_init(setting)
    create_parts(setting)

    # It's impossible to do extraction one corpus a time, because in order to
    # classify a test corpus you need instances for all the other training
    # corpora! Moreover, since Timbl classification is file-based, we need to
    # write the corpus instance files to disk. These files can be huge and
    # keeping all of them in memory seems to offer little benefit.
    extract(setting)

    sample(setting)

    # Timbl writes its output to a file, which then needs to be parsed in
    # order to insert the class predictions and weights into the corpus
    # instances. That means there is no advantage to doing classification
    # one corpus a time.
    classify(setting)

    log.info("\n" + header("WEIGHT/MATCH/MERGE STEP"))

    # reset evaluator
    if setting.evaluate:
        setting.evaluator.__init__()

    # Materialise the triples before slicing: on Python 3 zip() returns an
    # iterator that cannot be sliced, whereas a list slices on both 2 and 3.
    scope = list(zip(setting.dev_inst_fns,
                     setting.dev_clas_fns,
                     setting.dev_true_fns))[:setting.n]

    for inst_fname, out_fname, true_fname in scope:
        log.info("reading corpus instances {0}".format(inst_fname))
        corpus_inst = CorpusInst()
        corpus_inst.loadtxt(inst_fname, setting.descriptor.dtype)

        if setting.weight:
            log.info("reading classifier output {0}".format(out_fname))
            # Weight inside the "with" block, in case the parser reads
            # lazily; the file is closed before the next corpus is handled.
            with open(out_fname) as out_file:
                timbl_out = parse_timbl_output(out_file)
                log.info("weighting...")
                weight_corpus(corpus_inst, timbl_out, setting.weight_func)

        if setting.match:
            log.info("matching...")
            match_corpus(corpus_inst, setting.matcher)

        if setting.merge:
            log.info("reading true corpus {0}".format(true_fname))
            true_corpus = ParallelGraphCorpus(inf=true_fname,
                                              graph_loading=LOAD_NONE)
            log.info("merging...")
            pred_corpus = merge_corpus(corpus_inst, true_corpus,
                                       setting.merger)

        if setting.evaluate:
            # NOTE(review): true_corpus and pred_corpus are only bound by the
            # merge step above; evaluating with merge disabled would fail on
            # an unbound name -- confirm evaluate always implies merge.
            # The evaluation name is the part of the true corpus file's base
            # name before the first underscore.
            name = os.path.basename(true_fname).split("_")[0]
            setting.evaluator.add(true_corpus, pred_corpus, name)

    if setting.evaluate:
        log.info("evaluting...")
        setting.evaluator.run_eval()
        log.info("saving evaluation {0}".format(setting.dev_eval_fname))
        makedirs(setting.eval_dir)
        setting.evaluator.write(setting.dev_eval_fname)

    exp_exit(setting)