def exp(setting):
    """
    Run a complete alignment experiment.

    Every stage of the pipeline is invoked exactly once, in the fixed order
    listed below, and each stage receives the same setting object.
    """
    # Order matters: initialisation first, clean-up last.
    pipeline = (
        exp_init,
        create_parts,
        extract,
        sample,
        classify,
        weight,
        match,
        merge,
        evaluate,
        exp_exit,
    )

    for stage in pipeline:
        stage(setting)
def create_sample_parts(setting):
    """
    Create the sample parts (in the "./parts" subdir, according to the
    original description).

    Expects a sample corpus under the "corpus" subdir together with the
    matching partitioning in ./partition.py. The maximum part size is forced
    to 1 (a single pgc file per part) so that testing stays fast.

    Documented output: part/dev001.pgc, ..., part/dev004.pgc and
    part/val001.pgc
    """
    # The partition module is imported locally on purpose: with a
    # module-level import, create_partition could no longer be used to
    # generate a new partition.
    import partition as sample_partition

    setting.part = True
    setting.dev_parts = sample_partition.dev_parts
    setting.val_parts = sample_partition.val_parts
    # one pgc file per part keeps the test run short
    setting.part_max_size = 1

    create_parts(setting)
def exp_dev_fast(setting):
    """
    Perform a fast alignment experiment on development data.

    Extraction, sampling and classification run for all corpora up front.
    Weighting, matching and merging then take place per test corpus, in
    memory, without writing intermediary results to a file.

    The setting object must have "develop" enabled and "validate" disabled.
    At most setting.n development corpora are processed. When
    setting.evaluate is enabled, the evaluator is reset first, fed with every
    processed corpus, and its results are written to setting.dev_eval_fname.
    """
    # NOTE(review): this file contains a second definition of exp_dev_fast
    # further down. Being the later one, it is the binding in effect once the
    # module has been loaded - consider removing one of the two.
    assert setting.develop and not setting.validate

    exp_init(setting)

    create_parts(setting)

    # It's impossible to do extraction one corpus a time, because in order to
    # classify a test corpus you need instances for all the other training
    # corpora! Moreover, since Timbl classification is file-based, we need to
    # write the corpus instance files to disk. These files can be huge and
    # keeping all of them in memory seems to offer little benefit.
    extract(setting)
    sample(setting)

    # Timbl writes its output to a file, which then needs to be parsed in
    # order to insert the class predictions and weights into the corpus
    # instances. That means there is no advantage to doing classification
    # one corpus a time.
    classify(setting)

    log.info("\n" + header("WEIGHT/MATCH/MERGE STEP"))

    # reset evaluator
    if setting.evaluate:
        setting.evaluator.__init__()

    # zip() returns a non-subscriptable iterator on Python 3, so materialise
    # it before slicing; on Python 2 this merely copies the list.
    scope = list(zip(setting.dev_inst_fns,
                     setting.dev_clas_fns,
                     setting.dev_true_fns))[:setting.n]

    for inst_fname, out_fname, true_fname in scope:
        log.info("reading corpus instances {0}".format(inst_fname))
        corpus_inst = CorpusInst()
        corpus_inst.loadtxt(inst_fname, setting.descriptor.dtype)

        if setting.weight:
            log.info("reading classifier output {0}".format(out_fname))
            # Close the classifier output file deterministically. The file
            # stays open until weighting is done, in case the parsed output
            # is consumed lazily by weight_corpus.
            with open(out_fname) as timbl_file:
                timbl_out = parse_timbl_output(timbl_file)
                log.info("weighting...")
                weight_corpus(corpus_inst, timbl_out, setting.weight_func)

        if setting.match:
            log.info("matching...")
            match_corpus(corpus_inst, setting.matcher)

        if setting.merge:
            log.info("reading true corpus {0}".format(true_fname))
            true_corpus = ParallelGraphCorpus(inf=true_fname,
                                              graph_loading=LOAD_NONE)
            log.info("merging...")
            pred_corpus = merge_corpus(corpus_inst, true_corpus,
                                       setting.merger)

        if setting.evaluate:
            # true_corpus and pred_corpus are bound only by the merge step
            # above, so evaluation requires setting.merge to be enabled too.
            name = os.path.basename(true_fname).split("_")[0]
            setting.evaluator.add(true_corpus, pred_corpus, name)

    if setting.evaluate:
        log.info("evaluting...")
        setting.evaluator.run_eval()
        log.info("saving evaluation {0}".format(setting.dev_eval_fname))
        makedirs(setting.eval_dir)
        setting.evaluator.write(setting.dev_eval_fname)

    exp_exit(setting)
def exp_dev_fast(setting):
    """
    Perform a fast alignment experiment on development data.

    Extraction, sampling and classification run for all corpora up front.
    Weighting, matching and merging then take place per test corpus, in
    memory, without writing intermediary results to a file.

    The setting object must have "develop" enabled and "validate" disabled.
    At most setting.n development corpora are processed. When
    setting.evaluate is enabled, the evaluator is reset first, fed with every
    processed corpus, and its results are written to setting.dev_eval_fname.
    """
    # NOTE(review): an earlier definition of exp_dev_fast exists in this
    # file. This later definition replaces it when the module is loaded -
    # consider removing one of the two.
    assert setting.develop and not setting.validate

    exp_init(setting)

    create_parts(setting)

    # It's impossible to do extraction one corpus a time, because in order to
    # classify a test corpus you need instances for all the other training
    # corpora! Moreover, since Timbl classification is file-based, we need to
    # write the corpus instance files to disk. These files can be huge and
    # keeping all of them in memory seems to offer little benefit.
    extract(setting)
    sample(setting)

    # Timbl writes its output to a file, which then needs to be parsed in
    # order to insert the class predictions and weights into the corpus
    # instances. That means there is no advantage to doing classification
    # one corpus a time.
    classify(setting)

    log.info("\n" + header("WEIGHT/MATCH/MERGE STEP"))

    # reset evaluator
    if setting.evaluate:
        setting.evaluator.__init__()

    # zip() returns a non-subscriptable iterator on Python 3, so materialise
    # it before slicing; on Python 2 this merely copies the list.
    scope = list(zip(setting.dev_inst_fns,
                     setting.dev_clas_fns,
                     setting.dev_true_fns))[:setting.n]

    for inst_fname, out_fname, true_fname in scope:
        log.info("reading corpus instances {0}".format(inst_fname))
        corpus_inst = CorpusInst()
        corpus_inst.loadtxt(inst_fname, setting.descriptor.dtype)

        if setting.weight:
            log.info("reading classifier output {0}".format(out_fname))
            # Close the classifier output file deterministically. The file
            # stays open until weighting is done, in case the parsed output
            # is consumed lazily by weight_corpus.
            with open(out_fname) as timbl_file:
                timbl_out = parse_timbl_output(timbl_file)
                log.info("weighting...")
                weight_corpus(corpus_inst, timbl_out, setting.weight_func)

        if setting.match:
            log.info("matching...")
            match_corpus(corpus_inst, setting.matcher)

        if setting.merge:
            log.info("reading true corpus {0}".format(true_fname))
            true_corpus = ParallelGraphCorpus(inf=true_fname,
                                              graph_loading=LOAD_NONE)
            log.info("merging...")
            pred_corpus = merge_corpus(corpus_inst, true_corpus,
                                       setting.merger)

        if setting.evaluate:
            # true_corpus and pred_corpus are bound only by the merge step
            # above, so evaluation requires setting.merge to be enabled too.
            name = os.path.basename(true_fname).split("_")[0]
            setting.evaluator.add(true_corpus, pred_corpus, name)

    if setting.evaluate:
        log.info("evaluting...")
        setting.evaluator.run_eval()
        log.info("saving evaluation {0}".format(setting.dev_eval_fname))
        makedirs(setting.eval_dir)
        setting.evaluator.write(setting.dev_eval_fname)

    exp_exit(setting)