def extract(setting):
    """
    Run the feature extraction step of the experiment.

    Produces instance files and true corpus files from the corpus parts
    for the development and/or validation phase, as enabled in the
    setting.

    @param setting: Setting instance specifying the experimental setting
    """
    if not setting.extract:
        return
    log.info("\n" + header("EXTRACT STEP"))
    makedirs(setting.inst_dir)
    makedirs(setting.true_dir)

    def _extract_parts(part_fns):
        # derive instance and true-corpus filenames from the part files,
        # then run the extractor over them
        inst_fns = setting.make_inst_fns(part_fns)
        true_fns = setting.make_true_fns(part_fns)
        extract_files(setting.extractor, setting.graph_selector, part_fns,
                      inst_fns, true_fns, binary=setting.binary)

    if setting.develop:
        _extract_parts(setting.dev_part_fns)
    if setting.validate:
        _extract_parts(setting.val_part_fns)
def evaluate(setting):
    """
    Run the evaluation step on development and/or validation data.

    Stores the resulting evaluation on the setting as dev_eval and/or
    val_eval.

    @param setting: Setting instance specifying the experimental setting
    """
    if not setting.evaluate:
        return
    log.info("\n" + header("EVALUATE STEP"))
    makedirs(setting.eval_dir)
    if setting.develop:
        setting.dev_eval = eval_files(setting.dev_true_fns,
                                      setting.dev_pred_fns,
                                      setting.dev_eval_fname,
                                      align_eval=setting.evaluator,
                                      n=setting.n)
    if setting.validate:
        setting.val_eval = eval_files(setting.val_true_fns,
                                      setting.val_pred_fns,
                                      setting.val_eval_fname,
                                      align_eval=setting.evaluator,
                                      n=setting.n)
def extract(setting):
    """
    Extract features from corpus files, producing instance files and true
    corpus files.

    @param setting: Setting instance specifying the experimental setting
    """
    if not setting.extract:
        return
    log.info("\n" + header("EXTRACT STEP"))
    makedirs(setting.inst_dir)
    makedirs(setting.true_dir)
    if setting.develop:
        # development phase: instance/true filenames derive from the
        # development part files
        dev_inst = setting.make_inst_fns(setting.dev_part_fns)
        dev_true = setting.make_true_fns(setting.dev_part_fns)
        extract_files(setting.extractor, setting.graph_selector,
                      setting.dev_part_fns, dev_inst, dev_true,
                      binary=setting.binary)
    if setting.validate:
        # validation phase: same procedure over the validation part files
        val_inst = setting.make_inst_fns(setting.val_part_fns)
        val_true = setting.make_true_fns(setting.val_part_fns)
        extract_files(setting.extractor, setting.graph_selector,
                      setting.val_part_fns, val_inst, val_true,
                      binary=setting.binary)
def classify_file(train_inst_fns, test_inst_fns, out_fns=None, log_fn=None,
                  clas_dir=None, descriptor=None, timbl=None, options="",
                  log=False):
    """
    Classify instances using Timbl.

    @param train_inst_fns: list of instance filenames for training
    @param test_inst_fns: list of instance filenames for testing
    @keyword out_fns: list of classifier output files to be created
    @keyword log_fn: classifier log file to be created; ignored unless
        the log keyword is true
    @keyword clas_dir: directory for creating classifier output files;
        ignored if out_fns is given
    @keyword descriptor: Descriptor instance, required to infer the
        feature metrics for Timbl unless a TimblFile is supplied; ignored
        if timbl is supplied
    @keyword timbl: tailored TimblFile instance; it must at least set the
        verbosity options +vo, +vdb and +vdi, and the -m option so the
        administrative features are ignored
    @keyword options: additional Timbl options, excluding -f, -m, +vo,
        +vdb, +vdi
    @keyword log: log Timbl's standard output and error streams to file
    @return: list of Timbl output filenames
    """
    if clas_dir:
        makedirs(clas_dir)
    if timbl:
        # a ready-made TimblFile was supplied; descriptor is ignored, but
        # the instance must carry the required options
        assert isinstance(timbl, TimblFile)
        for required in ("+vo", "+vdb", "+vdi", "-m"):
            assert required in timbl.default_opts
    else:
        assert descriptor
        timbl = TimblFile(default_opts=timbl_options_string(descriptor))
    return timbl.train_test_multi(train_inst_fns, test_inst_fns,
                                  out_fns=out_fns, log_fn=log_fn,
                                  options=options, log=log,
                                  out_dir=clas_dir)
def merge(setting):
    """
    Run the merge step, producing predicted corpora from weighted and
    matched instances.

    @param setting: Setting instance specifying the experimental setting
    """
    if not setting.merge:
        return
    log.info("\n" + header("MERGE STEP"))
    makedirs(setting.pred_dir)
    if setting.develop:
        dev_pred_fns = setting.make_pred_fns(setting.dev_true_fns)
        merge_files(setting.dev_inst_fns, setting.dev_true_fns,
                    dev_pred_fns, merger=setting.merger,
                    descriptor=setting.descriptor, n=setting.n,
                    binary=setting.binary)
    if setting.validate:
        val_pred_fns = setting.make_pred_fns(setting.val_true_fns)
        merge_files(setting.val_inst_fns, setting.val_true_fns,
                    val_pred_fns, merger=setting.merger,
                    descriptor=setting.descriptor, n=setting.n,
                    binary=setting.binary)
def create_part_files(parts, base_dir=None, part_dir=None, max_size=None):
    """
    Create the parallel graph corpora constituting the parts for training
    and testing.

    @param parts: a dictionary where each key specifies the filename for
        the part and each value a sequence of parallel graph corpora
        filenames merged into the part
    @keyword base_dir: filename paths of the original corpus files must
        be relative to base_dir; defaults to the DAESO_CORPUS environment
        variable
    @keyword part_dir: the destination directory for the parts, which
        will be created if it does not exist; defaults to the current
        working directory
    @keyword max_size: limits the maximal number of corpus files per
        part, which is sometimes useful for try-out experiments with only
        a small number of corpus files
    @return: a list of part filenames created
    @raise ValueError: if base_dir is not given and the DAESO_CORPUS
        environment variable is not set

    Note that the created parts cannot be moved, because they depend on
    the graph bank files of the original pgc files from which they were
    derived.
    """
    if base_dir is None:
        # Look up the environment variable at call time rather than in
        # the default argument (which is frozen at import time), and fail
        # early with a clear message instead of the confusing
        # os.path.join(None, ...) TypeError that would occur below.
        base_dir = os.getenv("DAESO_CORPUS")
        if base_dir is None:
            raise ValueError(
                "no base_dir given and DAESO_CORPUS environment "
                "variable is not set")
    if not part_dir:
        part_dir = os.getcwd()
    else:
        makedirs(part_dir)
    part_fnames = []
    # sort for a deterministic creation order
    for part_name, corpus_fnames in sorted(parts.items()):
        part_fname = os.path.join(part_dir, part_name)
        corpus_fnames = [os.path.join(base_dir, fname)
                         for fname in corpus_fnames]
        corpus = join_pgc(corpus_fnames[:max_size])
        # graphbank file paths by default become relative to the new pgc
        # file
        log.info("saving part file {0}".format(part_fname))
        corpus.write(part_fname, pprint=True)
        part_fnames.append(part_fname)
    return part_fnames
def sample(setting):
    """
    Down-sample the training instance files.

    @param setting: Setting instance specifying the experimental setting
    """
    if not setting.sample:
        return
    log.info("\n" + header("SAMPLE STEP"))
    makedirs(setting.samp_dir)
    if setting.develop:
        sample_file(setting.class_fracts, setting.dev_inst_fns,
                    setting.make_samp_fns(setting.dev_inst_fns))
    if setting.validate:
        sample_file(setting.class_fracts, setting.val_inst_fns,
                    setting.make_samp_fns(setting.val_inst_fns))
def eval_corpora(true_corpora, pred_corpora, names, eval_fname,
                 align_eval=None, n=None):
    """
    Evaluate predicted against true parallel graph corpora.

    @param true_corpora: iterable of true corpora
    @param pred_corpora: iterable of predicted corpora
    @param names: iterable of labels for true/predicted pairs
    @param eval_fname: name of file to which evaluation output is written
    @keyword align_eval: AlignEval instance to use
    @keyword n: limit evaluation to the first n corpora
    @return: the AlignEval instance holding the results
    """
    if not align_eval:
        align_eval = AlignEval()
    else:
        assert isinstance(align_eval, AlignEval)
        # reset the evaluator to prevent accidental accumulation of
        # results from an earlier run
        align_eval.__init__()
    pairs = itertools.izip(true_corpora, pred_corpora, names)
    for count, (true_corpus, pred_corpus, name) in enumerate(pairs, 1):
        align_eval.add(true_corpus, pred_corpus, name)
        if count == n:
            break
    align_eval.run_eval()
    log.info("saving evaluation report {0}".format(eval_fname))
    makedirs(os.path.dirname(eval_fname))
    align_eval.write(eval_fname)
    return align_eval
def classify(setting):
    """
    Classify corpus instances.

    @param setting: Setting instance specifying the experimental setting
    """
    if not setting.classify:
        return
    log.info("\n" + header("CLASSIFY STEP"))
    makedirs(setting.clas_dir)
    # optionally train on the down-sampled instance files
    train_fns = (setting.dev_samp_fns if setting.train_sample
                 else setting.dev_inst_fns)
    if setting.develop:
        # cross-validation over the development parts
        classify_file_cv(train_fns,
                         test_inst_fns=setting.dev_inst_fns,
                         out_fns=setting.make_out_fns(setting.dev_inst_fns),
                         log_fns=setting.make_log_fns(setting.dev_inst_fns),
                         descriptor=setting.descriptor,
                         timbl=setting.classifier,
                         options=setting.timbl_opts,
                         n=setting.n,
                         log=setting.timbl_log)
    if setting.validate:
        # single train/test run on the validation parts
        classify_file(train_fns, setting.val_inst_fns,
                      out_fns=setting.make_out_fns(setting.val_inst_fns),
                      log_fn=setting.make_log_fname(setting.val_inst_fns[0]),
                      descriptor=setting.descriptor,
                      timbl=setting.classifier,
                      options=setting.timbl_opts,
                      log=setting.timbl_log)
def classify(setting):
    """
    Classify corpus instances.

    @param setting: Setting instance specifying the experimental setting
    """
    if not setting.classify:
        return
    log.info("\n" + header("CLASSIFY STEP"))
    makedirs(setting.clas_dir)
    if setting.train_sample:
        # train on the down-sampled instance files
        training_fns = setting.dev_samp_fns
    else:
        training_fns = setting.dev_inst_fns
    if setting.develop:
        dev_fns = setting.dev_inst_fns
        classify_file_cv(training_fns,
                         test_inst_fns=dev_fns,
                         out_fns=setting.make_out_fns(dev_fns),
                         log_fns=setting.make_log_fns(dev_fns),
                         descriptor=setting.descriptor,
                         timbl=setting.classifier,
                         options=setting.timbl_opts,
                         n=setting.n,
                         log=setting.timbl_log)
    if setting.validate:
        val_fns = setting.val_inst_fns
        classify_file(training_fns, val_fns,
                      out_fns=setting.make_out_fns(val_fns),
                      log_fn=setting.make_log_fname(val_fns[0]),
                      descriptor=setting.descriptor,
                      timbl=setting.classifier,
                      options=setting.timbl_opts,
                      log=setting.timbl_log)
def exp_dev_fast(setting):
    """
    perform a fast alignment experiment on development data

    Weighting, matching and merging takes place per test corpus without
    writing intermediary results to a file.

    @param setting: Setting instance specifying the experimental setting
    """
    # this fast path only supports the development phase
    assert setting.develop and not setting.validate
    exp_init(setting)
    create_parts(setting)
    # It's impossible to do extraction one corpus a time, because in order to
    # classify a test corpus you need instances for all the other training
    # corpora! Moreover, since Timbl classification is file-based, we need to
    # write the corpus instance files to disk. These files can be huge and
    # keeping all of them in memory seems to offer little benefit.
    extract(setting)
    sample(setting)
    # Timbl writes its output to a file, which then needs to be parsed in
    # order to insert the class predictions and weights into the corpus
    # instances. That means there is no advantage to doing classification
    # one corpus a time.
    classify(setting)
    log.info("\n" + header("WEIGHT/MATCH/MERGE STEP"))
    # reset evaluator
    if setting.evaluate:
        setting.evaluator.__init__()
    # a setting.n of None means "no limit" (full slice)
    scope = zip(setting.dev_inst_fns,
                setting.dev_clas_fns,
                setting.dev_true_fns)[: setting.n]
    for inst_fname, out_fname, true_fname in scope:
        log.info("reading corpus instances {0}".format(inst_fname))
        corpus_inst = CorpusInst()
        corpus_inst.loadtxt(inst_fname, setting.descriptor.dtype)
        if setting.weight:
            log.info("reading classifier output {0}".format(out_fname))
            # NOTE(review): this file handle is never explicitly closed
            timbl_out = parse_timbl_output(open(out_fname))
            log.info("weighting...")
            weight_corpus(corpus_inst, timbl_out, setting.weight_func)
        if setting.match:
            log.info("matching...")
            match_corpus(corpus_inst, setting.matcher)
        if setting.merge:
            log.info("reading true corpus {0}".format(true_fname))
            true_corpus = ParallelGraphCorpus(inf=true_fname,
                                              graph_loading=LOAD_NONE)
            log.info("merging...")
            pred_corpus = merge_corpus(corpus_inst, true_corpus,
                                       setting.merger)
            # evaluation needs pred_corpus, hence nested under merge
            if setting.evaluate:
                # presumably the part label is the filename prefix up to
                # the first underscore — TODO confirm naming convention
                name = os.path.basename(true_fname).split("_")[0]
                setting.evaluator.add(true_corpus, pred_corpus, name)
    if setting.evaluate:
        log.info("evaluting...")
        setting.evaluator.run_eval()
        log.info("saving evaluation {0}".format(setting.dev_eval_fname))
        makedirs(setting.eval_dir)
        setting.evaluator.write(setting.dev_eval_fname)
    exp_exit(setting)
def classify_file_cv(inst_fns, test_inst_fns=None, out_fns=None,
                     log_fns=None, clas_dir=None, descriptor=None,
                     timbl=None, options="", n=None, log=False):
    """
    Classify instances using Timbl in a cross-validation procedure.

    @param inst_fns: list of instance filenames for training; if no
        test_inst_fns is supplied, the same files are also used for
        testing, otherwise they are used for training only
    @keyword test_inst_fns: list of instance filenames for testing; this
        allows for down-sampling of the training instances without
        affecting the test instances
    @keyword out_fns: list of classifier output files to be created
    @keyword log_fns: list of classifier log files to be created; ignored
        unless the log keyword is true
    @keyword clas_dir: directory for creating classifier output files;
        ignored if out_fns is given
    @keyword descriptor: Descriptor instance, required to infer the
        feature metrics for Timbl unless a TimblFile is supplied; ignored
        if timbl is supplied
    @keyword timbl: tailored TimblFile instance; it must at least set the
        verbosity options +vo, +vdb and +vdi, and the -m option so the
        administrative features are ignored
    @keyword options: additional Timbl options, excluding -f, -m, +vo,
        +vdb, +vdi
    @keyword n: limit on the number of cross-validations performed (by
        default equals the number of instance filenames)
    @keyword log: log Timbl's standard output and error streams to file
    @return: list of Timbl output filenames
    """
    if clas_dir:
        makedirs(clas_dir)
    if timbl:
        # a ready-made TimblFile was supplied; descriptor is ignored, but
        # the instance must carry the required options
        assert isinstance(timbl, TimblFile)
        for required in ("+vo", "+vdb", "+vdi", "-m"):
            assert required in timbl.default_opts
    else:
        assert descriptor
        timbl = TimblFile(default_opts=timbl_options_string(descriptor))
    return timbl.cross_validate(inst_fns, test_inst_fns=test_inst_fns,
                                out_fns=out_fns, log_fns=log_fns,
                                options=options, n=n, log=log,
                                out_dir=clas_dir)
def pickle(setting):
    """
    Save a pickled copy of the experimental setting to file.

    NOTE: this function shadows the stdlib module name "pickle"; the name
    is kept for backward compatibility with existing callers.

    @param setting: Setting instance specifying the experimental setting
    """
    if setting.pickle:
        makedirs(setting.pickle_dir)
        log.info("saving pickled setting {0}".format(setting.pickle_fname))
        # use a context manager so the file is flushed and closed; the
        # original left the handle open, risking a truncated pickle
        with open(setting.pickle_fname, "wb") as pkl_file:
            dump(setting, pkl_file)
def exp_dev_fast(setting):
    """
    perform a fast alignment experiment on development data

    Weighting, matching and merging takes place per test corpus without
    writing intermediary results to a file.

    @param setting: Setting instance specifying the experimental setting
    """
    assert setting.develop and not setting.validate
    exp_init(setting)
    create_parts(setting)
    # Extraction cannot be done one corpus at a time: classifying a test
    # corpus requires instances for all the other training corpora, and
    # since Timbl classification is file-based the (potentially huge)
    # instance files must be written to disk anyway.
    extract(setting)
    sample(setting)
    # Timbl output is also file-based and must be parsed to insert class
    # predictions and weights into the corpus instances, so there is no
    # advantage to classifying one corpus at a time either.
    classify(setting)
    log.info("\n" + header("WEIGHT/MATCH/MERGE STEP"))
    if setting.evaluate:
        # start from a clean evaluator
        setting.evaluator.__init__()
    triples = list(zip(setting.dev_inst_fns,
                       setting.dev_clas_fns,
                       setting.dev_true_fns))[:setting.n]
    for inst_fn, clas_fn, true_fn in triples:
        log.info("reading corpus instances {0}".format(inst_fn))
        instances = CorpusInst()
        instances.loadtxt(inst_fn, setting.descriptor.dtype)
        if setting.weight:
            log.info("reading classifier output {0}".format(clas_fn))
            timbl_out = parse_timbl_output(open(clas_fn))
            log.info("weighting...")
            weight_corpus(instances, timbl_out, setting.weight_func)
        if setting.match:
            log.info("matching...")
            match_corpus(instances, setting.matcher)
        if not setting.merge:
            continue
        log.info("reading true corpus {0}".format(true_fn))
        true_corpus = ParallelGraphCorpus(inf=true_fn,
                                          graph_loading=LOAD_NONE)
        log.info("merging...")
        pred_corpus = merge_corpus(instances, true_corpus, setting.merger)
        if setting.evaluate:
            name = os.path.basename(true_fn).split("_")[0]
            setting.evaluator.add(true_corpus, pred_corpus, name)
    if setting.evaluate:
        log.info("evaluting...")
        setting.evaluator.run_eval()
        log.info("saving evaluation {0}".format(setting.dev_eval_fname))
        makedirs(setting.eval_dir)
        setting.evaluator.write(setting.dev_eval_fname)
    exp_exit(setting)