def weight_files(inst_fns, out_fns, weight_func=entropy_weight,
                 descriptor=None, n=None, binary=False):
    """
    Weight corpus instance files

    For each pair of corpus instance file and Timbl output file, load the
    corpus instances, parse the Timbl output, apply weight_corpus() with the
    given weighting function, and save the corpus instances again.

    The two filename lists are paired positionally with zip(), so surplus
    entries in the longer list are ignored.

    @param inst_fns: list of corpus instance filenames

    @param out_fns: list of filenames containing Timbl output

    @keyword weight_func: weighting function

    @keyword descriptor: a Descriptor instance, required if corpus
    instances are loaded in text format (its dtype attribute is passed to
    CorpusInst.loadtxt)

    @keyword n: limit weighting to the first n file pairs

    @keyword binary: corpus instances in binary rather than text format
    """
    # zip() returns a non-subscriptable iterator on Python 3, so build a
    # list before slicing; slicing with n=None keeps all pairs.
    file_pairs = list(zip(inst_fns, out_fns))[:n]

    for inst_fname, out_fname in file_pairs:
        corpus_inst = CorpusInst()

        if binary:
            corpus_inst.loadbin(inst_fname)
        else:
            corpus_inst.loadtxt(inst_fname, descriptor.dtype)

        # Keep the file open until weighting is finished: the parsed output
        # may be read lazily from the file object. The with-block guarantees
        # the handle is closed afterwards, even when an error occurs.
        with open(out_fname) as timbl_file:
            timbl_out = parse_timbl_output(timbl_file)
            weight_corpus(corpus_inst, timbl_out, weight_func)

        log.info("saving weighted corpus instances to {0}".format(inst_fname))
        corpus_inst.save()
def match_files(inst_fns, matcher, descriptor=None, n=None, binary=False):
    """
    Match corpus instances files

    Each selected file is loaded into a fresh CorpusInst, passed through
    match_corpus() together with the matcher, and then saved again.

    @param inst_fns: list of corpus instance filenames

    @param matcher: a Matcher instance for matching source to target
    instances

    @keyword descriptor: a Descriptor instance, required if corpus
    instances are loaded in text format (its dtype attribute is passed to
    CorpusInst.loadtxt)

    @keyword n: limit matching to the first n files

    @keyword binary: corpus instances in binary rather than text format
    """
    # Slicing with n=None selects every file.
    selected = inst_fns[:n]

    for path in selected:
        instances = CorpusInst()

        # The descriptor is only consulted for the text format.
        if not binary:
            instances.loadtxt(path, descriptor.dtype)
        else:
            instances.loadbin(path)

        match_corpus(instances, matcher)

        log.info("saving matched corpus instances to {0}".format(path))
        instances.save()