def run_utest(target_path, language, version, limit, classifier='MaxEnt',
              use_all_chunks_p=True):
    """Run the classifier on n=limit documents. Batch version of the function
    train.patent_utraining_test_data(). Appends results to
    test/utest.1.MaxEnt.out and keeps intermediate results for this invocation
    in test/utest.1.mallet.START-END (raw feature vectors) and
    test/utest.1.MaxEnt.out.BEGIN_END, where BEGIN and END are taken from
    ALL_STAGES.txt and the limit parameter."""

    # get dictionary of annotations and keep label stats (total_count ==
    # unlabeled_count if use_all_chunks_p is False, otherwise total_count ==
    # unlabeled_count + labeled_count)
    d_phr2label = train.load_phrase_labels(target_path, language)
    stats = {'labeled_count': 0, 'unlabeled_count': 0, 'total_count': 0}

    stages = read_stages(target_path, language)
    fnames = files_to_process(target_path, language, stages, '--utest', limit)
    (train_dir, test_dir, mallet_file, results_file, all_results_file) = \
        _classifier_io(target_path, language, version, classifier, stages)
    print "[--utest] vector file - %s" % mallet_file
    print "[--utest] results file - %s" % results_file

    count = 0
    fh = codecs.open(mallet_file, "a", encoding='utf-8')
    for (year, fname) in fnames:
        count += 1
        doc_feats_file = os.path.join(target_path, language, 'doc_feats',
                                      year, fname)
        if verbose:
            print "%05d %s" % (count, doc_feats_file)
        train.add_file_to_utraining_test_file(
            doc_feats_file, fh, d_phr2label, stats,
            use_all_chunks_p=use_all_chunks_p)
    fh.close()

    _run_classifier(train_dir, test_dir, version, classifier, mallet_file,
                    results_file)
    #_append_classifier_results(results_file, all_results_file)
    update_stages(target_path, language, '--utest', limit)
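# Example invocation (the path and values below are hypothetical): classify
# the next 500 documents with the version-1 MaxEnt model. Per the docstring,
# raw vectors land in test/utest.1.mallet.START-END and classifier output in
# test/utest.1.MaxEnt.out.BEGIN_END.
#
#   run_utest('/data/patents/processed', 'en', '1', 500)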
def find_technologies_batch(patterns, target_path, language, limit, verbose):
    """Similar to find_technologies() in that it finds pattern matches in a
    file and outputs results in a tab-separated file. The difference is that
    this version runs in batch mode and only processes those parts of the
    file that fall within the limit."""

    features_file = os.path.join(target_path, language, "ws", "phr_feats.all")
    matches_file = os.path.join(target_path, language, "ws", "matches.all")
    f1 = codecs.open(features_file, encoding='utf8')
    f2 = codecs.open(matches_file, 'a', encoding='utf8')

    stages = read_stages(target_path, language)
    begin = stages.get('--matcher', 0)
    end = begin + limit

    current_fname = None
    file_count = 0
    for phrase in f1:
        if file_count > end:
            break
        (fname_id, year, term, rest) = phrase.strip("\n").split("\t", 3)
        (current_fname, file_count) = _update_state(fname_id, current_fname,
                                                    file_count, begin, end,
                                                    verbose)
        if file_count > begin:
            for p in patterns:
                matched_part = _match_pattern(p, phrase)
                if matched_part is not False:
                    line = "%s\t%s\t%s\t%s\t%s\n" % (fname_id, year, p[0],
                                                     term, matched_part)
                    f2.write(line)

    update_stages(target_path, language, '--matcher', limit)
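# Hedged sketches of the two matcher helpers used above; the real
# _match_pattern() and _update_state() are defined elsewhere in this module.
# Assumptions: a pattern is a (name, compiled_regex) pair (which is why the
# caller writes p[0] for the pattern name), and fname_id changes exactly when
# a new document starts in phr_feats.all.

def _example_match_pattern(pattern, phrase_line):
    """Return the matched substring of phrase_line, or False if the
    pattern's regex does not apply."""
    name, regex = pattern
    match = regex.search(phrase_line)
    return match.group(0) if match else False

def _example_update_state(fname_id, current_fname, file_count,
                          begin, end, verbose):
    """Bump the file counter whenever the document identifier changes."""
    if fname_id != current_fname:
        file_count += 1
        if verbose and begin < file_count <= end:
            print "[--matcher] %05d %s" % (file_count, fname_id)
    return (fname_id, file_count)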
def run_tag2chk(target_path, language, limit, chunk_filter):
    """Runs the np-in-context code on tagged input. Populates language/phr_occ
    and language/phr_feats. Sets the contents of config-chunk-filter.txt
    given the value of chunk_filter."""

    print "[--tag2chk] on %s/%s/tag/" % (target_path, language)

    filter_setting = "on" if chunk_filter else "off"
    _save_config(target_path, language, 'chunk-filter', filter_setting)
    #fh = open(os.path.join(target_path, language, 'config-chunk-filter.txt'), 'w')
    #filter_setting = "on" if chunk_filter else "off"
    #fh.write("chunk-filter %s\n" % filter_setting)
    #fh.close()

    stages = read_stages(target_path, language)
    fnames = files_to_process(target_path, language, stages, '--tag2chk', limit)
    count = 0
    for (year, fname) in fnames:
        count += 1
        tag_file = os.path.join(target_path, language, 'tag', year, fname)
        occ_file = os.path.join(target_path, language, 'phr_occ', year, fname)
        fea_file = os.path.join(target_path, language, 'phr_feats', year, fname)
        if verbose:
            print "[--tag2chk] %04d adding %s" % (count, occ_file)
        tag2chunk.Doc(tag_file, occ_file, fea_file, year, language,
                      filter_p=chunk_filter)

    update_stages(target_path, language, '--tag2chk', limit)
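# Hedged sketch of the _save_config() helper used above; the real one is
# defined elsewhere in this module. The commented-out block in run_tag2chk()
# suggests it writes a line like "chunk-filter on" to
# <target>/<lang>/config-chunk-filter.txt.

def _example_save_config(target_path, language, name, value):
    fh = open(os.path.join(target_path, language, 'config-%s.txt' % name), 'w')
    fh.write("%s %s\n" % (name, value))
    fh.close()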
def run_txt2tag(target_path, language, limit):
    """Takes txt files and runs the tagger (and segmenter for Chinese) on
    them. Adds files to the language/tag and language/seg directories. Works
    on pasiphae but not on chalciope."""

    print "[--txt2tag] on %s/%s/txt/" % (target_path, language)

    stages = read_stages(target_path, language)
    tagger = txt2tag.get_tagger(language)
    segmenter = sdp.Segmenter()
    fnames = files_to_process(target_path, language, stages, '--txt2tag', limit)

    count = 0
    for year, fname in fnames:
        count += 1
        txt_file = os.path.join(target_path, language, 'txt', year, fname)
        seg_file = os.path.join(target_path, language, 'seg', year, fname)
        tag_file = os.path.join(target_path, language, 'tag', year, fname)
        if language == 'cn':
            if verbose:
                print "[--txt2tag] %04d creating %s" % (count, seg_file)
            cn_txt2seg.seg(txt_file, seg_file, segmenter)
            if verbose:
                print "[--txt2tag] %04d creating %s" % (count, tag_file)
            cn_seg2tag.tag(seg_file, tag_file, tagger)
        else:
            if verbose:
                print "[--txt2tag] %04d creating %s" % (count, tag_file)
            txt2tag.tag(txt_file, tag_file, tagger)

    update_stages(target_path, language, '--txt2tag', limit)
def run_xml2txt(target_path, language, limit):
    """Takes xml files and runs the document structure parser in onto mode.
    Adds files to the language/txt directory and ds_* directories with
    intermediate document structure parser results."""

    print "[--xml2txt] on %s/%s/xml/" % (target_path, language)

    stages = read_stages(target_path, language)
    xml_parser = Parser()
    xml_parser.onto_mode = True
    mappings = {'en': 'ENGLISH', 'de': 'GERMAN', 'cn': 'CHINESE'}
    xml_parser.language = mappings[language]

    fnames = files_to_process(target_path, language, stages, '--xml2txt', limit)
    workspace = os.path.join(target_path, language, 'ws')
    count = 0
    for year, fname in fnames:
        count += 1
        source_file = os.path.join(target_path, language, 'xml', year, fname)
        target_file = os.path.join(target_path, language, 'txt', year, fname)
        if verbose:
            print "[--xml2txt] %04d creating %s" % (count, target_file)
        try:
            xml2txt.xml2txt(xml_parser, source_file, target_file, workspace)
        except Exception:
            # leave an empty txt file so downstream stages do not stumble
            # over a missing file
            fh = codecs.open(target_file, 'w')
            fh.close()
            print "[--xml2txt] WARNING: error on", source_file
def run_utrain(target_path, language, version, xval, limit):
    """Creates a mallet training file for labeled data, with features as the
    union of all phrase instances within a doc. Also creates a model
    utrain.<version>.MaxEnt.model in the train subdirectory. Limit is used to
    determine the size of the training set; as with run_annotate, it is not
    used for incrementing values in ALL_STAGES.txt."""

    stages = read_stages(target_path, language)
    fnames = files_to_process(target_path, language, stages, '--utrain', limit)

    annot_path = config_data.annotation_directory
    source_annot_lang_file = os.path.join(annot_path, language, 'phr_occ.lab')
    target_annot_lang_file = os.path.join(target_path, language, 'ws',
                                          'phr_occ.lab')
    shutil.copyfile(source_annot_lang_file, target_annot_lang_file)

    #train.patent_utraining_data(target_path, language, version, xval, limit)
    train.patent_utraining_data2(target_path, language, fnames, version, xval)

    update_stages(target_path, language, '--utrain', limit)
def run_pf2dfeats(target_path, language, limit):
    """Creates a union of the features for each chunk in a doc (for
    training)."""

    print "[--pf2dfeats] on %s/%s/phr_feats/" % (target_path, language)

    stages = read_stages(target_path, language)
    fnames = files_to_process(target_path, language, stages, '--pf2dfeats', limit)
    count = 0
    for (year, fname) in fnames:
        count += 1
        doc_id = os.path.splitext(os.path.basename(fname))[0]
        phr_file = os.path.join(target_path, language, 'phr_feats', year, fname)
        doc_file = os.path.join(target_path, language, 'doc_feats', year, fname)
        if verbose:
            print "[--pf2dfeats] %04d adding %s" % (count, doc_file)
        pf2dfeats.make_doc_feats(phr_file, doc_file, doc_id, year)

    update_stages(target_path, language, '--pf2dfeats', limit)
def run_populate(source_path, target_path, language, limit):
    """Populate xml directory in the target directory with limit files from
    the source path."""

    print "[--populate] populating %s/%s/xml" % (target_path, language)
    print "[--populate] using %d files from %s" % (limit, source_path)

    stages = read_stages(target_path, language)
    fnames = files_to_process(target_path, language, stages, '--populate', limit)
    count = 0
    for (year, fname) in fnames:
        count += 1
        source_file = os.path.join(source_path, year, fname)
        target_file = os.path.join(target_path, language, 'xml', year, fname)
        if verbose:
            print "[--populate] %04d adding %s" % (count, target_file)
        shutil.copyfile(source_file, target_file)

    update_stages(target_path, language, '--populate', limit)
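# Hedged sketch of the ALL_STAGES.txt bookkeeping that read_stages() and
# update_stages() are assumed to implement; the real versions (and
# files_to_process(), which is assumed to hand out the next limit
# (year, fname) pairs past the stage's counter) are defined elsewhere.
# Assumption: ALL_STAGES.txt holds one "stage count" pair per line, mapping
# each stage flag to the number of files already processed.

def _example_read_stages(target_path, language):
    stages = {}
    stages_file = os.path.join(target_path, language, 'ALL_STAGES.txt')
    if os.path.exists(stages_file):
        for line in open(stages_file):
            stage, count = line.split()
            stages[stage] = int(count)
    return stages

def _example_update_stages(target_path, language, stage, limit):
    stages = _example_read_stages(target_path, language)
    stages[stage] = stages.get(stage, 0) + limit
    fh = open(os.path.join(target_path, language, 'ALL_STAGES.txt'), 'w')
    for stage, count in sorted(stages.items()):
        fh.write("%s %d\n" % (stage, count))
    fh.close()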
def run_summary(target_path, language, limit):
    """Collect data from directories into workspace area: ws/doc_feats.all,
    ws/phr_feats.all and ws/phr_occ.all. All downstream processing should
    rely on these data and nothing else."""

    print "[--summary] appending to files in ws"
    #subprocess.call("sh ./cat_phr.sh %s %s" % (target_path, language), shell=True)

    stages = read_stages(target_path, language)
    fnames = files_to_process(target_path, language, stages, '--summary', limit)

    doc_feats_file = os.path.join(target_path, language, 'ws', 'doc_feats.all')
    phr_feats_file = os.path.join(target_path, language, 'ws', 'phr_feats.all')
    phr_occ_file = os.path.join(target_path, language, 'ws', 'phr_occ.all')
    fh_doc_feats = codecs.open(doc_feats_file, 'a', encoding='utf-8')
    fh_phr_feats = codecs.open(phr_feats_file, 'a', encoding='utf-8')
    fh_phr_occ = codecs.open(phr_occ_file, 'a', encoding='utf-8')

    for (year, fname) in fnames:
        doc_feats_file = os.path.join(target_path, language, 'doc_feats', year, fname)
        phr_feats_file = os.path.join(target_path, language, 'phr_feats', year, fname)
        phr_occ_file = os.path.join(target_path, language, 'phr_occ', year, fname)
        fh_doc_feats.write(codecs.open(doc_feats_file, encoding='utf-8').read())
        fh_phr_feats.write(codecs.open(phr_feats_file, encoding='utf-8').read())
        fh_phr_occ.write(codecs.open(phr_occ_file, encoding='utf-8').read())

    update_stages(target_path, language, '--summary', limit)
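# Hypothetical end-to-end run over the next 500 documents, chaining the
# stages above in their pipeline order. The paths, the language code, the
# version string and the xval value are all made-up example values.

def _example_pipeline():
    source, target = '/data/patents/xml', '/data/patents/processed'
    run_populate(source, target, 'en', 500)
    run_xml2txt(target, 'en', 500)
    run_txt2tag(target, 'en', 500)
    run_tag2chk(target, 'en', 500, chunk_filter=True)
    run_pf2dfeats(target, 'en', 500)
    run_summary(target, 'en', 500)
    run_utrain(target, 'en', '1', 5, 500)   # 5 as the xval value is a guess
    run_utest(target, 'en', '1', 500)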