def _create_mallet_file(self, corpus=True):
    if corpus:
        fnames = filename_generator(self.input_dataset.path, self.file_list)
    else:
        fnames = [f.strip() for f in open(self.file_list).readlines()]
        fnames = [f for f in fnames if f.strip()]
    fh = open_output_file(self.mallet_file, compress=False)
    self._set_features()
    if VERBOSE:
        print "[create_mallet_file] features: %s" % (self.features)
    features = dict([(f, True) for f in self.features])
    stats = {'labeled_count': 0, 'unlabeled_count': 0, 'total_count': 0}
    count = 0
    for fname in fnames:
        count += 1
        if VERBOSE:
            print "[create_mallet_file] %05d %s" % (count, fname)
        train.add_file_to_utraining_test_file(
            fname, fh, {}, features, stats,
            use_all_chunks_p=self.use_all_chunks_p)
    fh.close()
    if VERBOSE:
        print "[create_mallet_file]", stats
def _build_model(self):
    """Build the classifier model using the doc features files."""
    fnames = filename_generator(self.input_dataset.path, self.file_list)
    train.patent_utraining_data3(
        self.mallet_file, self.annotation_file, self.annotation_count,
        fnames, self.features, self.model, self.xval, VERBOSE,
        self.info_file_stats)
def collect_contexts(dataset_tags, dataset_feats, filelist):
    """Collect all contexts from the dataset and return them as a dictionary
    indexed on terms. Each value is a list of [year, id, context] triples."""
    tag_files = list(filename_generator(dataset_tags.path, filelist))
    feat_files = list(filename_generator(dataset_feats.path, filelist))
    contexts = {}
    for i in range(len(tag_files)):
        if verbose:
            print '[collect_contexts]', tag_files[i]
            print '[collect_contexts]', feat_files[i]
        fd = FileData(tag_files[i], feat_files[i])
        for term in fd.get_terms():
            term_obj = fd.get_term(term)
            for instance in term_obj.term_instances:
                contexts.setdefault(term, []).append(instance)
    return contexts
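# A minimal usage sketch for collect_contexts(), showing one way the returned
# dictionary could be consumed. The helper name print_context_counts and the
# idea of printing counts are illustrations only, not part of the pipeline
# above; the arguments are the same dataset objects and file list used there.

def print_context_counts(dataset_tags, dataset_feats, filelist):
    """Print each term with the number of contexts collected for it."""
    contexts = collect_contexts(dataset_tags, dataset_feats, filelist)
    for term in sorted(contexts.keys()):
        print "%d\t%s" % (len(contexts[term]), term)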
def annotate_something(dirname, rconfig, filelist, chunks):
    """This is a stub method that explains a bit more on how to create
    annotation files. Includes scaffolding that shows how to pull information
    out of phrase feature and tag files. This is for cases when you use a list
    of files."""

    # Here is how you get the datasets
    dataset_tags = find_input_dataset(rconfig, 'd2_tag')
    dataset_feats = find_input_dataset(rconfig, 'd3_phr_feats')

    # Check whether files from the file list are available
    check_file_availability(dataset_tags, filelist)
    check_file_availability(dataset_feats, filelist)

    # Next would typically be some way of writing down the information, the
    # following writes general information (command used, corpus directory as
    # well as git commit) and the list of files used. This also creates the
    # output directory.
    write_info(rconfig, dirname, filelist)

    # Now we can get the file names, loop over them, and extract the needed
    # information. The code below is some scaffolding if all you need is in
    # one dataset.
    fnames = filename_generator(dataset_feats.path, filelist)
    for fname in fnames:
        with open_input_file(fname) as fh:
            # extract data from the line, you may want to put it in some
            # temporary data structure
            for line in fh:
                pass

    # And this is what you do if you need information that is distributed
    # over the feature and tag files.
    tag_files = list(filename_generator(dataset_tags.path, filelist))
    feat_files = list(filename_generator(dataset_feats.path, filelist))
    for i in range(len(tag_files)):
        # the FileData object
        fd = FileData(tag_files[i], feat_files[i])
        # all term-related stuff lives in the Term object and its
        # term_instances variable, you can print to the annotation file(s)
        # from here or first build some intermediate data structure and then
        # print the output later
        for term in fd.get_terms():
            term_obj = fd.get_term(term)
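# A hedged sketch of the "build an intermediate data structure, then print"
# option mentioned in the scaffolding above: count instances per term and
# write one tab-separated line per term to an annotation file. The helper
# name and the output file name are made up for illustration; FileData and
# term_obj.term_instances are the same objects used in collect_contexts().

def _write_term_annotation_file(dirname, tag_files, feat_files):
    terms = {}
    for i in range(len(tag_files)):
        fd = FileData(tag_files[i], feat_files[i])
        for term in fd.get_terms():
            term_obj = fd.get_term(term)
            terms[term] = terms.get(term, 0) + len(term_obj.term_instances)
    outfile = os.path.join(dirname, 'annotate.terms.unlab.txt')
    with codecs.open(outfile, 'w', encoding='utf-8') as out_fh:
        for term in sorted(terms.keys()):
            out_fh.write("%s\t%d\n" % (term, terms[term]))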
def _create_mallet_file(self):
    self._load_phrase_labels()
    mconfig = mallet.MalletConfig(
        self.model, 'train', 'classify', '0', self.model, '/tmp',
        classifier_type="MaxEnt", number_xval=0, training_portion=0,
        prune_p=False, infogain_pruning="5000", count_pruning="3")
    mtr = mallet.MalletTraining(mconfig)
    fnames = filename_generator(self.input_dataset.path, self.file_list)
    mtr.make_utraining_file3(fnames, self.d_phr2label, verbose=VERBOSE)
    self._create_info_stats_file(mtr.stats_labeled_count,
                                 mtr.stats_unlabeled_count,
                                 mtr.stats_terms,
                                 mtr.stats_terms_y,
                                 mtr.stats_terms_n)
def run(self):
    self.time = time.time()
    self._find_datasets()
    self._create_info_files()
    fnames = filename_generator(self.input_dataset.path, self.file_list)
    count = 0
    fh = codecs.open(self.locations_file, 'w', encoding='utf-8')
    for fname in fnames:
        count += 1
        #if count > 5: break
        print_file_progress("Collector", count, fname, VERBOSE)
        self._process_file(fname, fh)
    self._finish()
def annotate_inventions(dirname, rconfig, filelist, chunks):
    """Create a directory with annotation files in t0_annotation/<name>."""
    dataset_tags = find_input_dataset(rconfig, 'd2_tag')
    dataset_feats = find_input_dataset(rconfig, 'd3_phr_feats')
    check_file_availability(dataset_tags, filelist)
    check_file_availability(dataset_feats, filelist)
    write_info(rconfig, dirname, filelist)
    outfile = os.path.join(dirname, 'annotate.inventions.unlab.txt')
    output_fh = codecs.open(outfile, 'w', encoding='utf-8')
    tag_files = list(filename_generator(dataset_tags.path, filelist))
    feat_files = list(filename_generator(dataset_feats.path, filelist))
    # add the content of the general info file as a preface
    with open(os.path.join(dirname, 'annotate.info.general.txt')) as fh:
        for line in fh:
            output_fh.write("# %s\n" % line.rstrip())
        output_fh.write("#\n")
    for i in range(len(tag_files)):
        fd = FileData(tag_files[i], feat_files[i])
        _add_file_data_to_annotation_file(output_fh, fd)
def check_file_availability(dataset, filelist):
    """Check whether all files in filelist are available in dataset. If not,
    print a warning and exit. This method allows for the possibility that the
    file was compressed."""
    file_generator = filename_generator(dataset.path, filelist)
    total = 0
    not_in_dataset = 0
    for fname in file_generator:
        total += 1
        if not os.path.exists(fname) and not os.path.exists(fname + '.gz'):
            not_in_dataset += 1
    if not_in_dataset > 0:
        sys.exit("WARNING: %d/%d files in %s have not been processed yet\n  %s"
                 % (not_in_dataset, total, os.path.basename(filelist), dataset))
def collect_counts(dataset, filelist):
    """Return a dictionary that maps each term to the number of documents it
    appeared in. This assumes that the dataset is a d3_phr_feats dataset."""
    counts = {}
    fnames = filename_generator(dataset.path, filelist)
    for fname in fnames:
        if verbose:
            print '[collect_counts]', fname
        # TODO: this is dangerous because it makes assumptions about the
        # directory structure, something similar was the case in step2 for at
        # least the docfeats generation
        year = os.path.basename(os.path.dirname(fname))
        doc_id = os.path.basename(fname)
        with open_input_file(fname) as fh:
            docfeats = generate_doc_feats(fh, doc_id, year)
            for term in docfeats.keys():
                counts[term] = counts.get(term, 0) + 1
    return counts
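# A small usage sketch for collect_counts(): write the document frequencies to
# a tab-separated file, most frequent terms first. The helper name and the
# default output path are hypothetical and not part of the code above.

def write_term_counts(dataset, filelist, outfile='term_counts.txt'):
    counts = collect_counts(dataset, filelist)
    pairs = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    with codecs.open(outfile, 'w', encoding='utf-8') as fh:
        for term, count in pairs:
            fh.write("%s\t%d\n" % (term, count))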
def _create_mallet_file(self):
    print "[--classify] creating vector file - %s" \
        % os.path.basename(self.mallet_file)
    count = 0
    d_phr2label = train.load_phrase_labels3(self.label_file)
    fh = open_output_file(self.mallet_file, compress=False)
    stats = {'labeled_count': 0, 'unlabeled_count': 0, 'total_count': 0}
    fnames = filename_generator(self.input_dataset.path, self.file_list)
    for phr_feats_file in fnames:
        count += 1
        if VERBOSE:
            print "[--classify] %05d %s" % (count, phr_feats_file)
        train.add_file_to_utraining_test_file(
            phr_feats_file, fh, d_phr2label, self.d_features, stats,
            use_all_chunks_p=self.use_all_chunks_p)
    fh.close()
    print "[--classify]", stats
def run(self):
    self.time = time.time()
    if self.rconfig.corpus is not None:
        self._find_datasets()
        fnames = filename_generator(self.input_dataset.path, self.file_list)
    else:
        fnames = [f.strip() for f in open(self.file_list).readlines()]
    self._create_info_files()
    with codecs.open(self.results_file1, 'w', encoding='utf-8') as fh:
        print "[--matcher] applying patterns to files"
        count = 0
        for fname in fnames:
            count += 1
            print_file_progress("Matcher", count, fname, VERBOSE)
            # if count > 10: break
            self.run_matcher_on_file(fname, fh)
    self.create_summary()
    self.feature_statistics.write_to_file()
    self._finish()