def annotate_technologies(dirname, rconfig, filelist, sort_terms_p, print_context_p):
    """Create input for manual annotation (that is, for creation of a labeled
    list of terms). Given a runtime configuration for a corpus and a file
    list, creates three files: (1) a list of unlabeled terms, ordered on
    frequency, (2) a list of ordered terms with frequency information, and
    (3) an ordered list of terms with contexts. For contexts, a maximum of 10
    is printed, selected randomly."""
    print "finding tags..."
    dataset_tags = find_input_dataset(rconfig, 'd2_tag')
    print "finding features..."
    dataset_features = find_input_dataset(rconfig, 'd3_phr_feats')
    if verbose:
        print "\nFILELIST:", filelist
        print "DATASET TAGS: ", dataset_tags
        print "DATASET FEATS:", dataset_features, "\n"
    check_file_availability(dataset_tags, filelist)
    check_file_availability(dataset_features, filelist)
    write_info(rconfig, dirname, filelist)
    term_contexts = {}
    if print_context_p:
        term_contexts = collect_contexts(dataset_tags, dataset_features, filelist)
    term_counts = collect_counts(dataset_features, filelist)
    # sort alphabetically first, then on frequency; since Python's sort is
    # stable this yields a list ordered on descending frequency with
    # alphabetical tie-breaking
    term_count_list = sorted(term_counts.items())
    term_count_list.sort(lambda x, y: cmp(y[1], x[1]))
    print_annotation_files(dirname, term_count_list, term_contexts,
                           sort_terms_p, print_context_p)
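# The cap of 10 randomly selected contexts mentioned in the docstring above
# is handled inside collect_contexts and print_annotation_files; the helper
# below is a minimal, hypothetical sketch of how such a cap could work. It
# is not part of the codebase.

import random

def _sample_contexts(contexts, max_contexts=10):
    """Return at most max_contexts elements of contexts, selected randomly
    while preserving their original order (sketch only)."""
    if len(contexts) <= max_contexts:
        return contexts
    selected = set(random.sample(range(len(contexts)), max_contexts))
    return [c for i, c in enumerate(contexts) if i in selected]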
def _find_datasets(self):
    """Select data sets and check whether all files are available."""
    # TODO: this is the same as the method on TrainerClassifier
    print "[Collector] finding dataset and checking file availability"
    self.input_dataset = find_input_dataset(self.rconfig, 'd3_phr_feats')
    print "[Collector]", self.input_dataset
    check_file_availability(self.input_dataset, self.file_list)
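# One way to discharge the TODO above: both Collector and TrainerClassifier
# could delegate to a shared module-level helper along the lines of the
# hypothetical sketch below (not part of the codebase).

def _find_and_check_dataset(rconfig, dataset_name, file_list, owner=None):
    """Find the input dataset, check file availability and return the
    dataset (sketch only)."""
    if owner is not None:
        print "[%s] finding dataset and checking file availability" % owner
    dataset = find_input_dataset(rconfig, dataset_name)
    check_file_availability(dataset, file_list)
    return dataset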
def annotate_something(dirname, rconfig, filelist, chunks):
    """This is a stub method that explains a bit more about how to create
    annotation files. Includes scaffolding that shows how to pull information
    out of phrase feature and tag files. This is for cases when you use a
    list of files."""
    # Here is how you get the datasets
    dataset_tags = find_input_dataset(rconfig, 'd2_tag')
    dataset_feats = find_input_dataset(rconfig, 'd3_phr_feats')
    # Check whether files from the file list are available
    check_file_availability(dataset_tags, filelist)
    check_file_availability(dataset_feats, filelist)
    # Next would typically be some way of writing down the information. The
    # following writes general information (command used, corpus directory
    # and git commit) and the list of files used. It also creates the output
    # directory.
    write_info(rconfig, dirname, filelist)
    # Now we can get the file names, loop over them, and extract the needed
    # information. The code below is some scaffolding for the case where all
    # you need is in one dataset.
    fnames = filename_generator(dataset_feats.path, filelist)
    for fname in fnames:
        with open_input_file(fname) as fh:
            # extract data from each line; you may want to put it in some
            # temporary data structure (see the sketch after this function)
            for line in fh:
                pass
    # And this is what you do if you need information that is distributed
    # over the feature and tag files.
    tag_files = list(filename_generator(dataset_tags.path, filelist))
    feat_files = list(filename_generator(dataset_feats.path, filelist))
    for tag_file, feat_file in zip(tag_files, feat_files):
        # the FileData object gives access to both files
        fd = FileData(tag_file, feat_file)
        # all term-related stuff lives in the Term object and its
        # term_instances variable; you can print to the annotation file(s)
        # from here or first build some intermediate data structure and
        # print the output later
        for term in fd.get_terms():
            term_obj = fd.get_term(term)
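# A minimal sketch that fills the "temporary data structure" slot in the
# scaffolding above with a simple term frequency count. The assumption that
# the term lives in a fixed tab-separated column of the d3_phr_feats lines
# is hypothetical; adjust term_column to the actual file format.

def _count_terms_in_file(fname, term_counts, term_column=2):
    """Increment term_counts for every term found in fname (sketch only;
    term_column is a hypothetical position of the term field)."""
    with open_input_file(fname) as fh:
        for line in fh:
            fields = line.rstrip("\n").split("\t")
            if len(fields) > term_column:
                term = fields[term_column]
                term_counts[term] = term_counts.get(term, 0) + 1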
def annotate_terms(dirname, rconfig, instances_file):
    """Create an annotation file for term instances."""
    # Get the datasets
    dataset_tags = find_input_dataset(rconfig, 'd2_tag')
    dataset_feats = find_input_dataset(rconfig, 'd3_phr_feats')
    print dataset_feats.path
    print dataset_tags.path
    # Create the directory where the files will be written to; write info;
    # and open the output file, initializing it with info
    write_info(rconfig, dirname, instances_file)
    outfile = os.path.join(dirname, 'annotate.terms.context.txt')
    out = codecs.open(outfile, 'w', encoding='utf8')
    # add the content of the general info file as a preface
    with open(os.path.join(dirname, 'annotate.info.general.txt')) as fh:
        for line in fh:
            out.write("# %s\n" % line.rstrip())
        out.write("#\n")
    # time to get the terms
    terms = _read_terms(instances_file)
    _reduce_terms(terms)
    #_print_terms(terms)
    for term, locations in terms.items():
        count = 0
        for doc, lines in locations:
            #sys.stdout.write("%s %s %s" % (term, doc, lines))
            print term, doc, lines
            phr_file = os.path.join(dataset_feats.path, 'files', doc) + '.xml'
            tag_file = os.path.join(dataset_tags.path, 'files', doc) + '.xml'
            fd = FileData(tag_file, phr_file)
            term_obj = fd.get_term(term)
            # this makes sure we only take the first instance on each line
            done = {}
            for inst in term_obj.term_instances:
                if inst.doc_loc in lines and inst.doc_loc not in done:
                    done[inst.doc_loc] = True
                    count += 1
                    out.write("%s - %d\n" % (term, count))
                    inst.print_as_tabbed_line(out)
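# For reference, the loops above assume a terms dictionary that maps each
# term to a list of (doc, lines) pairs, where lines holds document locations.
# The sketch below shows a hypothetical reader for a simple tab-separated
# "term TAB document TAB doc-loc" instances file; the actual format expected
# by _read_terms may differ, and the real reader is _read_terms.

def _read_terms_sketch(instances_file):
    """Return a mapping from term to a list of (doc, lines) pairs (sketch
    only)."""
    terms = {}
    with codecs.open(instances_file, encoding='utf8') as fh:
        for line in fh:
            term, doc, doc_loc = line.rstrip("\n").split("\t")
            terms.setdefault(term, {}).setdefault(doc, []).append(doc_loc)
    return dict((term, docs.items()) for term, docs in terms.items())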
def annotate_inventions(dirname, rconfig, filelist, chunks):
    """Create a directory with annotation files in t0_annotation/<name>."""
    dataset_tags = find_input_dataset(rconfig, 'd2_tag')
    dataset_feats = find_input_dataset(rconfig, 'd3_phr_feats')
    check_file_availability(dataset_tags, filelist)
    check_file_availability(dataset_feats, filelist)
    write_info(rconfig, dirname, filelist)
    outfile = os.path.join(dirname, 'annotate.inventions.unlab.txt')
    output_fh = codecs.open(outfile, 'w', encoding='utf-8')
    tag_files = list(filename_generator(dataset_tags.path, filelist))
    feat_files = list(filename_generator(dataset_feats.path, filelist))
    # add the content of the general info file as a preface
    with open(os.path.join(dirname, 'annotate.info.general.txt')) as fh:
        for line in fh:
            output_fh.write("# %s\n" % line.rstrip())
        output_fh.write("#\n")
    for tag_file, feat_file in zip(tag_files, feat_files):
        fd = FileData(tag_file, feat_file)
        _add_file_data_to_annotation_file(output_fh, fd)
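# The preface-writing block above also occurs verbatim in annotate_terms; a
# shared helper along the lines of this hypothetical sketch would remove the
# duplication (not part of the codebase).

def _write_info_preface(out, dirname):
    """Copy annotate.info.general.txt into out as comment lines (sketch
    only)."""
    with open(os.path.join(dirname, 'annotate.info.general.txt')) as fh:
        for line in fh:
            out.write("# %s\n" % line.rstrip())
        out.write("#\n")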
def _find_datasets(self):
    """Select data sets and check whether all files are available."""
    self.input_dataset = find_input_dataset(self.rconfig, 'd3_phr_feats')
    check_file_availability(self.input_dataset, self.file_list)