Пример #1
0
def annotate_technologies(dirname, rconfig, filelist, sort_terms_p,
                          print_context_p):
    """Create input for manually annotation (that is, creation of a labeled list
    of terms). Given a runtime configuration for a corpus and a file list,
    creates three files: (1) list of unlabeled terms, ordered on frequency, (2)
    list of ordered terms with frequency information, and (3) an ordered list of
    terms with contexts. For contexts, a maximum of 10 is printed, selected
    randomly."""

    print "finding tags..."
    dataset_tags = find_input_dataset(rconfig, 'd2_tag')
    print "finding features..."
    dataset_features = find_input_dataset(rconfig, 'd3_phr_feats')

    if verbose:
        print "\nFILELIST:", filelist
        print "DATASET TAGS: ", dataset_tags
        print "DATASET FEATS:", dataset_features, "\n"

    check_file_availability(dataset_tags, filelist)
    check_file_availability(dataset_features, filelist)

    write_info(rconfig, dirname, filelist)
    term_contexts = {}
    if print_context_p:
        term_contexts = collect_contexts(dataset_tags, dataset_features,
                                         filelist)
    term_counts = collect_counts(dataset_features, filelist)
    term_count_list = sorted(list(term_counts.items()))
    term_count_list.sort(lambda x, y: cmp(y[1], x[1]))
    print_annotation_files(dirname, term_count_list, term_contexts,
                           sort_terms_p, print_context_p)
Пример #2
0
 def _find_datasets(self):
     """Select data sets and check whether all files are available."""
     # TODO: this is the same as the method on TrainerClassifier
     print "[Collector] finding dataset and checking file availability"
     self.input_dataset = find_input_dataset(self.rconfig, 'd3_phr_feats')
     print "[Collector]", self.input_dataset
     check_file_availability(self.input_dataset, self.file_list)
Пример #3
0
def annotate_something(dirname, rconfig, filelist, chunks):
    """Stub that illustrates how annotation files can be created when you
    work from a list of files. It only contains scaffolding: it shows how to
    get at the datasets and how to pull information out of the phrase feature
    and tag files, but it does not write any annotations itself. The chunks
    argument is not used here."""

    # Step 1: look up the tag and phrase feature datasets and make sure that
    # the files from the file list are available in each of them.
    dataset_tags = find_input_dataset(rconfig, 'd2_tag')
    dataset_feats = find_input_dataset(rconfig, 'd3_phr_feats')
    for dataset in (dataset_tags, dataset_feats):
        check_file_availability(dataset, filelist)

    # Step 2: write down general information (command used, corpus directory
    # and git commit) as well as the list of files used. As a side effect
    # this creates the output directory.
    write_info(rconfig, dirname, filelist)

    # Step 3a: scaffolding for the case where one dataset has everything you
    # need; loop over the file names and read each file line by line.
    for path in filename_generator(dataset_feats.path, filelist):
        with open_input_file(path) as infile:
            for line in infile:
                # pull what you need from the line, perhaps storing it in a
                # temporary data structure
                pass

    # Step 3b: scaffolding for the case where the information is spread out
    # over the feature and the tag files; the two lists are walked in
    # parallel by position.
    tag_paths = list(filename_generator(dataset_tags.path, filelist))
    feat_paths = list(filename_generator(dataset_feats.path, filelist))
    for idx, tag_path in enumerate(tag_paths):
        # FileData combines a tag file with its feature file
        file_data = FileData(tag_path, feat_paths[idx])
        # Term-related information lives in the Term object and its
        # term_instances variable; print to the annotation file(s) from here
        # or collect things in an intermediate structure and print later.
        for term in file_data.get_terms():
            term_obj = file_data.get_term(term)
Пример #4
0
def annotate_terms(dirname, rconfig, instances_file):
    """Create an annotation file for term instances."""

    # Get the datasets
    dataset_tags = find_input_dataset(rconfig, 'd2_tag')
    dataset_feats = find_input_dataset(rconfig, 'd3_phr_feats')
    print dataset_feats.path
    print dataset_tags.path

    # Create the directory where the files will be written to; write info; and
    # open the output file, intializing it with info
    write_info(rconfig, dirname, instances_file)
    outfile = os.path.join(dirname, 'annotate.terms.context.txt')
    out = codecs.open(outfile, 'w', encoding='utf8')
    # add the content of the general info file as a preface
    with open(os.path.join(dirname, 'annotate.info.general.txt')) as fh:
        for line in fh:
            out.write("# %s\n" % line.rstrip())
        out.write("#\n")

    # time to get the terms
    terms = _read_terms(instances_file)
    _reduce_terms(terms)
    #_print_terms(terms)

    for term, locations in terms.items():
        count = 0
        for doc, lines in locations:
            #sys.stdout.write("%s %s %s" % (term, doc, lines))
            print term, doc, lines
            phr_file = os.path.join(dataset_feats.path, 'files', doc) + '.xml'
            tag_file = os.path.join(dataset_tags.path, 'files', doc) + '.xml'
            fd = FileData(tag_file, phr_file)
            term_obj = fd.get_term(term)
            # this helps just getting the first instance in a line
            done = {}
            for inst in term_obj.term_instances:
                if inst.doc_loc in lines and inst.doc_loc not in done:
                    done[inst.doc_loc] = True
                    count += 1
                    out.write("%s - %d\n" % (term, count))
                    inst.print_as_tabbed_line(out)
Пример #5
0
def annotate_inventions(dirname, rconfig, filelist, chunks):
    """Create a directory with annotation files in t0_annotation/<name>.

    Writes 'annotate.inventions.unlab.txt' to dirname. The file starts with
    the content of 'annotate.info.general.txt' as a commented preface, after
    which the data of each tag/feature file pair is added.

    Arguments:
        dirname: directory where the output is written; the general info
            file is read from the same directory after write_info() ran.
        rconfig: runtime configuration, used to look up the 'd2_tag' and
            'd3_phr_feats' datasets.
        filelist: file list whose files are checked for availability and
            then processed.
        chunks: not used by this function.
    """

    dataset_tags = find_input_dataset(rconfig, 'd2_tag')
    dataset_feats = find_input_dataset(rconfig, 'd3_phr_feats')
    check_file_availability(dataset_tags, filelist)
    check_file_availability(dataset_feats, filelist)

    write_info(rconfig, dirname, filelist)
    # Both lists are generated from the same file list, so they are expected
    # to line up one-to-one.
    tag_files = list(filename_generator(dataset_tags.path, filelist))
    feat_files = list(filename_generator(dataset_feats.path, filelist))

    outfile = os.path.join(dirname, 'annotate.inventions.unlab.txt')
    # The with statement makes sure the output file is flushed and closed,
    # also when processing one of the files raises an exception.
    with codecs.open(outfile, 'w', encoding='utf-8') as output_fh:
        # add the content of the general info file as a preface
        with open(os.path.join(dirname, 'annotate.info.general.txt')) as fh:
            for line in fh:
                output_fh.write("# %s\n" % line.rstrip())
            output_fh.write("#\n")

        for tag_file, feat_file in zip(tag_files, feat_files):
            fd = FileData(tag_file, feat_file)
            _add_file_data_to_annotation_file(output_fh, fd)
 def _find_datasets(self):
     """Look up the 'd3_phr_feats' input dataset for the runtime
     configuration, store it on the instance and check the availability of
     the files in the file list."""
     dataset = find_input_dataset(self.rconfig, 'd3_phr_feats')
     self.input_dataset = dataset
     check_file_availability(dataset, self.file_list)