def _create_mallet_file(self, corpus=True):
    if corpus:
        fnames = filename_generator(self.input_dataset.path,
                                    self.file_list)
    else:
        fnames = [f.strip() for f in open(self.file_list).readlines()]
        fnames = [f for f in fnames if f.strip()]
    fh = open_output_file(self.mallet_file, compress=False)
    self._set_features()
    if VERBOSE:
        print "[create_mallet_file] features: %s" % (self.features)
    features = dict([(f, True) for f in self.features])
    stats = {'labeled_count': 0, 'unlabeled_count': 0, 'total_count': 0}
    count = 0
    for fname in fnames:
        count += 1
        if VERBOSE:
            print "[create_mallet_file] %05d %s" % (count, fname)
        train.add_file_to_utraining_test_file(
            fname,
            fh, {},
            features,
            stats,
            use_all_chunks_p=self.use_all_chunks_p)
    fh.close()
    if VERBOSE:
        print "[create_mallet_file]", stats

def _build_model(self):
    """Build the classifier model using the doc features files."""
    fnames = filename_generator(self.input_dataset.path, self.file_list)
    train.patent_utraining_data3(self.mallet_file, self.annotation_file,
                                 self.annotation_count, fnames,
                                 self.features, self.model, self.xval,
                                 VERBOSE, self.info_file_stats)
Example #3
def collect_contexts(dataset_tags, dataset_feats, filelist):
    """Collect all contexts from the dataset and return them as a dictionary
    indexed on terms. Each value is a list of [year, id, context] triples."""
    tag_files = list(filename_generator(dataset_tags.path, filelist))
    feat_files = list(filename_generator(dataset_feats.path, filelist))
    contexts = {}
    for i in range(len(tag_files)):
        if verbose:
            print '[collect_contexts]', tag_files[i]
            print '[collect_contexts]', feat_files[i]
        fd = FileData(tag_files[i], feat_files[i])
        for term in fd.get_terms():
            term_obj = fd.get_term(term)
            for instance in term_obj.term_instances:
                contexts.setdefault(term, []).append(instance)
    return contexts
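
# For illustration only (not in the original code): a minimal usage sketch of
# collect_contexts(), assuming dataset objects from find_input_dataset() and a
# file list from the run configuration, as in the annotation functions below.
contexts = collect_contexts(dataset_tags, dataset_feats, filelist)
for term, instances in contexts.items():
    print '[collect_contexts]', term, len(instances)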
Example #4
def annotate_something(dirname, rconfig, filelist, chunks):
    """This is a stub method that explains a bit more about how to create
    annotation files. It includes scaffolding that shows how to pull
    information out of phrase feature and tag files. This is for cases where
    you use a list of files."""

    # Here is how you get the datasets
    dataset_tags = find_input_dataset(rconfig, 'd2_tag')
    dataset_feats = find_input_dataset(rconfig, 'd3_phr_feats')

    # Check whether files from the file list are available
    check_file_availability(dataset_tags, filelist)
    check_file_availability(dataset_feats, filelist)

    # Next would typically be some way of writing down the information. The
    # following writes general information (the command used, the corpus
    # directory and the git commit) as well as the list of files used. It also
    # creates the output directory.
    write_info(rconfig, dirname, filelist)

    # Now we can get the file names, loop over them, and extract the needed
    # information. The code below is some scaffolding if all you need is in one
    # dataset.
    fnames = filename_generator(dataset_feats.path, filelist)
    for fname in fnames:
        with open_input_file(fname) as fh:
            for line in fh:
                # extract data from the line; you may want to put it in some
                # temporary data structure
                pass

    # And this is what you do if you need information that is distributed over
    # the feature and tag files.
    tag_files = list(filename_generator(dataset_tags.path, filelist))
    feat_files = list(filename_generator(dataset_feats.path, filelist))
    for i in range(len(tag_files)):
        # the FileData object
        fd = FileData(tag_files[i], feat_files[i])
        # All term-related information lives in the Term object and its
        # term_instances variable. You can print to the annotation file(s)
        # from here, or first build some intermediate data structure and print
        # the output later (a minimal sketch follows after this function).
        for term in fd.get_terms():
            term_obj = fd.get_term(term)
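
# A minimal sketch of that last step (not part of the original code): the
# helper name and the tab-separated output format are assumptions made for
# illustration only.
import codecs

def _write_term_instances(fd, outfile):
    # Hypothetical helper: write one line per term instance, using the same
    # FileData/Term accessors as in the scaffolding above.
    out_fh = codecs.open(outfile, 'w', encoding='utf-8')
    for term in fd.get_terms():
        term_obj = fd.get_term(term)
        for instance in term_obj.term_instances:
            out_fh.write("%s\t%s\n" % (term, instance))
    out_fh.close()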
Example #5
def _create_mallet_file(self):
    self._load_phrase_labels()
    mconfig = mallet.MalletConfig(
        self.model, 'train', 'classify', '0', self.model, '/tmp',
        classifier_type="MaxEnt", number_xval=0, training_portion=0,
        prune_p=False, infogain_pruning="5000", count_pruning="3")
    mtr = mallet.MalletTraining(mconfig)
    fnames = filename_generator(self.input_dataset.path, self.file_list)
    mtr.make_utraining_file3(fnames, self.d_phr2label, verbose=VERBOSE)
    self._create_info_stats_file(mtr.stats_labeled_count, mtr.stats_unlabeled_count,
                                 mtr.stats_terms, mtr.stats_terms_y, mtr.stats_terms_n)
Example #6
def run(self):
    self.time = time.time()
    self._find_datasets()
    self._create_info_files()
    fnames = filename_generator(self.input_dataset.path, self.file_list)
    count = 0
    fh = codecs.open(self.locations_file, 'w', encoding='utf-8')
    for fname in fnames:
        count += 1
        # if count > 5: break
        print_file_progress("Collector", count, fname, VERBOSE)
        self._process_file(fname, fh)
    self._finish()
Example #7
def annotate_inventions(dirname, rconfig, filelist, chunks):
    """Create a directory with annotation files in t0_annotation/<name>."""

    dataset_tags = find_input_dataset(rconfig, 'd2_tag')
    dataset_feats = find_input_dataset(rconfig, 'd3_phr_feats')
    check_file_availability(dataset_tags, filelist)
    check_file_availability(dataset_feats, filelist)

    write_info(rconfig, dirname, filelist)
    outfile = os.path.join(dirname, 'annotate.inventions.unlab.txt')
    output_fh = codecs.open(outfile, 'w', encoding='utf-8')
    tag_files = list(filename_generator(dataset_tags.path, filelist))
    feat_files = list(filename_generator(dataset_feats.path, filelist))

    # add the content of the general info file as a preface
    with open(os.path.join(dirname, 'annotate.info.general.txt')) as fh:
        for line in fh:
            output_fh.write("# %s\n" % line.rstrip())
        output_fh.write("#\n")

    for i in range(len(tag_files)):
        fd = FileData(tag_files[i], feat_files[i])
        _add_file_data_to_annotation_file(output_fh, fd)
Example #8
def check_file_availability(dataset, filelist):
    """Check whether all files in filelist are available in the dataset. If
    not, print a warning and exit. This method allows for the possibility that
    a file was compressed."""
    file_generator = filename_generator(dataset.path, filelist)
    total = 0
    not_in_dataset = 0
    for fname in file_generator:
        total += 1
        if not os.path.exists(fname) and not os.path.exists(fname + '.gz'):
            not_in_dataset += 1
    if not_in_dataset > 0:
        sys.exit(
            "WARNING: %d/%d files in %s have not been processed yet\n         %s"
            % (not_in_dataset, total, os.path.basename(filelist), dataset))
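
# Illustrative call only (not in the original code), assuming a dataset from
# find_input_dataset() and a file list path as in the annotation functions
# above; exits with a warning if some files have not been processed yet.
dataset_feats = find_input_dataset(rconfig, 'd3_phr_feats')
check_file_availability(dataset_feats, filelist)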
Example #9
def collect_counts(dataset, filelist):
    """Return a dictionary mapping each term to the number of documents it
    appeared in. This assumes that the dataset is a d3_phr_feats dataset."""
    counts = {}
    fnames = filename_generator(dataset.path, filelist)
    for fname in fnames:
        if verbose:
            print '[collect_counts]', fname
        # TODO: this is dangerous because it makes assumptions about the
        # directory structure; something similar was the case in step2 for at
        # least the docfeats generation
        year = os.path.basename(os.path.dirname(fname))
        doc_id = os.path.basename(fname)
        with open_input_file(fname) as fh:
            docfeats = generate_doc_feats(fh, doc_id, year)
            for term in docfeats.keys():
                counts[term] = counts.get(term, 0) + 1
    return counts
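
# Illustrative follow-up (not in the original code): use the counts from
# collect_counts() to print the most frequent terms; the threshold of 25
# documents is an arbitrary assumption.
counts = collect_counts(dataset, filelist)
for term, n in sorted(counts.items(), key=lambda pair: -pair[1]):
    if n >= 25:
        print '[collect_counts]', n, term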
Example #10
def _create_mallet_file(self):
    print "[--classify] creating vector file - %s" % os.path.basename(
        self.mallet_file)
    count = 0
    d_phr2label = train.load_phrase_labels3(self.label_file)
    fh = open_output_file(self.mallet_file, compress=False)
    stats = {'labeled_count': 0, 'unlabeled_count': 0, 'total_count': 0}
    fnames = filename_generator(self.input_dataset.path, self.file_list)
    for phr_feats_file in fnames:
        count += 1
        if VERBOSE:
            print "[--classify] %05d %s" % (count, phr_feats_file)
        train.add_file_to_utraining_test_file(
            phr_feats_file,
            fh,
            d_phr2label,
            self.d_features,
            stats,
            use_all_chunks_p=self.use_all_chunks_p)
    fh.close()
    print "[--classify]", stats
Example #11
def run(self):
    self.time = time.time()
    if self.rconfig.corpus is not None:
        self._find_datasets()
        fnames = filename_generator(self.input_dataset.path,
                                    self.file_list)
        print fnames.next()
    else:
        fnames = [f.strip() for f in open(self.file_list).readlines()]
        print 11, fnames[0], 22
    self._create_info_files()
    with codecs.open(self.results_file1, 'w', encoding='utf-8') as fh:
        print "[--matcher] applying patterns to files"
        count = 0
        for fname in fnames:
            count += 1
            print_file_progress("Matcher", count, fname, VERBOSE)
            # if count > 10: break
            self.run_matcher_on_file(fname, fh)
    self.create_summary()
    self.feature_statistics.write_to_file()
    self._finish()