Example #1
def run_utest(target_path, language, version, limit, classifier='MaxEnt',
              use_all_chunks_p=True):

    """Run the classifier on n=limit documents. Batch version of the function
    train.patent_utraining_test_data(). Appends results to test/utest.1.MaxEnt.out and
    keeps intermediate results for this invocation in test/utest.1.mallet.START-END (raw
    feature vectors) and test/utest.1.MaxEnt.out.BEGIN_END, where BEGIN and END are
    taken from ALL_STAGES.txt and the limit parameter."""
    
    # get dictionary of annotations and keep label statistics (total_count == unlabeled_count
    # if use_all_chunks_p is False, otherwise total_count == unlabeled_count + labeled_count)
    d_phr2label = train.load_phrase_labels(target_path, language)
    stats = { 'labeled_count': 0, 'unlabeled_count': 0, 'total_count': 0 }

    stages = read_stages(target_path, language)
    fnames = files_to_process(target_path, language, stages, '--utest', limit)
    (train_dir, test_dir, mallet_file, results_file, all_results_file) = \
        _classifier_io(target_path, language, version, classifier, stages)
    print "[--utest] vector file - %s" %  mallet_file
    print "[--utest] results file - %s" %  results_file

    count = 0
    fh = codecs.open(mallet_file, "a", encoding='utf-8')
    for (year, fname) in fnames:
        count += 1
        doc_feats_file = os.path.join(target_path, language, 'doc_feats', year, fname)
        if verbose:
            print "%05d %s" % (count, doc_feats_file)
        train.add_file_to_utraining_test_file(doc_feats_file, fh, d_phr2label, stats,
                                              use_all_chunks_p=use_all_chunks_p)
    fh.close()
    
    _run_classifier(train_dir, test_dir, version, classifier, mallet_file, results_file)
    #_append_classifier_results(results_file, all_results_file)
    update_stages(target_path, language, '--utest', limit)
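
All batch functions in this module share the same bookkeeping protocol: read_stages() reports how many files each stage has already processed (from ALL_STAGES.txt), files_to_process() selects the next batch, and update_stages() advances the counter afterwards. The real implementations are not shown in this excerpt; a minimal sketch of the read/update pair, assuming ALL_STAGES.txt holds one "stage count" pair per line, looks like this:

import os

def read_stages(target_path, language):
    # Map stage names (e.g. '--utest') to the number of files already
    # processed, as recorded in ALL_STAGES.txt (assumed format: 'stage count').
    stages = {}
    stages_file = os.path.join(target_path, language, 'ALL_STAGES.txt')
    if os.path.exists(stages_file):
        for line in open(stages_file):
            if line.strip():
                stage, count = line.split()
                stages[stage] = int(count)
    return stages

def update_stages(target_path, language, stage, limit):
    # Increment the counter for the given stage by limit and rewrite the file.
    stages = read_stages(target_path, language)
    stages[stage] = stages.get(stage, 0) + limit
    fh = open(os.path.join(target_path, language, 'ALL_STAGES.txt'), 'w')
    for name in sorted(stages):
        fh.write("%s %d\n" % (name, stages[name]))
    fh.close()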
Example #2
def find_technologies_batch(patterns, target_path, language, limit, verbose):
    """Similar to find_technologies() in that it finds pattern matches in a file and
    output results in a tab separated file. The difference is that this version runs in
    batch mode and will only do those parts of the file that fall within the limit."""

    features_file = os.path.join(target_path, language, "ws", "phr_feats.all")
    matches_file = os.path.join(target_path, language, "ws", "matches.all")
    f1 = codecs.open(features_file, encoding='utf8')
    f2 = codecs.open(matches_file, 'a', encoding='utf8')

    stages = read_stages(target_path, language)
    begin = stages.get('--matcher', 0)
    end = begin + limit

    current_fname = None
    file_count = 0
    for phrase in f1:
        if file_count > end:
            break
        (fname_id, year, term, rest) = phrase.strip("\n").split("\t", 3)
        (current_fname, file_count) = _update_state(fname_id, current_fname,
                                                    file_count, begin, end,
                                                    verbose)
        if file_count > begin:
            for p in patterns:
                matched_part = _match_pattern(p, phrase)
                if matched_part is not False:
                    line = "%s\t%s\t%s\t%s\t%s\n" % (fname_id, year, p[0],
                                                     term, matched_part)
                    f2.write(line)

    f1.close()
    f2.close()
    update_stages(target_path, language, '--matcher', limit)
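
The helpers _update_state() and _match_pattern() are not defined in this excerpt. A plausible reading, assuming a pattern is a (label, compiled-regex) pair and that file_count advances whenever a new document id is seen:

import re

# e.g. patterns = [('method', re.compile(r'\bmethod\b'))]

def _update_state(fname_id, current_fname, file_count, begin, end, verbose):
    # Bump the file counter on a document boundary and return the new state.
    if fname_id != current_fname:
        file_count += 1
        if verbose and begin < file_count <= end:
            print "[--matcher] %05d %s" % (file_count, fname_id)
    return (fname_id, file_count)

def _match_pattern(p, phrase):
    # Return the matched substring if the pattern fires on this line,
    # False otherwise; p is assumed to be a (label, regex) pair.
    match = p[1].search(phrase)
    return match.group(0) if match else False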
Example #3
def run_tag2chk(target_path, language, limit, chunk_filter):
    """Runs the np-in-context code on tagged input. Populates language/phr_occ and
    language/phr_feat. Sets the contents of config-chunk-filter.txt given the value of
    chunk_filter."""
    print "[--tag2chk] on %s/%s/tag/" % (target_path, language)

    filter_setting = "on" if chunk_filter else "off"
    _save_config(target_path, language, 'chunk-filter', filter_setting)
    #fh = open(os.path.join(target_path, language, 'config-chunk-filter.txt'), 'w')
    #filter_setting = "on" if chunk_filter else "off"
    #fh.write("chunk-filter %s\n" % filter_setting)
    #fh.close()

    stages = read_stages(target_path, language)
    fnames = files_to_process(target_path, language, stages, '--tag2chk', limit)
    count = 0
    for (year, fname) in fnames:
        count += 1
        tag_file = os.path.join(target_path, language, 'tag', year, fname)
        occ_file = os.path.join(target_path, language, 'phr_occ', year, fname)
        fea_file = os.path.join(target_path, language, 'phr_feats', year, fname)
        if verbose:
            print "[--tag2chk] %04d adding %s" % (count, occ_file)
        tag2chunk.Doc(tag_file, occ_file, fea_file, year, language, filter_p=chunk_filter)
    update_stages(target_path, language, '--tag2chk', limit)
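
The commented-out lines above preserve what _save_config() replaced, which pins down its behavior for the chunk-filter case. A sketch consistent with them, generalizing the file name to config-<name>.txt (the generalization is an assumption):

import os

def _save_config(target_path, language, name, value):
    # Write a one-line config file, e.g. config-chunk-filter.txt
    # containing 'chunk-filter on'.
    fh = open(os.path.join(target_path, language, 'config-%s.txt' % name), 'w')
    fh.write("%s %s\n" % (name, value))
    fh.close()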
Example #4
def run_txt2tag(target_path, language, limit):
    """Takes txt files and runs the tagger (and segmenter for Chinese) on them. Adds files to
    the language/tag and language/seg directories. Works on pasiphae but not on chalciope."""
    print "[--txt2tag] on %s/%s/txt/" % (target_path, language)
    stages = read_stages(target_path, language)
    tagger = txt2tag.get_tagger(language)
    segmenter = sdp.Segmenter()
    fnames = files_to_process(target_path, language, stages, '--txt2tag', limit)
    count = 0
    for year, fname in fnames:
        count += 1
        txt_file = os.path.join(target_path, language, 'txt', year, fname)
        seg_file = os.path.join(target_path, language, 'seg', year, fname)
        tag_file = os.path.join(target_path, language, 'tag', year, fname)
        if language == 'cn':
            if verbose:
                print "[--txt2tag] %04d creating %s" % (count, seg_file)
            cn_txt2seg.seg(txt_file, seg_file, segmenter)
            if verbose:
                print "[--txt2tag] %04d creating %s" % (count, tag_file)
            cn_seg2tag.tag(seg_file, tag_file, tagger)
        else:
            if verbose:
                print "[--txt2tag] %04d creating %s" % (count, tag_file)
            txt2tag.tag(txt_file, tag_file, tagger)
    update_stages(target_path, language, '--txt2tag', limit)
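
Note that run_txt2tag creates an sdp.Segmenter even for English and German runs, where it is never used. If segmenter startup is expensive, a lazy variant (a sketch, assuming the constructor has no other side effects) avoids the cost:

    # create the segmenter only when the language actually needs it
    segmenter = sdp.Segmenter() if language == 'cn' else None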
Example #5
def run_xml2txt(target_path, language, limit):
    """Takes xml files and runs the document structure parser in onto mode. Adds files
    to the language/txt directory and ds_* directories with intermediate document
    structure parser results."""
    print "[--xml2txt] on %s/%s/xml/" % (target_path, language)
    stages = read_stages(target_path, language)
    xml_parser = Parser()
    xml_parser.onto_mode = True
    mappings = {'en': 'ENGLISH', 'de': 'GERMAN', 'cn': 'CHINESE'}
    xml_parser.language = mappings[language]
    fnames = files_to_process(target_path, language, stages, '--xml2txt', limit)
    workspace = os.path.join(target_path, language, 'ws')
    count = 0
    for year, fname in fnames:
        count += 1
        source_file = os.path.join(target_path, language, 'xml', year, fname)
        target_file = os.path.join(target_path, language, 'txt', year, fname)
        if verbose:
            print "[--xml2txt] %04d creating %s" % (count, target_file)
        try:
            xml2txt.xml2txt(xml_parser, source_file, target_file, workspace)
        except Exception:
            fh = codecs.open(target_file, 'w')
            fh.close()
            print "[--xml2txt]      WARNING: error on", source_file
Example #6
def run_utrain(target_path, language, version, xval, limit):
    """Creates a mallet training file for labeled data with features as union of all
    phrase instances within a doc. Also creates a model utrain.<version>.MaxEnt.model in
    the train subdirectory. Limit is used to determine the size of the training set, as
    with run_annotate, it is not used for incrementing values in ALL_STAGES.txt. """

    stages = read_stages(target_path, language)
    fnames = files_to_process(target_path, language, stages, '--utrain', limit)
    annot_path = config_data.annotation_directory
    source_annot_lang_file = os.path.join(annot_path, language, 'phr_occ.lab')
    target_annot_lang_file = os.path.join(target_path, language, 'ws', 'phr_occ.lab')
    shutil.copyfile(source_annot_lang_file, target_annot_lang_file)
    #train.patent_utraining_data(target_path, language, version, xval, limit)
    train.patent_utraining_data2(target_path, language, fnames, version, xval)
    update_stages(target_path, language, '--utrain', limit)
Example #7
def run_pf2dfeats(target_path, language, limit):
    """Creates a union of the features for each chunk in a doc (for training)."""
    print "[--pf2dfeats] on %s/%s/phr_feats/" % (target_path, language)
    stages = read_stages(target_path, language)
    fnames = files_to_process(target_path, language, stages, '--pf2dfeats', limit)
    count = 0
    for (year, fname) in fnames:
        count += 1
        doc_id = os.path.splitext(os.path.basename(fname))[0]
        phr_file = os.path.join(target_path, language, 'phr_feats', year, fname)
        doc_file = os.path.join(target_path, language, 'doc_feats', year, fname)
        if verbose:
            print "[--pf2dfeats] %04d adding %s" % (count, doc_file)
        pf2dfeats.make_doc_feats(phr_file, doc_file, doc_id, year)
    update_stages(target_path, language, '--pf2dfeats', limit)
Example #8
def run_populate(source_path, target_path, language, limit):
    """Populate xml directory in the target directory with limit files from the source path."""
    print "[--populate] populating %s/%s/xml" % (target_path, language)
    print "[--populate] using %d files from %s" % (limit, source_path)
    stages = read_stages(target_path, language)
    fnames = files_to_process(target_path, language, stages, '--populate', limit)
    count = 0
    for (year, fname) in fnames:
        count += 1
        source_file = os.path.join(source_path, year, fname)
        target_file = os.path.join(target_path, language, 'xml', year, fname)
        if verbose:
            print "[--populate] %04d adding %s" % (count, target_file)
        shutil.copyfile(source_file, target_file)
    update_stages(target_path, language, '--populate', limit)
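
files_to_process() is the remaining piece of the stage protocol. A hypothetical version, assuming the candidate (year, fname) pairs come from a files.txt manifest under the language directory (the actual source of the file list is not shown in this excerpt):

import os

def files_to_process(target_path, language, stages, stage, limit):
    # Return the next limit (year, fname) pairs for the given stage,
    # skipping files already handled in earlier runs. The files.txt
    # manifest (one 'year fname' pair per line) is an assumption.
    begin = stages.get(stage, 0)
    fnames = []
    for n, line in enumerate(open(os.path.join(target_path, language, 'files.txt'))):
        if n < begin:
            continue
        if n >= begin + limit:
            break
        year, fname = line.split()
        fnames.append((year, fname))
    return fnames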
Example #9
def run_summary(target_path, language, limit):
    """Collect data from directories into workspace area: ws/doc_feats.all,
    ws/phr_feats.all and ws/phr_occ.all. All downstream processing should rely on these
    data and nothing else."""
    print "[--summary] appending to files in ws"
    #subprocess.call("sh ./cat_phr.sh %s %s" % (target_path, language), shell=True)
    stages = read_stages(target_path, language)
    fnames = files_to_process(target_path, language, stages, '--summary', limit)
    doc_feats_file = os.path.join(target_path, language, 'ws', 'doc_feats.all')
    phr_feats_file = os.path.join(target_path, language, 'ws', 'phr_feats.all')
    phr_occ_file = os.path.join(target_path, language, 'ws', 'phr_occ.all')
    fh_doc_feats = codecs.open(doc_feats_file, 'a', encoding='utf-8')
    fh_phr_feats = codecs.open(phr_feats_file, 'a', encoding='utf-8')
    fh_phr_occ = codecs.open(phr_occ_file, 'a', encoding='utf-8')
    for (year, fname) in fnames:
        doc_feats_source = os.path.join(target_path, language, 'doc_feats', year, fname)
        phr_feats_source = os.path.join(target_path, language, 'phr_feats', year, fname)
        phr_occ_source = os.path.join(target_path, language, 'phr_occ', year, fname)
        fh_doc_feats.write(codecs.open(doc_feats_source, encoding='utf-8').read())
        fh_phr_feats.write(codecs.open(phr_feats_source, encoding='utf-8').read())
        fh_phr_occ.write(codecs.open(phr_occ_source, encoding='utf-8').read())
    fh_doc_feats.close()
    fh_phr_feats.close()
    fh_phr_occ.close()
    update_stages(target_path, language, '--summary', limit)
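
Taken together, these functions form a linear pipeline over a single target directory, each stage consuming the previous stage's output directory. A hypothetical driver for an English batch of 500 files, with source_path and target_path as placeholders:

limit = 500
run_populate(source_path, target_path, 'en', limit)        # fill en/xml from the source corpus
run_xml2txt(target_path, 'en', limit)                      # en/xml -> en/txt
run_txt2tag(target_path, 'en', limit)                      # en/txt -> en/tag
run_tag2chk(target_path, 'en', limit, chunk_filter=True)   # en/tag -> en/phr_occ, en/phr_feats
run_pf2dfeats(target_path, 'en', limit)                    # en/phr_feats -> en/doc_feats
run_summary(target_path, 'en', limit)                      # collect everything into en/ws/*.all

Each call advances its own counter in ALL_STAGES.txt, so rerunning the driver processes the next 500 files.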