Example #1
def parse_words(file_obj, tag_scheme=None):
    # Parse a CoNLL training file into a flat word list; optionally re-encode
    # the gold labels (e.g. from IOB to BILOU) with the given tag scheme.
    word_list = parsed_documents_to_words(parse_conll_train(file_obj.read()))

    if tag_scheme is not None:
        tag_scheme.encode(word_list, 'gold_label')

    return word_list
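A hedged usage sketch: the BILOU scheme object used in the later examples would be a natural tag_scheme argument here, and 'train.conll' is a hypothetical path.

# Sketch only: BILOU comes from this codebase; the path is a placeholder.
with open('train.conll', 'rb') as f:
    words = parse_words(f, tag_scheme=BILOU)
print('parsed %d words' % len(words))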
Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i',
                        '--test_file',
                        default=L1_TAGGED_TEST_FILE_PATH,
                        help='path to test file with l1 tags')
    parser.add_argument('-m',
                        '--model_file',
                        default=L2_MODEL_FILE_PATH,
                        help='path to model parameters')
    parser.add_argument('-l',
                        '--lexicon_file',
                        default=L2_CLASS_LEXICON_FILE_PATH,
                        help='path to label lexicon')
    parser.add_argument('-o',
                        '--output_file',
                        default=L2_TAGGED_TEST_FILE_PATH,
                        help='path to output file')
    args = parser.parse_args()

    logger.info('Loading trained model.')
    trained_model, class_lexicon = load_model(args.model_file,
                                              args.lexicon_file)

    logger.info('Reading test file.')
    with open(args.test_file, 'rb') as f:
        test_data = f.read()

    logger.info('Parsing test data.')
    parsed_documents_list = parse_conll_tagged(test_data)
    logger.info('Parse completed (%d documents).', len(parsed_documents_list))

    logger.info('Building word list from test data.')
    test_words = parsed_documents_to_words(parsed_documents_list)

    logger.info('Annotate test words.')
    annotate_data(test_words)

    logger.info(
        'Set tag as prev_level_tag for prev level aggregation features.')
    for word in test_words:
        word.prev_level_tag = word.tag

    logger.info('Annotate words with prev level entities.')
    annotate_prev_level_entities(test_words)

    logger.info('Start greedy decoding.')
    greedy_decoding(test_words, trained_model, class_lexicon, L2_FEATURES)

    logger.info('Convert tag scheme from BILOU to IOB.')
    BILOU.decode(test_words, tag_attr='tag')

    logger.info('Formatting output.')
    parsed_documents_list = words_to_parsed_documents(test_words)
    output_data = format_conll_tagged(parsed_documents_list)

    logger.info('Dumping output.')
    with open(args.output_file, 'wb') as f:
        f.write(output_data)
Example #3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-t',
                        '--train_file',
                        default=L1_TRAIN_FILE_PATH,
                        help='path to training file')
    parser.add_argument('-b',
                        '--brown_dir',
                        default=BROWN_CLUSTERS_DIR_PATH,
                        help='path to brown cluster directory')
    args = parser.parse_args()

    logger.info('Reading train data.')
    with open(args.train_file, 'rb') as f:
        train_data = f.read()

    logger.info('Parsing train data.')
    parsed_document_list = parse_conll_train(train_data)
    logger.info('Parse completed (%d documents).', len(parsed_document_list))

    logger.info('Building word list from train data.')
    training_words = parsed_documents_to_words(parsed_document_list)

    logger.info('Parsing Brown clusters directory.')
    all_clusters_dict = parse_brown_clusters_directory(args.brown_dir)

    logger.info('Calculating prefix len for each path.')
    cluster_to_path_histogram_by_offset = dict()
    cluster_to_prefix_histogram_by_offset = dict()
    cluster_to_prefix_to_len_by_offset = dict()
    for corpus_name, cluster_dict in all_clusters_dict.items():
        path_histogram_by_offset = dict()
        prefix_histogram_by_offset = dict()
        path_to_len_by_offset = dict()

        for offset in (-2, -1, 0, 1, 2):
            path_to_type_histogram = build_path_histogram(
                cluster_dict, training_words, offset)
            path_histogram_by_offset[offset] = path_to_type_histogram

            prefix_to_type_histogram = build_prefix_histogram(
                path_to_type_histogram)
            prefix_histogram_by_offset[offset] = prefix_to_type_histogram

            path_to_len = get_path_to_prefix_len(path_to_type_histogram,
                                                 prefix_to_type_histogram)
            path_to_len_by_offset[offset] = path_to_len

        cluster_to_path_histogram_by_offset[
            corpus_name] = path_histogram_by_offset
        cluster_to_prefix_histogram_by_offset[
            corpus_name] = prefix_histogram_by_offset
        cluster_to_prefix_to_len_by_offset[corpus_name] = path_to_len_by_offset

    logger.info('Dumping result to file.')
    # json.dump writes text, so the file must be opened in text mode
    with open(BROWN_PREFIX_TO_LEN_FILE_PATH, 'w') as f:
        json.dump(cluster_to_prefix_to_len_by_offset, f, indent=4)
Example #4
def main():
    tagged_file_path = L1_TAGGED_TEST_FILE_PATH if len(sys.argv) == 1 else sys.argv[1]

    # parse word list from tagged file
    with open(tagged_file_path, 'rb') as f:
        raw_data = f.read()
    word_list = parsed_documents_to_words(parse_conll_tagged(raw_data))

    # extract tagged and gold entities (per entity type)
    tagged_entities_by_type = extract_entities_by_types(word_list, tag_attr='tag')
    gold_entities_by_type = extract_entities_by_types(word_list, tag_attr='gold_label')

    # print stats per entity type
    for group in sorted(gold_entities_by_type.keys()):
        # .get() guards against entity types present in the gold data but
        # never predicted by the tagger
        tagged = tagged_entities_by_type.get(group, [])
        precision, recall, f1 = get_stats(tagged, gold_entities_by_type[group])
        print('%15s prec: %.2f recall: %.2f f1: %.2f %d' %
              (group, precision, recall, f1, len(tagged)))
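get_stats itself is project code not shown on this page. Assuming it computes the standard entity-level metrics, a plausible sketch (not the project's actual implementation) is:

def get_stats_sketch(tagged_entities, gold_entities):
    # Entity-level scoring: a predicted entity counts as correct only if it
    # exactly matches a gold entity (same span and type).
    tagged, gold = set(tagged_entities), set(gold_entities)
    true_positives = len(tagged & gold)
    precision = true_positives / len(tagged) if tagged else 0.0
    recall = true_positives / len(gold) if gold else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if precision + recall else 0.0)
    return precision, recall, f1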
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-t',
                        '--train_file',
                        default=L1_TRAIN_FILE_PATH,
                        help='path to training file')
    parser.add_argument('-m',
                        '--model_file',
                        default=L1_MODEL_FILE_PATH,
                        help='path to save trained model parameters')
    parser.add_argument('-l',
                        '--lexicon_file',
                        default=L1_CLASS_LEXICON_FILE_PATH,
                        help='path to save label lexicon')
    args = parser.parse_args()

    logger.info('Reading train data.')
    with open(args.train_file, 'rb') as f:
        train_data = f.read()

    logger.info('Parsing train data.')
    parsed_document_list = parse_conll_train(train_data)
    logger.info('Parse completed (%d documents).', len(parsed_document_list))

    logger.info('Building word list from train data.')
    training_words = parsed_documents_to_words(parsed_document_list)

    logger.info('Annotate training data.')
    annotate_data(training_words)

    logger.info('Convert gold label tag scheme from IOB to BILOU.')
    BILOU.encode(training_words, tag_attr='gold_label')

    logger.info('Set gold label as tag (for model tag features).')
    for word in training_words:
        word.tag = word.gold_label

    logger.info('Train model.')
    trained_model_params, class_lexicon = train(training_words, L1_FEATURES,
                                                NUM_TRAIN_ITERATIONS)
    save_model(trained_model_params, args.model_file, class_lexicon,
               args.lexicon_file)
Example #6
def _document_text_to_words(document_text):
    # Build a minimal parsed-document structure (one dict per token, one
    # sentence per line) and convert it to a flat word list.
    parsed_document_mock = list()
    for sentence_text in document_text.splitlines():
        parsed_sentence_mock = [{'text': word_text}
                                for word_text in sentence_text.split()]
        parsed_document_mock.append(parsed_sentence_mock)
    return parsed_documents_to_words([parsed_document_mock])
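A quick sanity check of the helper (hypothetical input; the count follows from whitespace splitting, one word object per token):

# Two sentences, six tokens in total, so six word objects are expected.
words = _document_text_to_words('John lives in Boston\nMary too')
print(len(words))  # expected: 6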