Example #1
                not options.vocabf):
            parser.print_help()
            exit()
        return options

    options = _cli()

    log.start(options.logfile)
    log.writeConfig([
        ('Input embeddings', options.inputf),
        ('Vocabulary file', options.vocabf),
        ('Output embeddings', options.outputf),
        ('Output embeddings format', options.output_format),
    ])

    t_sub = log.startTimer('Reading node2vec embeddings from %s...' % options.inputf)
    e = pyemblib.read(options.inputf,
                      format=pyemblib.Format.Word2Vec,
                      mode=pyemblib.Mode.Text)
    log.stopTimer(t_sub,
                  message='Read {0:,} embeddings in {1}s.\n'.format(len(e), '{0:.2f}'))

    log.writeln('Reading vocabulary mapping from %s...' % options.vocabf)
    vocab = readVocab(options.vocabf)
    log.writeln('Read {0:,} vocab mappings.\n'.format(len(vocab)))

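    # Remap embedding keys from integer node IDs to their vocabulary strings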
    e = {vocab[int(k)]: v for (k, v) in e.items()}
    log.writeln('Writing remapped embeddings to %s...' % options.outputf)
    (fmt, mode) = pyemblib.CLI_Formats.parse(options.output_format)
    pyemblib.write(e, options.outputf, format=fmt, mode=mode, verbose=True)
    log.writeln('Done!')
                 options.unigrams_as_tfidf
                 if options.unigram_features else 'N/A'),
                ('Using Action oracle', options.action_oracle),
                ('Input predictions file', options.input_predsf),
                ('Pre-embedded mentions', options.pre_embedded),
            ]),
        ],
        title=
        "Entity linking (disambiguation) experiment using scikit-learn baseline algorithms"
    )

    ## Data loading/setup
    entity_embeds = []
    for i in range(len(options.entity_embfs)):
        f = options.entity_embfs[i]
        t_sub = log.startTimer(
            'Reading set %d of entity embeddings from %s...' % (i + 1, f))
        entity_embeds.append(pyemblib.read(f, lower_keys=True))
        log.stopTimer(t_sub,
                      message='Read %s embeddings ({0:.2f}s)\n' %
                      ('{0:,}'.format(len(entity_embeds[-1]))))

    if options.word_vocabf:
        t_sub = log.startTimer('Reading word/context vocabulary from %s...' %
                               options.word_vocabf)
        word_vocab = readVocab(options.word_vocabf)
        log.stopTimer(t_sub,
                      message='Read %s words ({0:.2f}s)\n' %
                      ('{0:,}'.format(len(word_vocab))))
    else:
        word_vocab = None


def runCrossfoldExperiment(preprocessed, preds_stream, options):
    cross_fold_metrics = []

    for i in range(len(preprocessed.splits)):
        log.writeln(
            ('\n\n{0}\n  Starting fold %d/%d\n{0}\n'.format('#' * 80)) %
            (i + 1, len(preprocessed.splits)))

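        # Assemble this fold's train/test mention sets; dev mentions are used for
        # evaluation when options.eval_on_dev is set, otherwise folded into training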
        (train_ids, dev_ids, test_ids) = preprocessed.splits[i]
        train, test = [], []
        for _id in train_ids:
            if _id in preprocessed.mentions_by_id:
                train.append(preprocessed.mentions_by_id[_id])
        for _id in dev_ids:
            if _id in preprocessed.mentions_by_id:
                if options.eval_on_dev:
                    test.append(preprocessed.mentions_by_id[_id])
                else:
                    train.append(preprocessed.mentions_by_id[_id])
        if not options.eval_on_dev:
            for _id in test_ids:
                if _id in preprocessed.mentions_by_id:
                    test.append(preprocessed.mentions_by_id[_id])

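        # Optionally build a binary bag-of-words vectorizer over the training-text vocabulary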
        if options.unigram_features:
            unigram_vocab = getTextVocabulary(train, preprocessed, options)
            unigram_vectorizer = CountVectorizer(vocabulary=unigram_vocab,
                                                 binary=True)
        else:
            unigram_vectorizer = None

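        # Featurize mentions; prepSample yields (None, None) for samples that
        # cannot be featurized, and those are skipped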
        training_features, training_labels = [], []
        for m in train:
            (feature_vector,
             label) = prepSample(m, preprocessed,
                                 preprocessed.per_fold_unigram_features[i],
                                 options)
            if feature_vector is None or label is None:
                continue
            training_features.append(feature_vector)
            training_labels.append(label)

        test_features, test_labels = [], []
        for m in test:
            (feature_vector,
             label) = prepSample(m, preprocessed,
                                 preprocessed.per_fold_unigram_features[i],
                                 options)
            if feature_vector is None or label is None:
                continue
            test_features.append(feature_vector)
            test_labels.append(label)

        log.writeln('Number of training samples: {0:,}'.format(
            len(training_labels)))
        log.writeln('Number of test samples: {0:,}\n'.format(len(test_labels)))

        if len(test_labels) == 0:
            log.writeln(
                '[WARNING] Test ids list is empty due to rounding in cross-validation splits, skipping...'
            )
            continue

        if len(set(training_labels)) == 1:
            log.writeln(
                '[WARNING] Training samples for this subset have only one label class. Skipping...'
            )
            return None

        if options.unigram_features:
            training_features = scipy.sparse.vstack(training_features)
            test_features = scipy.sparse.vstack(test_features)

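        # Optionally standardize features; with_mean=False keeps sparse matrices sparse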
        scaler = StandardScaler(with_mean=False)
        if options.normalize_features:
            training_features = scaler.fit_transform(training_features)
            test_features = scaler.transform(test_features)

        if options.classifier == Classifier.SVM:
            t = log.startTimer('Training SVM classifier...')
            classifier = sklearn.svm.SVC(kernel='linear',
                                         random_state=options.random_seed + i)
            classifier.fit(training_features, training_labels)
            log.stopTimer(t, message='Training complete in {0:.2f}s.\n')

            t = log.startTimer('Running trained SVM on test set...')
            predictions = classifier.predict(test_features)
            log.stopTimer(t, message='Complete in {0:.2f}s.\n')

        elif options.classifier == Classifier.KNN:
            t = log.startTimer('Training k-NN classifier...')
            classifier = sklearn.neighbors.KNeighborsClassifier(
                n_neighbors=5,
                #random_state=options.random_seed+i
            )
            classifier.fit(training_features, training_labels)
            log.stopTimer(t, message='Training complete in {0:.2f}s.\n')

            t = log.startTimer('Running trained k-NN on test set...')
            predictions = classifier.predict(test_features)
            log.stopTimer(t, message='Complete in {0:.2f}s.\n')

        elif options.classifier == Classifier.MLP:
            t = log.startTimer('Training MLP classifier...')
            classifier = sklearn.neural_network.MLPClassifier(
                max_iter=1000, random_state=options.random_seed + i)
            classifier.fit(training_features, training_labels)
            log.stopTimer(t, message='Training complete in {0:.2f}s.\n')

            t = log.startTimer('Running trained MLP on test set...')
            predictions = classifier.predict(test_features)
            log.stopTimer(t, message='Complete in {0:.2f}s.\n')

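        # Score this fold and optionally write per-mention predictions to preds_stream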
        metrics = SimpleNamespace()
        metrics.correct = 0
        metrics.total = 0

        for j in range(len(predictions)):
            if predictions[j] == test_labels[j]:
                metrics.correct += 1
            metrics.total += 1

            if preds_stream:
                preds_stream.write(
                    'Mention %d -- Pred: %d -> %s  Gold: %d -> %s\n' %
                    (test[j].ID, predictions[j],
                     test[j].candidates[predictions[j]], test_labels[j],
                     test[j].candidates[test_labels[j]]))

        metrics.accuracy = float(metrics.correct) / metrics.total
        log.writeln('Fold accuracy: {0:.2f} ({1:,}/{2:,})'.format(
            metrics.accuracy, metrics.correct, metrics.total))

        cross_fold_metrics.append(metrics)

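    # Aggregate per-fold results; overall accuracy is the macro-average over folds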
    overall_metrics = SimpleNamespace()
    overall_metrics.correct = 0
    overall_metrics.total = 0

    log.writeln('\n\n-- Cross-validation report --\n')
    for i in range(len(cross_fold_metrics)):
        m = cross_fold_metrics[i]
        overall_metrics.correct += m.correct
        overall_metrics.total += m.total
        log.writeln('  Fold %d -- Accuracy: %f (%d/%d)' %
                    (i + 1, m.accuracy, m.correct, m.total))

    overall_metrics.accuracy = np.mean(
        [m.accuracy for m in cross_fold_metrics])
    log.writeln('\nOverall cross-validation accuracy: %f' %
                overall_metrics.accuracy)

    return overall_metrics
        ('Extraction mode', config['ExtractionMode']),
        ('Annotation directories', config['DataDirectories']),
    ]
    if config['ExtractionMode'] == 'csv':
        settings.extend([
            ('Plaintext directory', config['PlaintextDirectory']),
            ('CSV file ID pattern', config['CSVIdentifierPattern']),
            ('Plaintext file render pattern', config['PlaintextIdentifierPattern'])
        ])
    settings.extend([
        ('Output mentions file', options.outputf),
        ('Mention map file (automatic)', options.mention_map_file),
    ])
    log.writeConfig(settings, title='Mention extraction for action classification')

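    # Extract mention samples and the mention ID -> source info map from the annotation data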
    t_sub = log.startTimer('Generating %s features.' % options.dataset)
    mentions, mention_map = getAllMentions(config, options,
        tokenizer=options.tokenizer, bert_vocab_file=options.bert_vocab_file,
        log=log)
    log.stopTimer(t_sub, 'Extracted {0:,} samples.'.format(len(mentions)))

    log.writeln('Writing mention map information to %s...' % options.mention_map_file)
    with open(options.mention_map_file, 'w') as stream:
        for (mention_ID, mention_info) in mention_map.items():
            stream.write('%d\t%s\n' % (mention_ID, mention_info))
    log.writeln('Wrote info for {0:,} mentions.\n'.format(len(mention_map)))

    t_sub = log.startTimer('Writing samples to %s...' % options.outputf, newline=False)
    mention_file.write(mentions, options.outputf)
    log.stopTimer(t_sub, message='Done ({0:.2f}s).')
            parser.error('Must supply --definitions')

        (mentionf, ) = args
        return mentionf, options

    ## Getting configuration settings
    mentionf, options = _cli()
    log.start(logfile=options.logfile)
    log.writeConfig([
        ('Mention file', mentionf),
        ('Entity definitions file', options.definitions_file),
        ('Restricting to main definitions only', options.main_only),
    ],
                    title="Adapted Lesk similarity baseline")

    t_sub = log.startTimer('Reading mentions from %s...' % mentionf)
    mentions = mention_file.read(mentionf)
    log.stopTimer(t_sub,
                  message='Read %s mentions ({0:.2f}s)\n' %
                  ('{0:,}'.format(len(mentions))))

    log.writeln('Reading definitions from %s...' % options.definitions_file)
    definitions = readCodeDefinitions(options.definitions_file,
                                      options.main_only)
    log.writeln('Read definitions for {0:,} codes.\n'.format(len(definitions)))

    if options.preds_file:
        preds_stream = open(options.preds_file, 'w')
    else:
        preds_stream = None
Example #6
            options.output_dir = os.path.dirname(options.input_f)
        return options

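    # Raise the recursion limit (the deeply nested n-gram maps built below can hit the default limit when pickled)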
    sys.setrecursionlimit(1800)

    options = args = _cli()
    log.start(options.logfile)
    log.writeConfig([
        ('Terminology file', options.input_f),
        ('Storing pickled maps to', options.output_dir),
        ('Map concepts separated by', options.sep),
        ('Removing stopword terms', options.remove_stopwords),
        ('Tokenization settings', tokenization.CLI.logOptions(options)),
    ], 'JET -- STR -> CUI file preprocessing')

    t_sub = log.startTimer('Initializing tokenizer...')
    tokenizer = tokenization.CLI.initializeTokenizer(options)
    log.stopTimer(t_sub, message='Tokenizer ready in {0:.2f}s.\n')

    t_sub = log.startTimer('Reading terminology file...')
    ngrams, entities_by_term = readTerminology(
        options.input_f,
        tokenizer,
        remove_stopwords=options.remove_stopwords,
        use_collapsed_string=options.use_collapsed_string)
    log.stopTimer(t_sub, message='Completed in {0:.2f}s.\n')

    if options.verbose:
        log.writeln('\nRead map:')
        NGramMapPrinter.prn(ngrams)
Example #7
    options = _cli()
    log.start(options.logfile)
    log.writeConfig([
        ('Plaintext corpus file', options.input_f),
        ('Pickled ngram->term map', options.terminology_pkl_f),
        ('Output annotations file', options.output_f),
        ('Tagging settings', [
            ('Number of tagging threads', options.threads),
            ('Line queue size cap',
             'unlimited' if options.maxlines <= 0 else options.maxlines),
        ]),
        ('Tokenization settings', tokenization.CLI.logOptions(options)),
    ], 'JET -- Automated corpus tagging')

    t_sub = log.startTimer('Loading pickled strings map...')
    compiled_terminology = pickleio.read(options.terminology_pkl_f)
    log.stopTimer(t_sub, message='Done in {0:.2f}s.\n')

    t_sub = log.startTimer('Initializing tokenizer...')
    tokenizer = tokenization.CLI.initializeTokenizer(options)
    log.stopTimer(t_sub, message='Tokenizer ready in {0:.2f}s.\n')

    t_sub = log.startTimer('Tagging corpus...')
    tagCorpus(
        options.input_f,
        compiled_terminology,
        options.output_f,
        tokenizer,
        options.threads,
        max_lines_in_queue=options.maxlines,
Example #8
        return embf, options

    embf, options = _cli()
    log.start(options.logfile)
    log.writeConfig([
        ('Input embedding file', embf),
        ('Input embedding file mode', options.embedding_mode),
        ('Output neighbor file', options.outputf),
        ('Ordered vocabulary file', options.vocabf),
        ('Number of nearest neighbors', options.k),
        ('Batch size', options.batch_size),
        ('Number of threads', options.threads),
        ('Partial nearest neighbors file for resuming', options.partial_neighbors_file),
    ], 'k Nearest Neighbor calculation with cosine similarity')

    t_sub = log.startTimer('Reading embeddings from %s...' % embf)
    emb = pyemblib.read(embf, mode=options.embedding_mode, errors='replace')
    log.stopTimer(t_sub, message='Read {0:,} embeddings in {1}s.\n'.format(len(emb), '{0:.2f}'))

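    # Build the node ID <-> vocab map if it doesn't exist yet, then load it either way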
    if not os.path.isfile(options.vocabf):
        log.writeln('Writing node ID <-> vocab map to %s...\n' % options.vocabf)
        writeNodeMap(emb, options.vocabf)
    else:
        log.writeln('Reading node ID <-> vocab map from %s...\n' % options.vocabf)
    node_map = readNodeMap(options.vocabf)

    # get the vocabulary in node ID order, and map index in emb_arr
    # to node IDs
    node_IDs = list(node_map.keys())
    node_IDs.sort()
    ordered_vocab = [