Example #1
def crosslinkSubentityType(mobilities, subentities, child_op, _type, log=log):
    # sort by starting position, then ending position
    sorted_mobilities = sorted(mobilities, key=lambda mob: (mob.start, mob.end))
    sorted_subentities = sorted(subentities, key=lambda act: (act.start, act.end))

    # for each action, find its containing mobility annotation
    for subent in sorted_subentities:
        i = 0
        while i < len(sorted_mobilities) and sorted_mobilities[i].start < subent.start:
            i += 1
        if i >= len(sorted_mobilities) or sorted_mobilities[i].start > subent.start:
            i -= 1

        mob = sorted_mobilities[i]
        if (mob.start > subent.start) or (mob.end < subent.end):
            log.writeln('[WARNING] Failed to map {0} to Mobility, skipping'.format(_type))
        elif (mob.text[subent.start-mob.start:subent.end-mob.start] != subent.text):
            log.writeln('[WARNING] Text mismatch in entity crosslinking: Mobility has text "{0}", {1} has text "{2}"; skipping'.format(mob.text[subent.start-mob.start:subent.end-mob.start], _type, subent.text))
        else:
            subent.mobility = mob
            child_op(mob, subent)
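
# A minimal usage sketch (not part of the original code) showing how
# crosslinkSubentityType attaches a contained child annotation to its Mobility
# span. The SimpleNamespace annotations, the stub logger, and the callback below
# are hypothetical stand-ins for the real annotation objects.
from types import SimpleNamespace

mobility = SimpleNamespace(start=10, end=37, text='walks 50 feet with a walker')
action = SimpleNamespace(start=10, end=15, text='walks', mobility=None)
stub_log = SimpleNamespace(writeln=print)  # anything with a writeln() method works

linked = []
crosslinkSubentityType(
    [mobility], [action],
    child_op=lambda mob, act: linked.append((mob, act)),
    _type='Action',
    log=stub_log,
)
assert action.mobility is mobility      # the Action is now linked to its containing Mobility
assert linked == [(mobility, action)]   # child_op fired once for the contained pair
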
def runLeskExperiment(preprocessed, definitions, preds_stream, options):
    log.writeln(('\n\n{0}\n  Starting experiment\n{0}\n'.format('#' * 80)))

    test_labels, predictions = [], []
    for m in preprocessed.mentions:
        test_labels.append(m.CUI.lower())
        predictions.append(getMostSimilar(m, definitions, default='d450'))

    metrics = SimpleNamespace()
    metrics.correct = 0
    metrics.total = 0

    for j in range(len(predictions)):
        m = preprocessed.mentions[j]

        if m.candidates[predictions[j]] == test_labels[j]:
            metrics.correct += 1
        metrics.total += 1

        if preds_stream:
            preds_stream.write(
                'Mention %d -- Pred: %d -> %s  Gold: %d -> %s\n' %
                (preprocessed.mentions[j].ID, predictions[j],
                 m.candidates[predictions[j]],
                 m.candidates.index(test_labels[j]), test_labels[j]))

    metrics.accuracy = float(metrics.correct) / metrics.total
    log.writeln('Accuracy: {0:.2f} ({1:,}/{2:,})'.format(
        metrics.accuracy, metrics.correct, metrics.total))
Example #3
def matchAnnotationAndTextFiles(data_directories, text_directory, csv_id_pattern, txt_sub_pattern, log=log):
    csv_files = {}

    csv_id_getter = re.compile(csv_id_pattern)
    for csvdir in data_directories:
        for f in os.listdir(csvdir):
            match = re.match(csv_id_getter, f)
            if match:
                _id = match.group(1)
                fpath = os.path.join(csvdir, f)
                csv_files[_id] = fpath

    paired_files = {}
    for (_id, csv_path) in csv_files.items():
        txt_path = os.path.join(
            text_directory,
            txt_sub_pattern.format(_id)
        )
        if os.path.isfile(txt_path):
            paired_files[_id] = (
                csv_path,
                txt_path
            )
        else:
            log.writeln('[WARNING] Could not find plaintext file for ID {0}'.format(_id))

    return paired_files
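
# Usage sketch (assumption, not from the original code): csv_id_pattern is a regex
# with one capture group for the document ID, and txt_sub_pattern is a format
# template that rebuilds the plaintext filename from that ID. The temporary
# directory layout below is purely illustrative.
import os
import tempfile
from types import SimpleNamespace

tmp = tempfile.mkdtemp()
open(os.path.join(tmp, '1001.csv'), 'w').close()
open(os.path.join(tmp, '1001.txt'), 'w').close()

pairs = matchAnnotationAndTextFiles(
    data_directories=[tmp],
    text_directory=tmp,
    csv_id_pattern=r'(\d+)\.csv$',
    txt_sub_pattern='{0}.txt',
    log=SimpleNamespace(writeln=print),
)
assert set(pairs.keys()) == {'1001'}  # each ID maps to its (csv_path, txt_path) pair
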
def calculateMetricsPerCode(predictions, mentions_by_ID, eval_set):
    preds_keys = set(predictions.keys())
    if len(preds_keys - eval_set) > 0:
        log.writeln(
            '[WARNING] Predictions file includes outputs for {0} samples not included in reference evaluation set\n'
            .format(len(preds_keys - eval_set)))
        input('[Enter] to continue')

    def initMetric():
        obj = SimpleNamespace()
        obj.tp = 0
        obj.fp = 0
        obj.fn = 0
        return obj

    metrics = {}
    for mention_ID in eval_set:
        results = predictions.get(mention_ID, None)
        if results is None:
            mention = mentions_by_ID[mention_ID]
            gold_ix = mention.candidates.index(mention.CUI)
            if gold_ix not in metrics:
                metrics[gold_ix] = initMetric()
            metrics[gold_ix].fn += 1
        else:
            (scores, pred_ix, gold_ix, correct) = results
            if pred_ix not in metrics:
                metrics[pred_ix] = initMetric()
            if gold_ix not in metrics:
                metrics[gold_ix] = initMetric()

            if correct:
                metrics[gold_ix].tp += 1
            else:
                metrics[pred_ix].fp += 1
                metrics[gold_ix].fn += 1

    for (ix, code_metrics) in metrics.items():
        if code_metrics.tp + code_metrics.fp > 0:
            code_metrics.precision = (float(code_metrics.tp) /
                                      (code_metrics.tp + code_metrics.fp))
        else:
            code_metrics.precision = 0
        if code_metrics.tp + code_metrics.fn > 0:
            code_metrics.recall = (float(code_metrics.tp) /
                                   (code_metrics.tp + code_metrics.fn))
        else:
            code_metrics.recall = 0
        if code_metrics.precision + code_metrics.recall > 0:
            code_metrics.f1 = (
                (2 * code_metrics.precision * code_metrics.recall) /
                (code_metrics.precision + code_metrics.recall))
        else:
            code_metrics.f1 = 0

    return metrics
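
# Illustrative input shapes for calculateMetricsPerCode (assumptions inferred from
# the code above, not from the original project): predictions maps mention ID ->
# (scores, predicted_index, gold_index, correct_flag), and mentions only need
# .candidates / .CUI for IDs that are missing from the predictions.
from types import SimpleNamespace

predictions = {
    1: ([0.9, 0.1], 0, 0, True),   # correct -> tp for candidate index 0
    2: ([0.4, 0.6], 1, 0, False),  # wrong   -> fp for index 1, fn for index 0
}
mentions_by_ID = {
    3: SimpleNamespace(candidates=['d450', 'd455'], CUI='d455'),  # unpredicted -> fn for index 1
}
per_code = calculateMetricsPerCode(predictions, mentions_by_ID, eval_set={1, 2, 3})
print(per_code[0].precision, per_code[0].recall, per_code[0].f1)  # 1.0 0.5 0.666...
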
Example #5
def KNearestNeighbors(emb_arr, node_IDs, top_k, neighbor_file, threads=2, batch_size=5, completed_neighbors=None):
    '''Compute the top_k nearest neighbors of each row in emb_arr.

    Work is split across (threads - 1) computation processes; a separate writer
    process streams results to neighbor_file as they arrive on a shared queue.
    If completed_neighbors is provided, rows whose indices appear in it are
    skipped, which allows resuming a partially completed run.
    '''
    # set up threads
    log.writeln('1 | Thread initialization')
    all_indices = list(range(len(emb_arr)))
    if completed_neighbors:
        filtered_indices = []
        for ix in all_indices:
            if ix not in completed_neighbors:
                filtered_indices.append(ix)
        all_indices = filtered_indices
        log.writeln('  >> Filtered out {0:,} completed indices'.format(len(emb_arr) - len(filtered_indices)))
        log.writeln('  >> Filtered set size: {0:,}'.format(len(all_indices)))
    index_subsets = _prepareForParallel(all_indices, threads-1, data_only=True)
    nn_q = mp.Queue()
    nn_writer = mp.Process(target=_nn_writer, args=(neighbor_file, node_IDs, nn_q))
    computers = [
        mp.Process(target=_threadedNeighbors, args=(index_subsets[i], emb_arr, batch_size, top_k, nn_q))
            for i in range(threads - 1)
    ]
    nn_writer.start()
    log.writeln('2 | Neighbor computation')
    util.parallelExecute(computers)
    nn_q.put(_SIGNALS.HALT)
    nn_writer.join()
Example #6
def baseParse(record, txtf):
    start_pos = int(record[1])
    end_pos = int(record[2])

    expected_text = record[3]
    with open(txtf, 'r') as stream:
        doc_text = stream.read()
    actual_text = doc_text[start_pos:end_pos]

    if expected_text != actual_text:
        log.writeln('[WARNING] Mis-alignment on {0} mention -- Expected "{1}"  Found "{2}"'.format(
            record[0], expected_text, actual_text
        ))

    return (start_pos, end_pos, expected_text)
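
# Usage sketch (the record layout is an assumption based on the indices used above):
# record = [mention_type, start_offset, end_offset, expected_text], and txtf is the
# plaintext file those character offsets refer to.
import tempfile

with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as f:
    f.write('Patient walks 50 feet with a walker.')
    txt_path = f.name

record = ['Mobility', '8', '13', 'walks']
assert baseParse(record, txt_path) == (8, 13, 'walks')  # offsets align, so no warning is logged
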
def buildGraph(neighbor_files, k):
    log.writeln('Building neighborhood graph...')
    graph = {}

    # construct frequency-weighted edges
    log.track(message='  >> Loaded {0}/%d neighborhood files' %
              len(neighbor_files),
              writeInterval=1)
    for neighbor_file in neighbor_files:
        neighborhoods = readNeighbors(neighbor_file, k)
        for (source, neighbors) in neighborhoods.items():
            if graph.get(source, None) is None:
                graph[source] = {}
            for nbr in neighbors:
                graph[source][nbr] = graph[source].get(nbr, 0) + 1
        log.tick()
    log.flushTracker()

    log.writeln('  >> Normalizing edge weights...')
    max_count = float(len(neighbor_files))
    for (source, neighborhood) in graph.items():
        for (nbr, freq) in neighborhood.items():
            graph[source][nbr] = freq / max_count

    log.writeln('Graph complete!')
    return graph
def experimentWrapper(mentions, entity_embeds, ctx_embeds, options,
                      preds_stream):
    preprocessed = preprocessData(mentions, entity_embeds, ctx_embeds, options)

    log.writeln('Filtering mentions for these embeddings...')
    preprocessed.mentions, skipped = filterMentions(preprocessed, options)
    # re-calculate mentions_by_id to remove filtered samples
    preprocessed.mentions_by_id = {m.ID: m for m in preprocessed.mentions}
    log.writeln(
        '  Removed {0:,} mentions with no valid features'.format(skipped))
    log.writeln('Filtered dataset size: {0:,} mentions\n'.format(
        len(preprocessed.mentions)))

    results = runCrossfoldExperiment(preprocessed, preds_stream, options)

    return results
            exit()
        return args, options

    (mentionf, predsf), options = _cli()
    log.start(logfile=options.logfile)

    log.writeConfig([
        ('Mention file', mentionf),
        ('Key remapping file', options.keymapf),
        ('Predictions file', predsf),
        ('No scores in predictions', options.no_scores),
        ('Cross-validation splits file', options.splitsf),
        ('Evaluating on development data', options.dev),
    ], 'BTRIS Mobility code-level predictions analysis')

    log.writeln('Reading mentions from %s...' % mentionf)
    mentions = mention_file.read(mentionf)
    log.writeln('Read {0:,} mentions.\n'.format(len(mentions)))

    log.writeln('Reading splits from %s...' % options.splitsf)
    splits = cross_validation.readSplits(options.splitsf)
    log.writeln('Read {0:,} splits.\n'.format(len(splits)))

    log.writeln('Compiling evaluation set...')
    eval_set = compileEvaluationSet(splits, options.dev)
    log.writeln('Evaluating on {0:,} samples.\n'.format(len(eval_set)))

    log.writeln('Parsing predictions from %s...' % predsf)
    predictions = predictions_parser.parsePredictions(
        predsf, no_scores=options.no_scores)
    log.writeln('Read {0:,} predictions.\n'.format(len(predictions)))
    def _cli():
        import optparse
        parser = optparse.OptionParser(
            usage=
            'Usage: %prog MENTIONS [options] --entities=ENTITY_FILE --ctxs=CTX_FILE',
            description=
            'Runs the LogLinearLinker model using the embeddings in ENTITY_FILE and CTX_FILE'
            ' on the mentions in MENTIONS.')
        parser.add_option(
            '--entities',
            dest='entity_embfs',
            help='comma-separated list of entity embedding files (required)')
        parser.add_option(
            '--word-vocab',
            dest='word_vocabf',
            help=
            'file listing words to load embeddings for (one per line); if unused, loads all embeddings'
        )
        parser.add_option('--ctxs',
                          dest='ctx_embf',
                          help='context embedding file (required)')
        parser.add_option(
            '--ctxs-format',
            dest='ctx_emb_fmt',
            type='choice',
            choices=[pyemblib.Mode.Binary, pyemblib.Mode.Text],
            default=pyemblib.Mode.Text,
            help='file format of embedding file (word2vec format)')
        parser.add_option(
            '--input-predictions',
            dest='input_predsf',
            help='file with previously generated scores to include as features'
        )
        parser.add_option('--predictions',
                          dest='preds_file',
                          help='file to write prediction details to')
        parser.add_option(
            '--n-fold',
            dest='n_folds',
            type='int',
            default=10,
            help='number of folds for cross validation (default: %default)')
        parser.add_option(
            '--dev-size',
            dest='dev_size',
            type='float',
            default=0.1,
            help=
            'portion of cross-validation training data to hold back for development'
            ' (default %default; must be >0 and <1)')
        parser.add_option(
            '--cross-validation-splits',
            dest='cross_validation_file',
            help=
            'path to save cross-validation splits to (generates multiple files; optional)'
        )
        parser.add_option(
            '--normalize-features',
            dest='normalize_features',
            action='store_true',
            default=False,
            help='use sklearn feature normalization (default off)')
        parser.add_option('--classifier',
                          dest='classifier',
                          type='choice',
                          choices=Classifier.tolist(),
                          default=Classifier.default(),
                          help='classification algorithm to use')
        parser.add_option(
            '--random-seed',
            dest='random_seed',
            type='int',
            default=-1,
            help='random seed for reproducibility (defaults to epoch time)')
        parser.add_option(
            '-l',
            '--logfile',
            dest='logfile',
            help='name of file to write log contents to (empty for stdout)',
            default=None)

        hyperparameters = optparse.OptionGroup(parser,
                                               'Hyperparameter options')
        hyperparameters.add_option(
            '--eval-on-dev',
            dest='eval_on_dev',
            action='store_true',
            default=False,
            help='evaluate on development data (for hyperparam tuning)')
        hyperparameters.add_option(
            '--no-ctx-embeddings',
            dest='use_ctx_embeddings',
            action='store_false',
            default=True,
            help='don\'t use context embeddings in features')
        hyperparameters.add_option(
            '--no-entities',
            dest='use_entity_embeddings',
            action='store_false',
            default=True,
            help='don\'t use entity embeddings at all in features')
        hyperparameters.add_option(
            '--full-entity-embeddings',
            dest='full_entity_embeddings',
            action='store_true',
            default=False,
            help=
            'use full entity embeddings instead of cosine similarity to context'
        )
        hyperparameters.add_option(
            '--unigram-features',
            dest='unigram_features',
            action='store_true',
            default=False,
            help='use unigram features (indicators unless --tfidf is specified)'
        )
        hyperparameters.add_option(
            '--tfidf',
            dest='unigrams_as_tfidf',
            action='store_true',
            default=False,
            help='use TF-IDF values for unigram features (w/r/t input samples as'
            ' documents; ignored if not using --unigram-features)')
        hyperparameters.add_option('--action-oracle',
                                   dest='action_oracle',
                                   action='store_true',
                                   default=False,
                                   help='use Action oracle')
        hyperparameters.add_option(
            '--pre-embedded',
            dest='pre_embedded',
            action='store_true',
            default=False,
            help='mention file is pre-embedded (overrides --unigram-features)')

        (options, args) = parser.parse_args()

        if options.random_seed < 0:
            options.random_seed = int(time.time())

        if options.logfile and not options.preds_file:
            options.preds_file = '%s.predictions' % (os.path.splitext(
                options.logfile)[0])

        now_stamp = datetime.strftime(datetime.now(), '%Y-%m-%d_%H-%M-%S')
        if options.logfile:
            options.logfile = '%s.%s' % (options.logfile, now_stamp)
        if options.preds_file:
            options.preds_file = '%s.%s' % (options.preds_file, now_stamp)

        if options.pre_embedded and options.unigram_features:
            log.writeln(
                '[WARNING] Cannot use --unigram-features together with --pre-embedded'
            )
            log.writeln('[WARNING] Disabling --unigram-features')
            options.unigram_features = False

        if options.use_entity_embeddings:
            options.entity_embfs = options.entity_embfs.split(',')
        else:
            options.entity_embfs = []

        def _bail(msg):
            import sys
            print(sys.argv)
            parser.print_help()
            print('\n' + msg)
            exit()

        if len(args) != 1:
            _bail('Must supply only MENTIONS')
        elif (options.use_entity_embeddings
              and len(options.entity_embfs) == 0):
            _bail('Must supply --entities')
        elif (options.use_ctx_embeddings and not options.ctx_embf):
            _bail('Must supply --ctxs')
        elif (options.dev_size <= 0 or options.dev_size >= 1):
            _bail('--dev-size must be in the interval (0, 1)')

        (mentionf, ) = args
        return mentionf, options
def runCrossfoldExperiment(preprocessed, preds_stream, options):
    cross_fold_metrics = []

    for i in range(len(preprocessed.splits)):
        log.writeln(
            ('\n\n{0}\n  Starting fold %d/%d\n{0}\n'.format('#' * 80)) %
            (i + 1, len(preprocessed.splits)))

        (train_ids, dev_ids, test_ids) = preprocessed.splits[i]
        train, test = [], []
        for _id in train_ids:
            if _id in preprocessed.mentions_by_id:
                train.append(preprocessed.mentions_by_id[_id])
        for _id in dev_ids:
            if _id in preprocessed.mentions_by_id:
                if options.eval_on_dev:
                    test.append(preprocessed.mentions_by_id[_id])
                else:
                    train.append(preprocessed.mentions_by_id[_id])
        if not options.eval_on_dev:
            for _id in test_ids:
                if _id in preprocessed.mentions_by_id:
                    test.append(preprocessed.mentions_by_id[_id])

        if options.unigram_features:
            unigram_vocab = getTextVocabulary(train, preprocessed, options)
            unigram_vectorizer = CountVectorizer(vocabulary=unigram_vocab,
                                                 binary=True)
        else:
            unigram_vectorizer = None

        training_features, training_labels = [], []
        for m in train:
            (feature_vector,
             label) = prepSample(m, preprocessed,
                                 preprocessed.per_fold_unigram_features[i],
                                 options)
            if feature_vector is None or label is None:
                continue
            training_features.append(feature_vector)
            training_labels.append(label)

        test_features, test_labels = [], []
        for m in test:
            (feature_vector,
             label) = prepSample(m, preprocessed,
                                 preprocessed.per_fold_unigram_features[i],
                                 options)
            if feature_vector is None or label is None:
                continue
            test_features.append(feature_vector)
            test_labels.append(label)

        log.writeln('Number of training samples: {0:,}'.format(
            len(training_labels)))
        log.writeln('Number of test samples: {0:,}\n'.format(len(test_labels)))

        if len(test_labels) == 0:
            log.writeln(
                '[WARNING] Test ids list is empty due to rounding in cross-validation splits, skipping...'
            )
            continue

        if len(set(training_labels)) == 1:
            log.writeln(
                '[WARNING] Training samples for this subset have only one label class. Skipping...'
            )
            continue

        if options.unigram_features:
            training_features = scipy.sparse.vstack(training_features)
            test_features = scipy.sparse.vstack(test_features)

        scaler = StandardScaler(with_mean=False)
        if options.normalize_features:
            training_features = scaler.fit_transform(training_features)
            test_features = scaler.transform(test_features)

        if options.classifier == Classifier.SVM:
            t = log.startTimer('Training SVM classifier...')
            classifier = sklearn.svm.SVC(kernel='linear',
                                         random_state=options.random_seed + i)
            classifier.fit(training_features, training_labels)
            log.stopTimer(t, message='Training complete in {0:.2f}s.\n')

            t = log.startTimer('Running trained SVM on test set...')
            predictions = classifier.predict(test_features)
            log.stopTimer(t, message='Complete in {0:.2f}s.\n')

        elif options.classifier == Classifier.KNN:
            t = log.startTimer('Training k-NN classifier...')
            classifier = sklearn.neighbors.KNeighborsClassifier(
                n_neighbors=5,
                #random_state=options.random_seed+i
            )
            classifier.fit(training_features, training_labels)
            log.stopTimer(t, message='Training complete in {0:.2f}s.\n')

            t = log.startTimer('Running trained k-NN on test set...')
            predictions = classifier.predict(test_features)
            log.stopTimer(t, message='Complete in {0:.2f}s.\n')

        elif options.classifier == Classifier.MLP:
            t = log.startTimer('Training MLP classifier...')
            classifier = sklearn.neural_network.MLPClassifier(
                max_iter=1000, random_state=options.random_seed + i)
            classifier.fit(training_features, training_labels)
            log.stopTimer(t, message='Training complete in {0:.2f}s.\n')

            t = log.startTimer('Running trained MLP on test set...')
            predictions = classifier.predict(test_features)
            log.stopTimer(t, message='Complete in {0:.2f}s.\n')

        metrics = SimpleNamespace()
        metrics.correct = 0
        metrics.total = 0

        for j in range(len(predictions)):
            if predictions[j] == test_labels[j]:
                metrics.correct += 1
            metrics.total += 1

            if preds_stream:
                preds_stream.write(
                    'Mention %d -- Pred: %d -> %s  Gold: %d -> %s\n' %
                    (test[j].ID, predictions[j],
                     test[j].candidates[predictions[j]], test_labels[j],
                     test[j].candidates[test_labels[j]]))

        metrics.accuracy = float(metrics.correct) / metrics.total
        log.writeln('Fold accuracy: {0:.2f} ({1:,}/{2:,})'.format(
            metrics.accuracy, metrics.correct, metrics.total))

        cross_fold_metrics.append(metrics)

    overall_metrics = SimpleNamespace()
    overall_metrics.correct = 0
    overall_metrics.total = 0

    log.writeln('\n\n-- Cross-validation report --\n')
    for i in range(len(cross_fold_metrics)):
        m = cross_fold_metrics[i]
        overall_metrics.correct += m.correct
        overall_metrics.total += m.total
        log.writeln('  Fold %d -- Accuracy: %f (%d/%d)' %
                    (i + 1, m.accuracy, m.correct, m.total))

    overall_metrics.accuracy = np.mean(
        [m.accuracy for m in cross_fold_metrics])
    log.writeln('\nOverall cross-validation accuracy: %f' %
                overall_metrics.accuracy)

    return overall_metrics
    ]
    if config['ExtractionMode'] == 'csv':
        settings.extend([
            ('Plaintext directory', config['PlaintextDirectory']),
            ('CSV file ID pattern', config['CSVIdentifierPattern']),
            ('Plaintext file render pattern', config['PlaintextIdentifierPattern'])
        ])
    settings.extend([
        ('Output mentions file', options.outputf),
        ('Mention map file (automatic)', options.mention_map_file),
    ])
    log.writeConfig(settings, title='Mention extraction for action classification')

    t_sub = log.startTimer('Generating %s features.' % options.dataset)
    mentions, mention_map = getAllMentions(config, options,
        tokenizer=options.tokenizer, bert_vocab_file=options.bert_vocab_file,
        log=log)
    log.stopTimer(t_sub, 'Extracted {0:,} samples.'.format(len(mentions)))

    log.writeln('Writing mention map information to %s...' % options.mention_map_file)
    with open(options.mention_map_file, 'w') as stream:
        for (mention_ID, mention_info) in mention_map.items():
            stream.write('%d\t%s\n' % (mention_ID, mention_info))
    log.writeln('Wrote info for {0:,} mentions.\n'.format(len(mention_map)))

    t_sub = log.startTimer('Writing samples to %s...' % options.outputf, newline=False)
    mention_file.write(mentions, options.outputf)
    log.stopTimer(t_sub, message='Done ({0:.2f}s).')

    log.stop()
Example #13
            parser.error('Must provide --keys')
        return options

    options = _cli()
    log.start(options.logfile)
    log.writeConfig([
        ('HDF5 embeddings', options.input_f),
        ('HDF5 layer',
         ('Average' if options.layer == AVERAGE_LAYERS else options.layer)),
        ('Per-row keys', options.key_f),
        ('Mentions file', options.mentions_f),
        ('Using Action oracle', options.action_oracle),
        ('Output embedded mentions file', options.output_f),
    ], 'Embedded mentions file generation with pre-generated HDF5 features')

    log.writeln('Reading keys from %s...' % options.key_f)
    keys = readKeys(options.key_f)
    log.writeln('Read {0:,} keys.\n'.format(len(keys)))

    log.writeln('Reading textual mentions from %s...' % options.mentions_f)
    mentions = mention_file.read(options.mentions_f)
    mentions_by_id = {m.ID: m for m in mentions}
    log.writeln('Read {0:,} mentions.\n'.format(len(mentions)))

    log.writeln('Generating embedded mentions from HDF5 file %s...' %
                options.input_f)
    new_mentions = collapseMentionEmbeddings(options.input_f, keys,
                                             options.layer, mentions_by_id,
                                             options.action_oracle)
    log.writeln('Generated {0:,} embedded mentions.\n'.format(
        len(new_mentions)))
    mentionf, options = _cli()
    log.start(logfile=options.logfile)
    log.writeConfig([
        ('Mention file', mentionf),
        ('Entity definitions file', options.definitions_file),
        ('Restricting to main definitions only', options.main_only),
    ],
                    title="Adapted Lesk similarity baseline")

    t_sub = log.startTimer('Reading mentions from %s...' % mentionf)
    mentions = mention_file.read(mentionf)
    log.stopTimer(t_sub,
                  message='Read %s mentions ({0:.2f}s)\n' %
                  ('{0:,}'.format(len(mentions))))

    log.writeln('Reading definitions from %s...' % options.definitions_file)
    definitions = readCodeDefinitions(options.definitions_file,
                                      options.main_only)
    log.writeln('Read definitions for {0:,} codes.\n'.format(len(definitions)))

    if options.preds_file:
        preds_stream = open(options.preds_file, 'w')
    else:
        preds_stream = None

    results = experimentWrapper(mentions, definitions, options, preds_stream)

    if options.preds_file:
        preds_stream.close()

    log.stop()
            options.bert_dir, '%s.compiled_output.predictions' % options.model)
        options.logfile = '%s.log' % options.output_f

        return options

    options = _cli()
    log.start(options.logfile)

    log.writeConfig([
        ('Mentions file', options.mentions_f),
        ('BERT baseline root directory', options.bert_dir),
        ('Model configuration', options.model),
        ('Output file', options.output_f),
    ], 'BERT baseline results compilation')

    log.writeln('Reading mentions from %s...' % options.mentions_f)
    mentions = mention_file.read(options.mentions_f)
    mentions_by_ID = {m.ID: m for m in mentions}
    log.writeln('Read {0:,} mentions.\n'.format(len(mentions)))

    fold_dirs = glob.glob(os.path.join(options.bert_dir, 'fold-*'))
    log.writeln('Found {0} folds in {1}.\n'.format(len(fold_dirs),
                                                   options.bert_dir))

    with open(options.output_f, 'w') as stream:
        fold_dirs = sorted(fold_dirs)
        for i in range(len(fold_dirs)):
            log.writeln('Checking fold {0}/{1}'.format(i + 1, len(fold_dirs)))
            log.indent()

            test_f = os.path.join(fold_dirs[i], 'test.tsv')
def crossValidationSplits(dataset,
                          n_folds,
                          dev_size,
                          persistent_path=None,
                          random_seed=1,
                          log=log):
    if persistent_path and os.path.isfile('%s.fold-0.train' % persistent_path):
        log.writeln('Reading pre-existing cross validation splits from %s.' %
                    persistent_path)
        splits = readSplits(persistent_path, n_folds, id_cast=int)
    else:
        log.writeln('Generating cross-validation splits...')
        np.random.seed(random_seed)

        ids_by_class, classes = stratifyByClass(dataset)

        total_size = 0
        for (lbl, ids) in ids_by_class.items():
            total_size += len(ids)
        log.writeln('  Dataset size: {0:,}'.format(total_size))
        log.writeln('  Number of classes: {0:,}'.format(len(classes)))

        # shuffle the IDs within each class
        for _class in classes:
            np.random.shuffle(ids_by_class[_class])

        # figure out how many points of each class per fold
        fold_size_by_class, dev_size_by_class = getFoldAndDevSizeByClass(
            ids_by_class, n_folds, dev_size)

        labeled_splits, id_splits = [], []
        for i in range(n_folds):
            train_by_class = {}
            for _class in classes:
                train_by_class[_class] = []

            for j in range(n_folds):
                fold_by_class = {}
                for _class in classes:
                    fold_size = fold_size_by_class[_class]
                    if j < (n_folds - 1):
                        fold_by_class[_class] = ids_by_class[_class][j * fold_size:(j + 1) * fold_size]
                    else:
                        fold_by_class[_class] = ids_by_class[_class][j * fold_size:]

                # pull test
                if j == i:
                    test_by_class = fold_by_class.copy()
                # pull dev (portion)
                elif j == ((i + 1) % n_folds):
                    dev_by_class = {}
                    for (_class, subset) in fold_by_class.items():
                        dev_by_class[_class] = subset[:dev_size_by_class[_class]]
                        train_by_class[_class].extend(subset[dev_size_by_class[_class]:])
                # everything else goes to training
                else:
                    for (_class, subset) in fold_by_class.items():
                        train_by_class[_class].extend(subset)

            # collapse train, dev, test to flat ID lists
            lbl_train, id_train = collapseFromByClass(train_by_class)
            lbl_dev, id_dev = collapseFromByClass(dev_by_class)
            lbl_test, id_test = collapseFromByClass(test_by_class)

            labeled_splits.append((lbl_train, lbl_dev, lbl_test))
            id_splits.append((id_train, id_dev, id_test))

            log.writeln(
                '  Fold {0} -- Train: {1:,}  Dev: {2:,}  Test: {3:,}'.format(
                    i + 1, len(id_train), len(id_dev), len(id_test)))

        if persistent_path:
            log.writeln('Writing cross validation splits to %s.' %
                        persistent_path)
            writeSplits(labeled_splits, persistent_path)

        splits = id_splits
    log.writeln()

    return splits
Example #17
def extractAllEntities(data_directories,
                       log=log,
                       with_full_text=False,
                       errors='strict',
                       by_document=False,
                       polarity_type=int):
    '''
    Extract all Mobility, Action, Assistance, and Quantification entities from
    XML-formatted annotation files.

    @parameters
      data_directories :: list of directories containing .xml annotation files
      log              :: logging object to write to (defaults to dng_logger.log)
      with_full_text   :: include the full document text in the "full_text" field of each object
      errors           :: error-handling mode passed through to the XML extractor
      by_document      :: if True, return per-file document objects instead of flat entity lists
      polarity_type    :: type to use for entity polarity values (passed to the extractor)

    @returns
      if by_document is False, a 4-tuple of lists:
        mobilities      :: list of Mobility objects
        actions         :: list of Action objects
        assistances     :: list of Assistance objects
        quantifications :: list of Quantification objects
      if by_document is True:
        documents       :: list of per-file document objects
    '''
    mobilities = []
    actions = []
    assistances = []
    quantifications = []

    documents = []

    extractor = XMLEntityExtractor()

    for dir_path in data_directories:
        files = os.listdir(dir_path)

        log.writeln('Extracting data from %s...' % dir_path)
        log.track(
            message=
            '  >> Extracted entities from {0:,}/{1:,} files ({2:,} entities)',
            writeInterval=1)

        for f in files:
            fpath = os.path.join(dir_path, f)
            doc = extractor.extractMentions(fpath,
                                            with_full_text=with_full_text,
                                            errors=errors,
                                            polarity_type=polarity_type,
                                            as_document=True)

            doc.file_path = fpath
            doc.ID = f

            for m in doc.mobilities:
                m.file_ID = f
                mobilities.append(m)
            for m in doc.actions:
                m.file_ID = f
                actions.append(m)
            for m in doc.assistances:
                m.file_ID = f
                assistances.append(m)
            for m in doc.quantifications:
                m.file_ID = f
                quantifications.append(m)

            documents.append(doc)

            log.tick(
                len(files),
                len(mobilities) + len(actions) + len(assistances) +
                len(quantifications))
        log.flushTracker(
            len(files),
            len(mobilities) + len(actions) + len(assistances) +
            len(quantifications))

    if by_document:
        return documents
    else:
        return (mobilities, actions, assistances, quantifications)
Example #18
        ('Input embedding file', embf),
        ('Input embedding file mode', options.embedding_mode),
        ('Output neighbor file', options.outputf),
        ('Ordered vocabulary file', options.vocabf),
        ('Number of nearest neighbors', options.k),
        ('Batch size', options.batch_size),
        ('Number of threads', options.threads),
        ('Partial nearest neighbors file for resuming', options.partial_neighbors_file),
    ], 'k Nearest Neighbor calculation with cosine similarity')

    t_sub = log.startTimer('Reading embeddings from %s...' % embf)
    emb = pyemblib.read(embf, mode=options.embedding_mode, errors='replace')
    log.stopTimer(t_sub, message='Read {0:,} embeddings in {1}s.\n'.format(len(emb), '{0:.2f}'))

    if not os.path.isfile(options.vocabf):
        log.writeln('Writing node ID <-> vocab map to %s...\n' % options.vocabf)
        writeNodeMap(emb, options.vocabf)
    else:
        log.writeln('Reading node ID <-> vocab map from %s...\n' % options.vocabf)
    node_map = readNodeMap(options.vocabf)

    # get the vocabulary in node ID order, and map index in emb_arr
    # to node IDs
    node_IDs = list(node_map.keys())
    node_IDs.sort()
    ordered_vocab = [
        node_map[node_ID]
            for node_ID in node_IDs
    ]

    emb_arr = np.array([
        return options

    options = _cli()
    log.start(options.logfile)
    log.writeConfig([
        ('Mentions file', options.mentions_f),
        ('Mention map file', options.mention_map_f),
        ('Number of folds', options.num_folds),
        ('Dev set size', options.dev_size),
        ('Document ID filter list', options.filter_doc_ID_f),
        ('Random seed', options.random_seed),
        ('Output file', options.output_f),
    ], 'Cross-validation splits generation')

    log.writeln('Loading mentions from %s...' % options.mentions_f)
    mentions = mention_file.read(options.mentions_f)
    log.writeln('Read {0:,} mentions.\n'.format(len(mentions)))

    if options.filter_doc_ID_f:
        log.writeln('Reading mention map from %s...' % options.mention_map_f)
        mention_map = mention_map_lib.load(options.mention_map_f)
        log.writeln('Read mapping info for {0:,} mentions.\n'.format(
            len(mention_map)))

        log.writeln('Reading doc ID filter list from %s...' %
                    options.filter_doc_ID_f)
        filter_doc_IDs = readFilterDocIDSet(options.filter_doc_ID_f)
        filtered_mentions = []
        for m in mentions:
            if mention_map[m.ID] in filter_doc_IDs:
Example #20
    ], 'JET -- STR -> CUI file preprocessing')

    t_sub = log.startTimer('Initializing tokenizer...')
    tokenizer = tokenization.CLI.initializeTokenizer(options)
    log.stopTimer(t_sub, message='Tokenizer ready in {0:.2f}s.\n')

    t_sub = log.startTimer('Reading terminology file...')
    ngrams, entities_by_term = readTerminology(
        options.input_f,
        tokenizer,
        remove_stopwords=options.remove_stopwords,
        use_collapsed_string=options.use_collapsed_string)
    log.stopTimer(t_sub, message='Completed in {0:.2f}s.\n')

    if options.verbose:
        log.writeln('\nRead map:')
        NGramMapPrinter.prn(ngrams)

        log.writeln('\nTerm ID-Entity mapping:')
        for term_ID in entities_by_term.keys():
            log.writeln('  %s -> %s' % (term_ID, entities_by_term[term_ID]))

    picklebase = os.path.join(
        options.output_dir,
        os.path.splitext(os.path.basename(options.input_f))[0])

    term_to_string_map_f = '%s.term_to_string_map.txt' % picklebase
    t_sub = log.startTimer('Writing term ID-string map to %s...' %
                           term_to_string_map_f)
    writeTermStringMap(ngrams, term_to_string_map_f)
    log.stopTimer(t_sub)
            default=10)
        parser.add_option(
            '-l',
            '--logfile',
            dest='logfile',
            help='name of file to write log contents to (empty for stdout)',
            default=None)
        (options, args) = parser.parse_args()
        if len(args) == 0:
            parser.print_help()
            exit()
        neighbor_files = args
        return neighbor_files, options

    neighbor_files, options = _cli()
    log.start(options.logfile)
    log.writeConfig([
        *[('Neighborhood sample file %d' % (i + 1), neighbor_files[i])
          for i in range(len(neighbor_files))],
        ('Output file', options.outputf),
        ('Number of neighbors to include in edge construction', options.k),
    ], 'Nearest neighborhood graph generation')

    graph = buildGraph(neighbor_files, options.k)

    log.write('Writing graph to %s...' % options.outputf)
    writeGraph(graph, options.outputf)
    log.writeln('Done!')

    log.stop()
Example #22
        return options

    options = _cli()

    log.start(options.logfile)
    log.writeConfig([
        ('Input embeddings', options.inputf),
        ('Vocabulary file', options.vocabf),
        ('Output embeddings', options.outputf),
        ('Output embeddings format', options.output_format),
    ])

    t_sub = log.startTimer('Reading node2vec embeddings from %s...' % options.inputf)
    e = pyemblib.read(options.inputf,
                      format=pyemblib.Format.Word2Vec,
                      mode=pyemblib.Mode.Text)
    log.stopTimer(
        t_sub,
        message='Read {0:,} embeddings in {1}s.\n'.format(len(e), '{0:.2f}'))

    log.writeln('Reading vocabulary mapping from %s...' % options.vocabf)
    vocab = readVocab(options.vocabf)
    log.writeln('Read {0:,} vocab mappings.\n'.format(len(vocab)))

    e = {vocab[int(k)]: v for (k, v) in e.items()}
    log.writeln('Writing remapped embeddings to %s...' % options.outputf)
    (fmt, mode) = pyemblib.CLI_Formats.parse(options.output_format)
    pyemblib.write(e, options.outputf, format=fmt, mode=mode, verbose=True)
    log.writeln('Done!')

    log.stop()
        if not options.bert_f:
            parser.error('Must provide --bert-output')
        elif not options.overlaps_f:
            parser.error('Must provide --overlaps')
        elif not options.output_f:
            parser.error('Must provide --output')

        return options
    options = _cli()
    log.start(options.logfile)
    log.writeConfig([
        ('BERT output', options.bert_f),
        ('Overlaps file', options.overlaps_f),
        ('Output file', options.output_f),
    ], 'BERT embedding recombination')

    log.writeln('Reading overlaps from %s...' % options.overlaps_f)
    overlaps = readOverlaps(options.overlaps_f)
    log.writeln('Read overlaps for {0:,} lines.\n'.format(len(overlaps)))

    log.writeln('Streaming BERT output conversion...')
    streamingBERTConvert(
        options.bert_f,
        overlaps,
        options.output_f,
        options.tokenized_f
    )
    log.writeln('Done.')

    log.stop()
Example #24
        (options, args) = parser.parse_args()
        if not options.corpus_f:
            parser.print_help()
            parser.error('Must provide --corpus')
        if not options.annotations_f:
            parser.print_help()
            parser.error('Must provide --annotations')
        if not options.term_strings_f:
            parser.print_help()
            parser.error('Must provide --term-strings')
        return options

    options = _cli()
    log.start(options.logfile)
    log.writeConfig([
        ('Corpus file', options.corpus_f),
        ('Annotations file', options.annotations_f),
        ('Term strings file', options.term_strings_f),
    ], 'JET annotation validation')

    log.writeln('Reading term->strings mapping from %s...' %
                options.term_strings_f)
    term_map = readTermMap(options.term_strings_f)
    log.writeln('Mapped strings for {0:,} terms.\n'.format(len(term_map)))

    log.writeln('Validating corpus annotations...')
    validate(options.corpus_f, options.annotations_f, term_map)
    log.writeln('Done!\n')

    log.stop()
        ('Input file', options.input_file),
        ('Output settings', [
            ('Base path', options.output_file),
            ('Tokenized file', output_tokens),
            ('Subsequences file', output_subsequences),
            ('Overlaps file', output_overlaps),
            ('Log file', output_log),
        ]),
        ('Max subsequence length', options.max_sequence_length),
        ('Overlap fraction', options.overlap),
        ('BERT vocab file', options.vocab_file)
    ])

    # reserve two positions per subsequence, presumably for BERT's [CLS] and [SEP] special tokens
    options.max_sequence_length -= 2

    log.writeln('Tokenizing input file %s...' % options.input_file)
    tokenizer = bert.tokenization.FullTokenizer(
        vocab_file=options.vocab_file,
        do_lower_case=True
    )
    num_lines = 0
    with open(options.input_file, 'r') as input_stream, \
         open(output_tokens, 'w') as output_stream:
        for line in input_stream:
            tokens = tokenizer.tokenize(line.strip())
            output_stream.write('%s\n' % (' '.join(tokens)))
            num_lines += 1
    log.writeln('Wrote {0:,} tokenized lines.\n'.format(num_lines))

    log.writeln('Reading tokenized lines from %s...' % output_tokens)
    tokenized_lines = []