Example #1
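    # Fragment of a CLI script: the tail of its _cli() argument handling, followed by
    # config loading, run-settings logging via configlogger.writeConfig, and parsing of
    # an analogy dataset with parsers.parse.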
        if len(args) != 1 or not options.dataset:
            parser.print_help()
            exit()
        return args, options
    (vocabf,), options = _cli()
    log.start(logfile=options.logfile)

    config = configparser.ConfigParser()
    config.read(options.config)

    analogy_file = datasets.getpath(options.dataset, config, eval_mode.ALL_INFO)

    configlogger.writeConfig(log, settings=[
        ('Config file', options.config),
        ('Dataset', options.dataset),
        ('Path to dataset', analogy_file),
        ('Lowercasing analogies', options.to_lower),
        ('Output vocab file', vocabf),
    ], title='Vocabulary extraction from analogy dataset')

    log.writeln('Reading %s analogies from %s...' % (options.dataset, analogy_file))
    analogies = parsers.parse(
        analogy_file,
        options.dataset,
        eval_mode.ALL_INFO,
        data_mode.String,
        to_lower=options.to_lower
    )
    log.writeln('Read {0:,} analogies in {1:,} relations.\n'.format(
        sum([len(anlg_set) for anlg_set in analogies.values()]),
        len(analogies)
Example #2
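    # Fragment: end of the _cli() option parser (neighbor count k, logfile), then
    # settings logging, k-nearest-neighbor graph construction with buildGraph, and
    # output via writeGraph.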
        parser.add_option('-k', dest='k',
                help='number of neighbors to use for edge construction (default: %default)',
                type='int', default=10)
        parser.add_option('-l', '--logfile', dest='logfile',
                help='name of file to write log contents to (empty for stdout)',
                default=None)
        (options, args) = parser.parse_args()
        if len(args) == 0:
            parser.print_help()
            exit()
        neighbor_files = args
        return neighbor_files, options
    neighbor_files, options = _cli()
    log.start(logfile=options.logfile)
    configlogger.writeConfig(log, [
        *[
            ('Neighborhood sample file %d' % (i+1), neighbor_files[i])
                for i in range(len(neighbor_files))
        ],
        ('Output file', options.outputf),
        ('Number of neighbors to include in edge construction', options.k),
    ], 'Nearest neighborhood graph generation')

    graph = buildGraph(neighbor_files, options.k)

    log.write('Writing graph to %s...' % options.outputf)
    writeGraph(graph, options.outputf)
    log.writeln('Done!')

    log.stop()
Example #3
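    # Fragment: CLI validation, settings logging, reading node2vec embeddings with
    # pyemblib, and remapping embedding keys through a node-ID -> vocab mapping.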
            dest='logfile',
            help='name of file to write log contents to (empty for stdout)',
            default=None)
        (options, args) = parser.parse_args()
        if (not options.inputf) or (not options.outputf) or (
                not options.vocabf):
            parser.print_help()
            exit()
        return options

    options = _cli()

    log.start(logfile=options.logfile)
    configlogger.writeConfig(log, [
        ('Input embeddings', options.inputf),
        ('Vocabulary file', options.vocabf),
        ('Output embeddings', options.outputf),
        ('Output embeddings format', options.output_format),
    ])

    t_sub = log.startTimer('Reading node2vec embeddings from %s...' % options.inputf)
    e = pyemblib.read(options.inputf,
                      format=pyemblib.Format.Word2Vec,
                      mode=pyemblib.Mode.Text)
    log.stopTimer(t_sub,
                  message='Read {0:,} embeddings in {1}s.\n'.format(
                      len(e), '{0:.2f}'))

    log.writeln('Reading vocabulary mapping from %s...' % options.vocabf)
    vocab = readVocab(options.vocabf)
    log.writeln('Read {0:,} vocab mappings.\n'.format(len(vocab)))

    e = {vocab[int(k)]: v for (k, v) in e.items()}
Example #4
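    # Fragment: writes the configuration of a DNN embedding mapping experiment to a
    # .config file via configlogger.writeConfig, then starts cross-fold training of
    # the mapper with crossfoldTrain.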
                % pivot)
        else:
            validated_pivots.add(pivot)

    # write the experimental configuration
    configlogger.writeConfig('%s.config' % options.checkpointf,
                             title='DNN embedding mapping experiment',
                             settings=[
                                 ('Source embeddings', options.src_embf),
                                 ('Source embedding dimension', src_embs.size),
                                 ('Target embeddings', options.trg_embf),
                                 ('Target embedding dimension', trg_embs.size),
                                 ('Output file', options.outf),
                                 ('Pivot file', options.pivotf),
                                 ('Number of validated pivots',
                                  len(validated_pivots)),
                                 ('Checkpoint file', options.checkpointf),
                                 ('Model settings',
                                  OrderedDict([
                                      ('Random seed', options.random_seed),
                                      ('Number of layers', options.num_layers),
                                      ('Activation', options.activation),
                                      ('Number of folds', options.num_folds),
                                      ('Batch size', options.batch_size),
                                  ]))
                             ])

    log.writeln('Training manifold mapper...')
    mapped_embs = crossfoldTrain(src_embs,
                                 trg_embs,
                                 validated_pivots,
Example #5
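    # Fragment: CLI handling for an analogy completion task, settings logging,
    # embedding loading through the em helper module, and evaluation with evaluate().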
            options.setting = settings.SINGLE_ANSWER

        em.validateCLIOptions(options)

        #return (analogy_file, results_dir, options)
        return (analogy_file, options)

    (analogy_file, options) = _cli()
    log.start(logfile=options.logfile, stdout_also=True)

    configlogger.writeConfig(log,
                             settings=[
                                 ('Dataset', options.dataset),
                                 ('Dataset file', analogy_file),
                                 ('Analogy setting',
                                  settings.name(options.setting)),
                                 ('Analogy type', options.anlg_type),
                                 ('Method', Mode.name(options.analogy_method)),
                                 ('Embedding settings',
                                  em.logCLIOptions(options)),
                             ],
                             title='Analogy completion task')

    separator = '\t' if options.tab_sep else ' '
    emb_wrapper = em.getEmbeddings(options, log=log, separator=separator)

    results = evaluate(emb_wrapper,
                       analogy_file,
                       options.dataset,
                       options.setting,
                       options.anlg_type,
                       options.analogy_method,
Example #6
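    # Fragment: reads a config file with configparser, logs the SemCor paths in use,
    # and pre-processes the SemCor XML to collect the set of lemmas.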
            default=None)
        (options, args) = parser.parse_args()
        if len(args) != 1:
            parser.print_help()
            exit()
        return args, options

    (configf, ), options = _cli()
    config = configparser.ConfigParser()
    config.read(configf)

    log.start(logfile=options.logfile)
    configlogger.writeConfig(log, [
        ('SemCor', [
            ('XML', config['SemCor']['XML']),
            ('Labels', config['SemCor']['Labels']),
        ]),
        ('Output file', config['SemCor']['Lemmas']),
    ])

    t_sub = log.startTimer('Pre-processing SemCor text from %s...' %
                           config['SemCor']['XML'])
    (sentences_words, sentences_instances) = wsd_parser.processSentences(
        config['SemCor']['XML'], get_lemmas=True)
    log.stopTimer(t_sub,
                  message='Read {0:,} sentences in {1}s.\n'.format(
                      len(sentences_words), '{0:.2f}'))

    log.writeln('Collecting set of SemCor lemmas...')
    lemmas = set()
    for sentence_instances in sentences_instances:
Example #7
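    # Fragment: settings logging for a similarity/relatedness experiment, then
    # embedding loading in either single-method or combination ("COMBO") mode.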
        if len(args) != 0:
            parser.print_help()
            exit()

        return options

    options = _cli()
    log.start(logfile=options.logfile, stdout_also=True)

    configlogger.writeConfig(output=log, settings=[
        ('Dataset', options.mode),
        ('Using skip indices', ('None' if not options.skips_f else options.skips_f)),
        ('Embedding settings', em.logCLIOptions(options)),
        ('Scoring settings', OrderedDict([
            ('Combination of entity and string', options.use_combo),
            ('Cross comparison of entity/string', options.use_cross),
            ('Cross comparison only', options.cross_only),
            ('Using mean of scores instead of sum', options.use_mean)
        ])),
    ], title='Similarity/Relatedness experiment')

    if not options.use_combo:
        log.writeln('\nMode: %s   Method: %s\n' % (options.mode, em.name(options.repr_method)))
        separator = '\t' if options.tab_sep else ' '
        emb_wrapper = em.getEmbeddings(options, log=log, separator=separator)
    else:
        log.writeln('\nMode: %s   Method: COMBO\n' % options.mode)
        ent_embf, word_embf = options.ent_embf, options.word_embf
        separator = '\t' if options.tab_sep else ' '
Example #8
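    # Fragment: settings logging for a k-nearest-neighbor calculation over an embedding
    # file, reading the embeddings with pyemblib, and writing a node ID <-> vocab map.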
            default=None)
        (options, args) = parser.parse_args()
        if len(args) != 1:
            parser.print_help()
            exit()
        (embf, ) = args
        return embf, options

    embf, options = _cli()
    log.start(logfile=options.logfile)
    configlogger.writeConfig(
        log, [
            ('Input embedding file', embf),
            ('Input embedding file mode', options.embedding_mode),
            ('Output neighbor file', options.outputf),
            ('Ordered vocabulary file', options.vocabf),
            ('Number of nearest neighbors', options.k),
            ('Batch size', options.batch_size),
            ('Number of threads', options.threads),
            ('Partial nearest neighbors file for resuming',
             options.partial_neighbors_file),
        ], 'k Nearest Neighbor calculation with cosine similarity')

    t_sub = log.startTimer('Reading embeddings from %s...' % embf)
    emb = pyemblib.read(embf, mode=options.embedding_mode, errors='replace')
    log.stopTimer(t_sub,
                  message='Read {0:,} embeddings in {1}s.\n'.format(
                      len(emb), '{0:.2f}'))

    if not os.path.isfile(options.vocabf):
        log.writeln('Writing node ID <-> vocab map to %s...\n' %
                    options.vocabf)
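Example #9
    # Fragment: CLI tail and settings logging for an ELMo WSD baselines replication,
    # then reading mentions with mention_file.read.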
            exit()
        (mentionf, ) = args
        return mentionf, options

    ## Getting configuration settings
    mentionf, options = _cli()
    log.start(logfile=options.logfile, stdout_also=True)
    configlogger.writeConfig(log, [
        ('Mention file', mentionf),
        ('Mention map file', options.mention_mapf),
        ('WordNet first sense baseline settings', [
            ('Output predictions file',
             options.wordnet_baseline_eval_predictions),
        ]),
        ('ELMo baseline settings', [
            ('Output predictions file',
             options.elmo_baseline_eval_predictions),
            ('SemCor embeddings', options.semcor_embf),
            ('Training lemmas file', options.training_lemmasf),
            ('Pre-calculated WN first sense backoff predictions',
             options.wordnet_baseline_input_predictions),
        ]),
    ],
                             title="ELMo WSD baselines replication")

    t_sub = log.startTimer('Reading mentions from %s...' % mentionf,
                           newline=False)
    mentions = mention_file.read(mentionf)
    log.stopTimer(t_sub, message='Read %d mentions ({0:.2f}s)' % len(mentions))

    log.writeln('Reading mention dataset data from %s...' %
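Example #10
    # Fragment: logfile option, settings logging for mention extraction for entity
    # linking, WSD feature generation, and writing mentions with mention_file.write.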
            '--logfile',
            dest='logfile',
            help='name of file to write log contents to (empty for stdout)',
            default=None)
        (options, args) = parser.parse_args()
        if len(args) != 1:
            parser.print_help()
            exit()
        (outf, ) = args
        return outf, options

    outf, options = _cli()
    log.start(logfile=options.logfile)
    configlogger.writeConfig(log, [
        ('Dataset configuration file', options.dataset_configf),
        ('Mention ID->dataset map file', options.wsd_mention_map_file),
        ('Mentions for test data only', options.wsd_test_only),
    ],
                             title='Mention extraction for entity linking')

    config = configparser.ConfigParser()
    config.read(options.dataset_configf)

    t_sub = log.startTimer('Generating WSD Evaluation Framework features.')
    datasets = wsd.allAsList(config, test_only=options.wsd_test_only)
    mentions = wsd.getAllMentions(
        datasets, log=log, mention_map_file=options.wsd_mention_map_file)
    log.stopTimer(t_sub, 'Extracted %d samples.' % len(mentions))

    t_sub = log.startTimer('Writing samples to %s...' % outf, newline=False)
    mention_file.write(mentions, outf)
    log.stopTimer(t_sub, message='Done ({0:.2f}s).')
Example #11
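    # Fragment: CLI handling and settings logging for WordNet dataset subsampling,
    # then loading the dataset and collating its samples by class.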
            default=None)
        (options, args) = parser.parse_args()

        if options.random_seed < 0:
            options.random_seed = int(time.time())

        if (not options.inputf) or (not options.outputf):
            parser.print_help()
            exit()
        return options

    options = _cli()
    log.start(logfile=options.logfile)
    configlogger.writeConfig(log, [
        ('Input file', options.inputf),
        ('Output file', options.outputf),
        ('# samples per class', options.size),
        ('Random seed', options.random_seed),
    ], 'WordNet dataset subsampling')

    log.writeln('Reading dataset from %s...' % options.inputf)
    ds = dataset.load(options.inputf)
    log.writeln('Read {0:,} samples.\n'.format(len(ds)))

    log.writeln('Collating by class...')
    collated = collateByClass(ds)
    classes = list(collated.keys())
    classes.sort()
    for c in classes:
        log.writeln('  {0} --> {1:,}'.format(c, len(collated[c])))
        if len(collated[c]) < options.size:
            log.writeln(
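Example #12
    # Fragment: settings logging for a WordNet classification experiment (CNN and
    # training hyperparameters), then reading word embeddings.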
    (dataset_f, ), options = _cli()
    log.start(logfile=options.logfile)
    configlogger.writeConfig(log, [
        ('Dataset file', dataset_f),
        ('Word embeddings', options.embedding_f),
        ('Batch size', options.batch_size),
        ('CNN settings', [
            ('# convolutional filters', options.num_filters),
            ('Filter width', options.filter_width),
            ('Filter v-stride', options.filter_vstride),
            ('Filter h-stride', options.filter_hstride),
            ('Pooling width', options.pool_width),
            ('Pooling h-stride', options.pool_hstride),
            ('Fully connected dimension', options.fully_connected_dim),
        ]),
        ('Training settings', [
            ('Patience', options.patience),
            ('Early stopping criterion', options.early_stopping),
            ('Max training epochs', options.max_epochs),
            ('Checkpoint file', options.checkpoint_path),
            ('Cross validation splits file', options.cross_validation_file),
            ('Number of folds', options.n_folds),
            ('Fraction of training used for dev', options.dev_size),
            ('Writing predictions to', options.predictions_file),
            ('Writing dev results to', options.dev_results_file),
            ('Random seed', options.random_seed),
        ]),
    ], 'WordNet classification experiment')

    t_sub = log.startTimer('Reading word embeddings from %s...' %
                           options.embedding_f)
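Example #13
    # Fragment: CLI tail, settings logging of SemCor and ELMo paths, then reading
    # SemCor labels and pre-processing SemCor text with wsd_parser.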
        (options, args) = parser.parse_args()
        if len(args) != 1:
            parser.print_help()
            exit()
        return args, options
    (configf,), options = _cli()
    config = configparser.ConfigParser()
    config.read(configf)

    log.start(logfile=options.logfile)
    configlogger.writeConfig(log, [
        ('SemCor', [
            ('XML', config['SemCor']['XML']),
            ('Labels', config['SemCor']['Labels']),
            ('Vocab', config['SemCor']['Vocab']),
        ]),
        ('ELMo', [
            ('Weights', config['ELMo']['Weights']),
            ('Options', config['ELMo']['Options']),
        ]),
        ('Output file', config['SemCor']['Embeddings']),
    ])

    t_sub = log.startTimer('Reading SemCor labels from %s...' % config['SemCor']['Labels'])
    semcor_labels, unique_sense_IDs = wsd_parser.readLabels(config['SemCor']['Labels'])
    log.stopTimer(t_sub, message='Read {0:,} labels ({1:,} unique senses) in {2}s.\n'.format(
        len(semcor_labels), len(unique_sense_IDs), '{0:.2f}'
    ))

    t_sub = log.startTimer('Pre-processing SemCor text from %s...' % config['SemCor']['XML'])
    (sentences_words, sentences_instances) = wsd_parser.processSentences(config['SemCor']['XML'])
    log.stopTimer(t_sub, message='Read {0:,} sentences in {1}s.\n'.format(
Example #14
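    # Fragment: CLI options for filtered WordNet relation generation, settings logging,
    # vocabulary loading, and WordNet pair extraction with enumerateWordNetPairs.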
        parser = optparse.OptionParser(usage='Usage: %prog VOCABF OUTF')
        parser.add_option('--write-lemma', dest='write_lemma',
                action='store_true', default=False,
                help='write the lemma for the synset instead of the synset ID')
        parser.add_option('-l', '--logfile', dest='logfile',
                help='name of file to write log contents to (empty for stdout)',
                default=None)
        (options, args) = parser.parse_args()
        if len(args) != 2:
            parser.print_help()
            exit()
        return args, options
    (vocabf, outf), options = _cli()
    log.start(logfile=options.logfile)

    configlogger.writeConfig(log, [
        ('Vocabulary file to filter to', vocabf),
        ('Output file for relations', outf),
        ('Writing lemmas', options.write_lemma),
    ], 'Filtered WordNet relation generation')

    log.writeln('Reading filter vocab from %s...' % vocabf)
    vocab = loadVocabulary(vocabf)
    log.writeln('Read {0:,} words to filter to.\n'.format(len(vocab)))

    t_sub = log.startTimer('Extracting WordNet pairs...\n')
    enumerateWordNetPairs(vocab, outf, write_lemma=options.write_lemma)
    log.stopTimer(t_sub, message='\nExtraction complete in {0:.2f}s.')

    log.stop()
Example #15
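    # Fragment: dataset/embedding resolution from a config file, settings logging for an
    # analogy completion task, and a data-mode override for the Google and BATS datasets.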
    config.read(options.config)

    analogy_file = datasets.getpath(options.dataset, config, options.setting)

    if not options.embeddings:
        options.embeddings = config['Default']['Embeddings']
        options.embeddings_mode = config['Default']['EmbeddingsMode']

    configlogger.writeConfig(
        log,
        settings=[
            ('Config file', options.config),
            ('Dataset', options.dataset),
            ('Path to dataset', analogy_file),
            ('Embeddings file', options.embeddings),
            ('Embeddings file mode', options.embeddings_mode),
            ('Analogy type', options.anlg_type),
            ('Computation method', options.analogy_method),
            ('Evaluation setting', options.setting),
            ('Predictions file', options.predictions_file),
            ('Number of predictions to report', options.report_top_k),
            ('Lowercasing analogies/embeddings', options.to_lower),
        ],
        title='Analogy completion task')

    # only one valid data mode for Google and BATS datasets
    if (options.dataset in [datasets.Google, datasets.BATS]
            and options.anlg_type != data_mode.String):
        log.writeln(
            '[WARNING] Invalid --analogy-type setting for %s dataset; Overriding to "%s"'
            % (options.dataset, data_mode.String))
        options.anlg_type = data_mode.String
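Example #16
    # Fragment: CLI options and settings logging for embedding filtering for WordNet
    # classification experiments, then reading embeddings and collecting the dataset vocabulary.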
                help='(REQUIRED) file to write filtered word embeddings to')
        parser.add_option('-d', '--dataset', dest='datasetf',
                help='(REQUIRED) pre-generated dataset for filtering')
        parser.add_option('-l', '--logfile', dest='logfile',
                help='name of file to write log contents to (empty for stdout)',
                default=None)
        (options, args) = parser.parse_args()
        if (not options.inputf) or (not options.outputf) or (not options.datasetf):
            parser.print_help()
            exit()
        return options
    options = _cli()
    log.start(logfile=options.logfile)
    configlogger.writeConfig(log, [
        ('Input embeddings file', options.inputf),
        ('Output embeddings file', options.outputf),
        ('Dataset file', options.datasetf),
    ], 'Embedding filtering for WordNet classification experiments')

    t_sub = log.startTimer('Reading input embeddings from %s...' % options.inputf)
    embeddings = pyemblib.read(options.inputf)
    log.stopTimer(t_sub, message='Read {0:,} embeddings in {1}s.\n'.format(
        len(embeddings), '{0:.2f}'
    ))

    log.writeln('Reading vocabulary from dataset in %s...' % options.datasetf)
    ds = dataset.load(options.datasetf)
    vocab = set()
    for (_, src, snk, _) in ds:
        vocab.add(src)
        vocab.add(snk)