# NOTE(review): line-collapsed fragment — the original line breaks were lost, so
# multiple statements (including a `return` belonging to an out-of-view _cli())
# are jammed onto one physical line; this span is not runnable as-is.
# Visible logic: tail of a _cli() argument check, then the main body of a
# vocabulary-extraction script: starts logging, reads a configparser config,
# resolves the analogy dataset path, logs settings, and parses analogies.
# The final log.writeln(...).format(...) call is truncated mid-expression
# (closing parentheses missing) — restore from the original source file.
if len(args) != 1 or not options.dataset: parser.print_help() exit() return args, options (vocabf,), options = _cli() log.start(logfile=options.logfile) config = configparser.ConfigParser() config.read(options.config) analogy_file = datasets.getpath(options.dataset, config, eval_mode.ALL_INFO) configlogger.writeConfig(log, settings=[ ('Config file', options.config), ('Dataset', options.dataset), ('Path to dataset', analogy_file), ('Lowercasing analogies', options.to_lower), ('Output vocab file', vocabf), ], title='Vocabulary extraction from analogy dataset') log.writeln('Reading %s analogies from %s...' % (options.dataset, analogy_file)) analogies = parsers.parse( analogy_file, options.dataset, eval_mode.ALL_INFO, data_mode.String, to_lower=options.to_lower ) log.writeln('Read {0:,} analogies in {1:,} relations.\n'.format( sum([len(anlg_set) for anlg_set in analogies.values()]), len(analogies)
# NOTE(review): line-collapsed fragment of a nearest-neighbor graph script;
# original line breaks lost, not runnable as-is.
# Visible logic: tail of _cli() (adds -k and --logfile options, requires at
# least one positional neighbor-sample file), then the main body: logs the
# configuration, builds a graph from the neighbor files via buildGraph(...),
# writes it with writeGraph(...), and stops logging. The _cli() `def` header
# and parser construction are outside this view.
parser.add_option('-k', dest='k', help='number of neighbors to use for edge construction (default: %default)', type='int', default=10) parser.add_option('-l', '--logfile', dest='logfile', help='name of file to write log contents to (empty for stdout)', default=None) (options, args) = parser.parse_args() if len(args) == 0: parser.print_help() exit() neighbor_files = args return neighbor_files, options neighbor_files, options = _cli() log.start(logfile=options.logfile) configlogger.writeConfig(log, [ *[ ('Neighborhood sample file %d' % (i+1), neighbor_files[i]) for i in range(len(neighbor_files)) ], ('Output file', options.outputf), ('Number of neighbors to include in edge construction', options.k), ], 'Nearest neighborhood graph generation') graph = buildGraph(neighbor_files, options.k) log.write('Writing graph to %s...' % options.outputf) writeGraph(graph, options.outputf) log.writeln('Done!') log.stop()
# NOTE(review): line-collapsed fragment; starts mid add_option(...) call of an
# out-of-view _cli() and the original line breaks are lost — not runnable as-is.
# Visible logic: validates required inputf/outputf/vocabf options, then the
# main body reads node2vec embeddings (pyemblib, word2vec text format), reads
# an integer-ID -> word vocabulary map via readVocab(...), and rebuilds the
# embedding dict keyed by the mapped vocab words. The script evidently
# continues past this span (output writing not shown).
dest='logfile', help='name of file to write log contents to (empty for stdout)', default=None) (options, args) = parser.parse_args() if (not options.inputf) or (not options.outputf) or ( not options.vocabf): parser.print_help() exit() return options options = _cli() log.start(logfile=options.logfile) configlogger.writeConfig(log, [ ('Input embeddings', options.inputf), ('Vocabulary file', options.vocabf), ('Output embeddings', options.outputf), ('Output embeddings format', options.output_format), ]) log.startTimer('Reading node2vec embeddings from %s...' % options.inputf) e = pyemblib.read(options.inputf, format=pyemblib.Format.Word2Vec, mode=pyemblib.Mode.Text) log.stopTimer( message='Read {0:,} embeddings in {1}s.\n'.format(len(e), '{0:.2f}')) log.writeln('Reading vocabulary mapping from %s...' % options.vocabf) vocab = readVocab(options.vocabf) log.writeln('Read {0:,} vocab mappings.\n'.format(len(vocab))) e = {vocab[int(k)]: v for (k, v) in e.items()}
# NOTE(review): line-collapsed fragment of a DNN embedding-mapping experiment;
# starts mid statement ('% pivot)' is the tail of an out-of-view call) and ends
# mid crossfoldTrain(...) call — not runnable as-is.
# Visible logic: finishes collecting validated_pivots, writes the experiment
# configuration, then begins cross-fold training of a source->target embedding
# mapper. Unlike every other fragment here, writeConfig is given a *path
# string* ('%s.config' % options.checkpointf) as its first argument instead of
# `log` — presumably an overload that writes to a file; TODO confirm against
# the configlogger API. Beware: the inline '# write the experimental
# configuration' comment on this collapsed line comments out everything after
# it until formatting is restored.
% pivot) else: validated_pivots.add(pivot) # write the experimental configuration configlogger.writeConfig('%s.config' % options.checkpointf, title='DNN embedding mapping experiment', settings=[ ('Source embeddings', options.src_embf), ('Source embedding dimension', src_embs.size), ('Target embeddings', options.trg_embf), ('Target embedding dimension', trg_embs.size), ('Output file', options.outf), ('Pivot file', options.pivotf), ('Number of validated pivots', len(validated_pivots)), ('Checkpoint file', options.checkpointf), ('Model settings', OrderedDict([ ('Random seed', options.random_seed), ('Number of layers', options.num_layers), ('Activation', options.activation), ('Number of folds', options.num_folds), ('Batch size', options.batch_size), ])) ]) log.writeln('Training manifold mapper...') mapped_embs = crossfoldTrain(src_embs, trg_embs, validated_pivots,
# NOTE(review): line-collapsed fragment of an analogy-completion script;
# original line breaks lost, and the final evaluate(...) call is truncated
# mid-argument-list — not runnable as-is.
# Visible logic: tail of _cli() forces options.setting to SINGLE_ANSWER and
# validates embedding options; main body starts logging (also to stdout),
# records the experiment configuration, loads embeddings with a tab or space
# separator depending on options.tab_sep, and begins the evaluation.
options.setting = settings.SINGLE_ANSWER em.validateCLIOptions(options) #return (analogy_file, results_dir, options) return (analogy_file, options) (analogy_file, options) = _cli() log.start(logfile=options.logfile, stdout_also=True) configlogger.writeConfig(log, settings=[ ('Dataset', options.dataset), ('Dataset file', analogy_file), ('Analogy setting', settings.name(options.setting)), ('Analogy type', options.anlg_type), ('Method', Mode.name(options.analogy_method)), ('Embedding settings', em.logCLIOptions(options)), ], title='Analogy completion task') separator = '\t' if options.tab_sep else ' ' emb_wrapper = em.getEmbeddings(options, log=log, separator=separator) results = evaluate(emb_wrapper, analogy_file, options.dataset, options.setting, options.anlg_type, options.analogy_method,
# NOTE(review): line-collapsed fragment of a SemCor lemma pre-processing
# script; starts mid add_option(...) of an out-of-view _cli() and ends at the
# header of a for-loop whose body is outside this view — not runnable as-is.
# Visible logic: _cli() requires exactly one positional config-file argument;
# main body reads the configparser config, logs SemCor XML/Labels paths and
# the lemma output file, pre-processes SemCor sentences with get_lemmas=True,
# and begins collecting the set of lemmas from the per-sentence instances.
default=None) (options, args) = parser.parse_args() if len(args) != 1: parser.print_help() exit() return args, options (configf, ), options = _cli() config = configparser.ConfigParser() config.read(configf) log.start(logfile=options.logfile) configlogger.writeConfig(log, [ ('SemCor', [ ('XML', config['SemCor']['XML']), ('Labels', config['SemCor']['Labels']), ]), ('Output file', config['SemCor']['Lemmas']), ]) t_sub = log.startTimer('Pre-processing SemCor text from %s...' % config['SemCor']['XML']) (sentences_words, sentences_instances) = wsd_parser.processSentences( config['SemCor']['XML'], get_lemmas=True) log.stopTimer(t_sub, message='Read {0:,} sentences in {1}s.\n'.format( len(sentences_words), '{0:.2f}')) log.writeln('Collecting set of SemCor lemmas...') lemmas = set() for sentence_instances in sentences_instances:
# NOTE(review): line-collapsed fragment of a similarity/relatedness experiment
# script; the trailing else-branch is cut off mid-body — not runnable as-is.
# Visible logic: tail of _cli() rejects any positional arguments; main body
# logs dataset/skip-index/embedding/scoring settings, then branches on
# options.use_combo: the non-combo path loads embeddings for a single
# representation method, the combo path starts loading separate entity and
# word embedding files (continuation outside this view).
if len(args) != 0: parser.print_help() exit() return options options = _cli() log.start(logfile=options.logfile, stdout_also=True) configlogger.writeConfig(output=log, settings=[ ('Dataset', options.mode), ('Using skip indices', ('None' if not options.skips_f else options.skips_f)), ('Embedding settings', em.logCLIOptions(options)), ('Scoring settings', OrderedDict([ ('Combination of entity and string', options.use_combo), ('Cross comparison of entity/string', options.use_cross), ('Cross comparison only', options.cross_only), ('Using mean of scores instead of sum', options.use_mean) ])), ], title='Similarity/Relatedness experiment') if not options.use_combo: log.writeln('\nMode: %s Method: %s\n' % (options.mode, em.name(options.repr_method))) separator = '\t' if options.tab_sep else ' ' emb_wrapper = em.getEmbeddings(options, log=log, separator=separator) else: log.writeln('\nMode: %s Method: COMBO\n' % options.mode) ent_embf, word_embf = options.ent_embf, options.word_embf separator = '\t' if options.tab_sep else ' '
# NOTE(review): line-collapsed fragment of a k-nearest-neighbor calculation
# script; starts mid add_option(...) of an out-of-view _cli() and ends inside
# an if-branch whose body continues past this view — not runnable as-is.
# Visible logic: _cli() requires exactly one positional embedding file; main
# body logs k-NN settings (k, batch size, threads, resume file), reads the
# embeddings with errors='replace', and — when the ordered vocab file does not
# yet exist — begins writing the node ID <-> vocab map.
default=None) (options, args) = parser.parse_args() if len(args) != 1: parser.print_help() exit() (embf, ) = args return embf, options embf, options = _cli() log.start(logfile=options.logfile) configlogger.writeConfig( log, [ ('Input embedding file', embf), ('Input embedding file mode', options.embedding_mode), ('Output neighbor file', options.outputf), ('Ordered vocabulary file', options.vocabf), ('Number of nearest neighbors', options.k), ('Batch size', options.batch_size), ('Number of threads', options.threads), ('Partial nearest neighbors file for resuming', options.partial_neighbors_file), ], 'k Nearest Neighbor calculation with cosine similarity') t_sub = log.startTimer('Reading embeddings from %s...' % embf) emb = pyemblib.read(embf, mode=options.embedding_mode, errors='replace') log.stopTimer(t_sub, message='Read {0:,} embeddings in {1}s.\n'.format( len(emb), '{0:.2f}')) if not os.path.isfile(options.vocabf): log.writeln('Writing node ID <-> vocab map to %s...\n' % options.vocabf)
# NOTE(review): line-collapsed fragment of an ELMo WSD baselines replication
# script; starts mid _cli() validation and ends mid log.writeln('...' % —
# truncated before the format operand — not runnable as-is.
# Visible logic: _cli() unpacks a single positional mention file; main body
# logs WordNet-first-sense and ELMo baseline settings, then reads mentions via
# mention_file.read(...). The stopTimer message mixes a %-interpolated count
# with a literal '{0:.2f}' placeholder — consistent with the other fragments,
# where stopTimer appears to fill in elapsed time via str.format; TODO confirm.
exit() (mentionf, ) = args return mentionf, options ## Getting configuration settings mentionf, options = _cli() log.start(logfile=options.logfile, stdout_also=True) configlogger.writeConfig(log, [ ('Mention file', mentionf), ('Mention map file', options.mention_mapf), ('WordNet first sense baseline settings', [ ('Output predictions file', options.wordnet_baseline_eval_predictions), ]), ('ELMo baseline settings', [ ('Output predictions file', options.elmo_baseline_eval_predictions), ('SemCor embeddings', options.semcor_embf), ('Training lemmas file', options.training_lemmasf), ('Pre-calculated WN first sense backoff predictions', options.wordnet_baseline_input_predictions), ]), ], title="ELMo WSD baselines replication") t_sub = log.startTimer('Reading mentions from %s...' % mentionf, newline=False) mentions = mention_file.read(mentionf) log.stopTimer(t_sub, message='Read %d mentions ({0:.2f}s)' % len(mentions)) log.writeln('Reading mention dataset data from %s...' %
# NOTE(review): line-collapsed fragment of a mention-extraction script for
# entity linking; starts mid add_option(...) of an out-of-view _cli() — not
# runnable as-is, though the visible statements end cleanly at stopTimer.
# Visible logic: _cli() unpacks a single positional output file; main body
# logs the dataset-config/mention-map/test-only settings, reads the dataset
# config, extracts mentions from the WSD Evaluation Framework datasets via
# wsd.allAsList(...) and wsd.getAllMentions(...), and writes them out with
# mention_file.write(...).
'--logfile', dest='logfile', help='name of file to write log contents to (empty for stdout)', default=None) (options, args) = parser.parse_args() if len(args) != 1: parser.print_help() exit() (outf, ) = args return outf, options outf, options = _cli() log.start(logfile=options.logfile) configlogger.writeConfig(log, [ ('Dataset configuration file', options.dataset_configf), ('Mention ID->dataset map file', options.wsd_mention_map_file), ('Mentions for test data only', options.wsd_test_only), ], title='Mention extraction for entity linking') config = configparser.ConfigParser() config.read(options.dataset_configf) t_sub = log.startTimer('Generating WSD Evaluation Framework features.') datasets = wsd.allAsList(config, test_only=options.wsd_test_only) mentions = wsd.getAllMentions( datasets, log=log, mention_map_file=options.wsd_mention_map_file) log.stopTimer(t_sub, 'Extracted %d samples.' % len(mentions)) t_sub = log.startTimer('Writing samples to %s...' % outf, newline=False) mention_file.write(mentions, outf) log.stopTimer(t_sub, message='Done ({0:.2f}s).')
# NOTE(review): line-collapsed fragment of a WordNet dataset subsampling
# script; starts mid add_option(...) of an out-of-view _cli() and ends with an
# unterminated log.writeln( call inside the per-class loop — not runnable
# as-is.
# Visible logic: _cli() seeds options.random_seed from the current time when
# negative and requires inputf/outputf; main body loads the dataset, collates
# samples by class via collateByClass(...), and logs per-class counts in
# sorted class order, warning (truncated here) when a class has fewer samples
# than the requested per-class size.
default=None) (options, args) = parser.parse_args() if options.random_seed < 0: options.random_seed = int(time.time()) if (not options.inputf) or (not options.outputf): parser.print_help() exit() return options options = _cli() log.start(logfile=options.logfile) configlogger.writeConfig(log, [ ('Input file', options.inputf), ('Output file', options.outputf), ('# samples per class', options.size), ('Random seed', options.random_seed), ], 'WordNet dataset subsampling') log.writeln('Reading dataset from %s...' % options.inputf) ds = dataset.load(options.inputf) log.writeln('Read {0:,} samples.\n'.format(len(ds))) log.writeln('Collating by class...') collated = collateByClass(ds) classes = list(collated.keys()) classes.sort() for c in classes: log.writeln(' {0} --> {1:,}'.format(c, len(collated[c]))) if len(collated[c]) < options.size: log.writeln(
# NOTE(review): line-collapsed fragment of a WordNet classification (CNN)
# experiment script; original line breaks lost, and the script clearly
# continues past the final startTimer call — not runnable as-is.
# Visible logic: unpacks a single positional dataset file from _cli(), logs an
# extensive configuration (CNN filter/pooling geometry, training settings:
# patience, early stopping, max epochs, checkpointing, cross-validation folds,
# dev fraction, prediction/dev output files, random seed), then begins reading
# the word embeddings.
(dataset_f, ), options = _cli() log.start(logfile=options.logfile) configlogger.writeConfig(log, [ ('Dataset file', dataset_f), ('Word embeddings', options.embedding_f), ('Batch size', options.batch_size), ('CNN settings', [ ('# convolutional filters', options.num_filters), ('Filter width', options.filter_width), ('Filter v-stride', options.filter_vstride), ('Filter h-stride', options.filter_hstride), ('Pooling width', options.pool_width), ('Pooling h-stride', options.pool_hstride), ('Fully connected dimension', options.fully_connected_dim), ]), ('Training settings', [ ('Patience', options.patience), ('Early stopping criterion', options.early_stopping), ('Max training epochs', options.max_epochs), ('Checkpoint file', options.checkpoint_path), ('Cross validation splits file', options.cross_validation_file), ('Number of folds', options.n_folds), ('Fraction of training used for dev', options.dev_size), ('Writing predictions to', options.predictions_file), ('Writing dev results to', options.dev_results_file), ('Random seed', options.random_seed), ]), ], 'WordNet classification experiment') t_sub = log.startTimer('Reading word embeddings from %s...' % options.embedding_f)
# NOTE(review): line-collapsed fragment of a SemCor/ELMo embedding generation
# script; starts at an out-of-view _cli()'s parse_args and ends with an
# unterminated '...'.format( call — not runnable as-is.
# Visible logic: _cli() requires one positional config file; main body reads
# the configparser config, logs SemCor (XML/Labels/Vocab) and ELMo
# (Weights/Options) paths plus the embedding output file, reads SemCor sense
# labels via wsd_parser.readLabels(...), then pre-processes the SemCor
# sentences (the sentence-count log line is cut off here).
(options, args) = parser.parse_args() if len(args) != 1: parser.print_help() exit() return args, options (configf,), options = _cli() config = configparser.ConfigParser() config.read(configf) log.start(logfile=options.logfile) configlogger.writeConfig(log, [ ('SemCor', [ ('XML', config['SemCor']['XML']), ('Labels', config['SemCor']['Labels']), ('Vocab', config['SemCor']['Vocab']), ]), ('ELMo', [ ('Weights', config['ELMo']['Weights']), ('Options', config['ELMo']['Options']), ]), ('Output file', config['SemCor']['Embeddings']), ]) t_sub = log.startTimer('Reading SemCor labels from %s...' % config['SemCor']['Labels']) semcor_labels, unique_sense_IDs = wsd_parser.readLabels(config['SemCor']['Labels']) log.stopTimer(t_sub, message='Read {0:,} labels ({1:,} unique senses) in {2}s.\n'.format( len(semcor_labels), len(unique_sense_IDs), '{0:.2f}' )) t_sub = log.startTimer('Pre-processing SemCor text from %s...' % config['SemCor']['XML']) (sentences_words, sentences_instances) = wsd_parser.processSentences(config['SemCor']['XML']) log.stopTimer(t_sub, message='Read {0:,} sentences in {1}s.\n'.format(
# NOTE(review): line-collapsed fragment of a filtered WordNet relation
# generation script; contains the body of _cli() (its `def` header is outside
# this view, hence the bare `return`) plus the complete main body through
# log.stop() — not runnable as-is until formatting is restored.
# Visible logic: _cli() builds an optparse parser (--write-lemma switches the
# output from synset IDs to lemmas) and requires exactly two positional args
# (VOCABF OUTF); main body loads the filter vocabulary via loadVocabulary(...)
# and extracts matching WordNet pairs via enumerateWordNetPairs(...).
parser = optparse.OptionParser(usage='Usage: %prog VOCABF OUTF') parser.add_option('--write-lemma', dest='write_lemma', action='store_true', default=False, help='write the lemma for the synset instead of the synset ID') parser.add_option('-l', '--logfile', dest='logfile', help='name of file to write log contents to (empty for stdout)', default=None) (options, args) = parser.parse_args() if len(args) != 2: parser.print_help() exit() return args, options (vocabf, outf), options = _cli() log.start(logfile=options.logfile) configlogger.writeConfig(log, [ ('Vocabulary file to filter to', vocabf), ('Output file for relations', outf), ('Writing lemmas', options.write_lemma), ], 'Filtered WordNet relation generation') log.writeln('Reading filter vocab from %s...' % vocabf) vocab = loadVocabulary(vocabf) log.writeln('Read {0:,} words to filter to.\n'.format(len(vocab))) t_sub = log.startTimer('Extracting WordNet pairs....\n') enumerateWordNetPairs(vocab, outf, write_lemma=options.write_lemma) log.stopTimer(t_sub, message='\nExtraction complete in {0:.2f}s.') log.stop()
# NOTE(review): line-collapsed fragment of an analogy completion task script;
# starts mid main body (the config object is constructed outside this view) —
# not runnable as-is. Beware: the inline '# only one valid data mode...'
# comment on this collapsed line comments out the Google/BATS override until
# formatting is restored.
# Visible logic: resolves the dataset path for the chosen evaluation setting,
# falls back to config['Default'] for embeddings file/mode when --embeddings
# is unset, logs the full experiment configuration, and forces
# options.anlg_type to data_mode.String (with a logged warning) for the Google
# and BATS datasets.
config.read(options.config) analogy_file = datasets.getpath(options.dataset, config, options.setting) if not options.embeddings: options.embeddings = config['Default']['Embeddings'] options.embeddings_mode = config['Default']['EmbeddingsMode'] configlogger.writeConfig( log, settings=[ ('Config file', options.config), ('Dataset', options.dataset), ('Path to dataset', analogy_file), ('Embeddings file', options.embeddings), ('Embeddings file mode', options.embeddings_mode), ('Analogy type', options.anlg_type), ('Computation method', options.analogy_method), ('Evaluation setting', options.setting), ('Predictions file', options.predictions_file), ('Number of predictions to report', options.report_top_k), ('Lowercasing analogies/embeddings', options.to_lower), ], title='Analogy completion task') # only one valid data mode for Google and BATS datasets if options.dataset in [datasets.Google, datasets.BATS ] and options.anlg_type != data_mode.String: log.writeln( '[WARNING] Invalid --analogy-type setting for %s dataset; Overriding to "%s"' % (options.dataset, data_mode.String)) options.anlg_type = data_mode.String
# NOTE(review): line-collapsed fragment of an embedding-filtering script for
# WordNet classification experiments; starts mid add_option(...) (an orphan
# help= continuation) of an out-of-view _cli() — not runnable as-is, and the
# script evidently continues past the vocab-building loop (filtering/writing
# not shown).
# Visible logic: _cli() requires inputf/outputf/datasetf options; main body
# reads the input embeddings via pyemblib.read(...), loads the pre-generated
# dataset, and collects a vocabulary from the 2nd and 3rd fields (src, snk) of
# each 4-tuple sample — presumably to filter the embeddings to that vocab.
help='(REQUIRED) file to write filtered word embeddings to') parser.add_option('-d', '--dataset', dest='datasetf', help='(REQUIRED) pre-generated dataset for filtering') parser.add_option('-l', '--logfile', dest='logfile', help='name of file to write log contents to (empty for stdout)', default=None) (options, args) = parser.parse_args() if (not options.inputf) or (not options.outputf) or (not options.datasetf): parser.print_help() exit() return options options = _cli() log.start(logfile=options.logfile) configlogger.writeConfig(log, [ ('Input embeddings file', options.inputf), ('Output embeddings file', options.outputf), ('Dataset file', options.datasetf), ], 'Embedding filtering for WordNet classification experiments') t_sub = log.startTimer('Reading input embeddings from %s...' % options.inputf) embeddings = pyemblib.read(options.inputf) log.stopTimer(t_sub, message='Read {0:,} embeddings in {1}s.\n'.format( len(embeddings), '{0:.2f}' )) log.writeln('Reading vocabulary from dataset in %s...' % options.datasetf) ds = dataset.load(options.datasetf) vocab = set() for (_, src, snk, _) in ds: vocab.add(src) vocab.add(snk)