        dest='logfile',
        help='name of file to write log contents to (empty for stdout)',
        default=None)
    (options, args) = parser.parse_args()
    if (not options.inputf) or (not options.outputf) or (not options.vocabf):
        parser.print_help()
        exit()
    return options

options = _cli()
log.start(options.logfile)
log.writeConfig([
    ('Input embeddings', options.inputf),
    ('Vocabulary file', options.vocabf),
    ('Output embeddings', options.outputf),
    ('Output embeddings format', options.output_format),
])

t_sub = log.startTimer('Reading node2vec embeddings from %s...' % options.inputf)
e = pyemblib.read(options.inputf, format=pyemblib.Format.Word2Vec,
    mode=pyemblib.Mode.Text)
log.stopTimer(t_sub, message='Read {0:,} embeddings in {1}s.\n'.format(
    len(e), '{0:.2f}'))

log.writeln('Reading vocabulary mapping from %s...' % options.vocabf)
vocab = readVocab(options.vocabf)
log.writeln('Read {0:,} vocab mappings.\n'.format(len(vocab)))

e = {vocab[int(k)]: v for (k, v) in e.items()}
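
# readVocab is not defined in this fragment. A minimal sketch of what it
# presumably does, assuming a tab-separated "node_ID<TAB>word" file (the
# vocab[int(k)] lookup above implies integer node IDs); hypothetical, not
# the project's actual implementation:
def readVocab(vocabf):
    '''Map integer node IDs to vocabulary strings.'''
    vocab = {}
    with open(vocabf, 'r') as stream:
        for line in stream:
            (node_ID, word) = line.rstrip('\n').split('\t')
            vocab[int(node_ID)] = word
    return vocab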
    (options, args) = parser.parse_args()
    if not options.corpus_f:
        parser.print_help()
        parser.error('Must provide --corpus')
    if not options.annotations_f:
        parser.print_help()
        parser.error('Must provide --annotations')
    if not options.term_strings_f:
        parser.print_help()
        parser.error('Must provide --term-strings')
    return options

options = _cli()
log.start(options.logfile)
log.writeConfig([
    ('Corpus file', options.corpus_f),
    ('Annotations file', options.annotations_f),
    ('Term strings file', options.term_strings_f),
], 'JET annotation validation')

log.writeln('Reading term->strings mapping from %s...' % options.term_strings_f)
term_map = readTermMap(options.term_strings_f)
log.writeln('Mapped strings for {0:,} terms.\n'.format(len(term_map)))

log.writeln('Validating corpus annotations...')
validate(options.corpus_f, options.annotations_f, term_map)
log.writeln('Done!\n')

log.stop()
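
# readTermMap is defined elsewhere in the project. A minimal sketch under
# the assumption that the term-strings file is tab-separated
# "term_ID<TAB>string" with one line per (term, string) pair; the format
# is an assumption:
from collections import defaultdict

def readTermMap(term_strings_f):
    '''Map each term ID to the set of strings it may appear as.'''
    term_map = defaultdict(set)
    with open(term_strings_f, 'r') as stream:
        for line in stream:
            (term_ID, string) = line.rstrip('\n').split('\t')
            term_map[term_ID].add(string.lower())
    return term_map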
log.writeConfig([
    ('Mention file', mentionf),
    ('Entity embedding settings', entity_settings),
    ('Word/ctx embeddings', options.ctx_embf),
    ('Word vocabulary (unused if empty)', options.word_vocabf),
    ('Writing predictions to', options.preds_file),
    ('Using feature normalization', options.normalize_features),
    ('Classification algorithm', options.classifier),
    ('Training settings', [
        ('Cross validation splits file', options.cross_validation_file),
        ('Number of folds', options.n_folds),
        ('Fraction of training used for dev', options.dev_size),
        ('Random seed', options.random_seed),
    ]),
    ('Hyperparameter settings', [
        ('Evaluating on development data', options.eval_on_dev),
        ('Using entity embeddings at all', options.use_entity_embeddings),
        ('Using full entity embeddings instead of cos sim',
            options.full_entity_embeddings),
        ('Using context embeddings', options.use_ctx_embeddings),
        ('Including unigram features', options.unigram_features),
        ('Using TF-IDF values for unigram features',
            options.unigrams_as_tfidf if options.unigram_features else 'N/A'),
        ('Using Action oracle', options.action_oracle),
        ('Input predictions file', options.input_predsf),
        ('Pre-embedded mentions', options.pre_embedded),
    ]),
],
    title="Entity linking (disambiguation) experiment using scikit-learn baseline algorithms"
)
    (options, args) = parser.parse_args()
    if not options.splitsf:
        parser.print_help()
        parser.error('Must provide --splits')
    if len(args) != 2:
        parser.print_help()
        exit()
    return args, options

(mentionf, predsf), options = _cli()
log.start(logfile=options.logfile)
log.writeConfig([
    ('Mention file', mentionf),
    ('Key remapping file', options.keymapf),
    ('Predictions file', predsf),
    ('No scores in predictions', options.no_scores),
    ('Cross-validation splits file', options.splitsf),
    ('Evaluating on development data', options.dev),
], 'BTRIS Mobility code-level predictions analysis')

log.writeln('Reading mentions from %s...' % mentionf)
mentions = mention_file.read(mentionf)
log.writeln('Read {0:,} mentions.\n'.format(len(mentions)))

log.writeln('Reading splits from %s...' % options.splitsf)
splits = cross_validation.readSplits(options.splitsf)
log.writeln('Read {0:,} splits.\n'.format(len(splits)))

log.writeln('Compiling evaluation set...')
eval_set = compileEvaluationSet(splits, options.dev)
log.writeln('Evaluating on {0:,} samples.\n'.format(len(eval_set)))
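
# compileEvaluationSet is defined elsewhere. A plausible sketch, assuming
# each split is a (train, dev, test) triple of mention IDs as produced by
# cross_validation.readSplits; hypothetical:
def compileEvaluationSet(splits, use_dev):
    '''Pool the dev (or test) partition of every fold into one eval set.'''
    eval_set = set()
    for (train_ids, dev_ids, test_ids) in splits:
        eval_set.update(dev_ids if use_dev else test_ids)
    return eval_set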
    if len(args) != 1:
        parser.print_help()
        parser.error('Must supply only MENTIONS')
    if not options.definitions_file:
        parser.print_help()
        parser.error('Must supply --definitions')
    (mentionf,) = args
    return mentionf, options

## Getting configuration settings
mentionf, options = _cli()
log.start(logfile=options.logfile)
log.writeConfig([
    ('Mention file', mentionf),
    ('Entity definitions file', options.definitions_file),
    ('Restricting to main definitions only', options.main_only),
], title="Adapted Lesk similarity baseline")

t_sub = log.startTimer('Reading mentions from %s...' % mentionf)
mentions = mention_file.read(mentionf)
log.stopTimer(t_sub, message='Read %s mentions ({0:.2f}s)\n' % (
    '{0:,}'.format(len(mentions))))

log.writeln('Reading definitions from %s...' % options.definitions_file)
definitions = readCodeDefinitions(options.definitions_file, options.main_only)
log.writeln('Read definitions for {0:,} codes.\n'.format(len(definitions)))

if options.preds_file:
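
# (Aside: readCodeDefinitions is project code not shown here. A minimal
# sketch, assuming a tab-separated "code<TAB>definition" file with an
# optional flag column marking the main definition; the exact format is
# an assumption.)
def readCodeDefinitions(definitions_f, main_only):
    '''Map each code to a list of its tokenized definition texts.'''
    definitions = {}
    with open(definitions_f, 'r') as stream:
        for line in stream:
            fields = line.rstrip('\n').split('\t')
            (code, definition) = fields[0], fields[1]
            is_main = (len(fields) > 2 and fields[2] == 'MAIN')
            if main_only and not is_main:
                continue
            definitions.setdefault(code, []).append(definition.split())
    return definitions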
        'N/A' if options.tokenizer != Tokenizer.BERT
            else options.bert_vocab_file
    )),
    ('Extraction mode', config['ExtractionMode']),
    ('Annotation directories', config['DataDirectories']),
]
if config['ExtractionMode'] == 'csv':
    settings.extend([
        ('Plaintext directory', config['PlaintextDirectory']),
        ('CSV file ID pattern', config['CSVIdentifierPattern']),
        ('Plaintext file render pattern', config['PlaintextIdentifierPattern']),
    ])
settings.extend([
    ('Output mentions file', options.outputf),
    ('Mention map file (automatic)', options.mention_map_file),
])
log.writeConfig(settings, title='Mention extraction for action classification')

t_sub = log.startTimer('Generating %s features.' % options.dataset)
mentions, mention_map = getAllMentions(config, options,
    tokenizer=options.tokenizer,
    bert_vocab_file=options.bert_vocab_file,
    log=log)
log.stopTimer(t_sub, 'Extracted {0:,} samples.'.format(len(mentions)))

log.writeln('Writing mention map information to %s...' %
    options.mention_map_file)
with open(options.mention_map_file, 'w') as stream:
    for (mention_ID, mention_info) in mention_map.items():
        stream.write('%d\t%s\n' % (mention_ID, mention_info))
log.writeln('Wrote info for {0:,} mentions.\n'.format(len(mention_map)))

t_sub = log.startTimer('Writing samples to %s...' % options.outputf,
    newline=False)
mention_file.write(mentions, options.outputf)
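
# The mention map is written one "mention_ID<TAB>mention_info" line at a
# time, so it can be read back with the inverse parse. A minimal sketch
# (the project's real loader is mention_map_lib.load, used by the
# cross-validation splits script):
def loadMentionMap(mention_map_f):
    '''Recover {mention_ID: mention_info} from a mention map file.'''
    mention_map = {}
    with open(mention_map_f, 'r') as stream:
        for line in stream:
            (mention_ID, mention_info) = line.rstrip('\n').split('\t', 1)
            mention_map[int(mention_ID)] = mention_info
    return mention_map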
    (options, args) = parser.parse_args()
    if not options.bert_f:
        parser.error('Must provide --bert-output')
    elif not options.overlaps_f:
        parser.error('Must provide --overlaps')
    elif not options.output_f:
        parser.error('Must provide --output')
    return options

options = _cli()
log.start(options.logfile)
log.writeConfig([
    ('BERT output', options.bert_f),
    ('Overlaps file', options.overlaps_f),
    ('Output file', options.output_f),
], 'BERT embedding recombination')

log.writeln('Reading overlaps from %s...' % options.overlaps_f)
overlaps = readOverlaps(options.overlaps_f)
log.writeln('Read overlaps for {0:,} lines.\n'.format(len(overlaps)))

log.writeln('Streaming BERT output conversion...')
streamingBERTConvert(
    options.bert_f,
    overlaps,
    options.output_f,
    options.tokenized_f
)
log.writeln('Done.')
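
# readOverlaps is defined elsewhere; it reads the .overlaps file produced
# by the subsequence-splitting script. A sketch assuming each line is
# "line_ID<TAB>n_overlapping_tokens" (the file format itself is an
# assumption):
def readOverlaps(overlaps_f):
    '''Map line IDs to the token overlap between adjacent subsequences.'''
    overlaps = {}
    with open(overlaps_f, 'r') as stream:
        for line in stream:
            (line_ID, n_overlap) = line.rstrip('\n').split('\t')
            overlaps[int(line_ID)] = int(n_overlap)
    return overlaps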
        parser.error('Must supply --bert-dir')
    elif not options.model:
        parser.error('Must supply --model')
    options.output_f = os.path.join(
        options.bert_dir, '%s.compiled_output.predictions' % options.model)
    options.logfile = '%s.log' % options.output_f
    return options

options = _cli()
log.start(options.logfile)
log.writeConfig([
    ('Mentions file', options.mentions_f),
    ('BERT baseline root directory', options.bert_dir),
    ('Model configuration', options.model),
    ('Output file', options.output_f),
], 'BERT baseline results compilation')

log.writeln('Reading mentions from %s...' % options.mentions_f)
mentions = mention_file.read(options.mentions_f)
mentions_by_ID = {m.ID: m for m in mentions}
log.writeln('Read {0:,} mentions.\n'.format(len(mentions)))

fold_dirs = glob.glob(os.path.join(options.bert_dir, 'fold-*'))
log.writeln('Found {0} folds in {1}.\n'.format(len(fold_dirs), options.bert_dir))

with open(options.output_f, 'w') as stream:
    fold_dirs = sorted(fold_dirs)
    for i in range(len(fold_dirs)):
    (options, args) = parser.parse_args()
    if not options.input_f:
        parser.print_help()
        parser.error('Must provide --input')
    if not options.output_dir:
        options.output_dir = os.path.dirname(options.input_f)
    return options

sys.setrecursionlimit(1800)

options = _cli()
log.start(options.logfile)
log.writeConfig([
    ('Terminology file', options.input_f),
    ('Storing pickled maps to', options.output_dir),
    ('Map concepts separated by', options.sep),
    ('Removing stopword terms', options.remove_stopwords),
    ('Tokenization settings', tokenization.CLI.logOptions(options)),
], 'JET -- STR -> CUI file preprocessing')

t_sub = log.startTimer('Initializing tokenizer...')
tokenizer = tokenization.CLI.initializeTokenizer(options)
log.stopTimer(t_sub, message='Tokenizer ready in {0:.2f}s.\n')

t_sub = log.startTimer('Reading terminology file...')
ngrams, entities_by_term = readTerminology(
    options.input_f, tokenizer,
    remove_stopwords=options.remove_stopwords,
    use_collapsed_string=options.use_collapsed_string)
log.stopTimer(t_sub, message='Completed in {0:.2f}s.\n')
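
# readTerminology is the core of this script and is defined elsewhere. A
# simplified sketch of the idea, assuming "CUI<SEP>string" lines (the
# separator corresponds to options.sep above); the real version also
# handles stopword removal and collapsed strings:
def readTerminology(input_f, tokenizer, sep='|'):
    '''Build (a) an ngram -> entities map and (b) entities per term string.'''
    ngrams, entities_by_term = {}, {}
    with open(input_f, 'r') as stream:
        for line in stream:
            (cui, string) = line.rstrip('\n').split(sep)[:2]
            tokens = tuple(tokenizer.tokenize(string))
            ngrams.setdefault(tokens, set()).add(cui)
            entities_by_term.setdefault(string.lower(), set()).add(cui)
    return ngrams, entities_by_term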
    if not options.mentions_f:
        parser.error('Must supply --mentions')
    elif not options.output_f:
        parser.error('Must supply --output')
    elif options.filter_doc_ID_f and not options.mention_map_f:
        parser.error('Must supply --mention-map if using --filter-doc-IDs')
    return options

options = _cli()
log.start(options.logfile)
log.writeConfig([
    ('Mentions file', options.mentions_f),
    ('Mention map file', options.mention_map_f),
    ('Number of folds', options.num_folds),
    ('Dev set size', options.dev_size),
    ('Document ID filter list', options.filter_doc_ID_f),
    ('Random seed', options.random_seed),
    ('Output file', options.output_f),
], 'Cross-validation splits generation')

log.writeln('Loading mentions from %s...' % options.mentions_f)
mentions = mention_file.read(options.mentions_f)
log.writeln('Read {0:,} mentions.\n'.format(len(mentions)))

if options.filter_doc_ID_f:
    log.writeln('Reading mention map from %s...' % options.mention_map_f)
    mention_map = mention_map_lib.load(options.mention_map_f)
    log.writeln('Read mapping info for {0:,} mentions.\n'.format(
        len(mention_map)))
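
# The split-generation step itself is not shown. A minimal sketch of
# k-fold splitting with a held-out dev fraction, matching the options
# logged above (num_folds, dev_size, random_seed); hypothetical, not the
# script's actual routine:
import random

def makeSplits(mention_IDs, num_folds, dev_size, random_seed):
    '''Return (train, dev, test) mention ID triples, one per fold.'''
    ids = list(mention_IDs)
    random.Random(random_seed).shuffle(ids)
    folds = [ids[i::num_folds] for i in range(num_folds)]
    splits = []
    for i in range(num_folds):
        test = folds[i]
        rest = [m for (j, f) in enumerate(folds) if j != i for m in f]
        n_dev = int(len(rest) * dev_size)
        splits.append((rest[n_dev:], rest[:n_dev], test))
    return splits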
    if not options.output_f:
        parser.print_help()
        parser.error('Must provide --output')
    if options.threads < 3:
        parser.print_help()
        parser.error('--threads must be at least 3')
    return options

options = _cli()
log.start(options.logfile)
log.writeConfig([
    ('Plaintext corpus file', options.input_f),
    ('Pickled ngram->term map', options.terminology_pkl_f),
    ('Output annotations file', options.output_f),
    ('Tagging settings', [
        ('Number of tagging threads', options.threads),
        ('Line queue size cap',
            'unlimited' if options.maxlines <= 0 else options.maxlines),
    ]),
    ('Tokenization settings', tokenization.CLI.logOptions(options)),
], 'JET -- Automated corpus tagging')

t_sub = log.startTimer('Loading pickled strings map...')
compiled_terminology = pickleio.read(options.terminology_pkl_f)
log.stopTimer(t_sub, message='Done in {0:.2f}s.\n')

t_sub = log.startTimer('Initializing tokenizer...')
tokenizer = tokenization.CLI.initializeTokenizer(options)
log.stopTimer(t_sub, message='Tokenizer ready in {0:.2f}s.\n')

t_sub = log.startTimer('Tagging corpus...')
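
# The tagging call itself is cut off above. As background, a minimal
# greedy longest-match tagger over a compiled ngram -> terms map; this is
# an illustrative sketch, not JET's threaded implementation:
def tagLine(tokens, ngram_map, max_ngram_len=10):
    '''Yield (start, end, terms) for the longest ngram match at each position.'''
    i = 0
    while i < len(tokens):
        match = None
        # try the longest candidate span first, shrinking toward a unigram
        for j in range(min(len(tokens), i + max_ngram_len), i, -1):
            terms = ngram_map.get(tuple(tokens[i:j]))
            if terms:
                match = (i, j, terms)
                break
        if match:
            yield match
            i = match[1]
        else:
            i += 1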
        default=10)
    parser.add_option(
        '-l', '--logfile',
        dest='logfile',
        help='name of file to write log contents to (empty for stdout)',
        default=None)
    (options, args) = parser.parse_args()
    if len(args) == 0:
        parser.print_help()
        exit()
    neighbor_files = args
    return neighbor_files, options

neighbor_files, options = _cli()
log.start(options.logfile)
log.writeConfig([
    *[
        ('Neighborhood sample file %d' % (i + 1), neighbor_files[i])
        for i in range(len(neighbor_files))
    ],
    ('Output file', options.outputf),
    ('Number of neighbors to include in edge construction', options.k),
], 'Nearest neighborhood graph generation')

graph = buildGraph(neighbor_files, options.k)

log.write('Writing graph to %s...' % options.outputf)
writeGraph(graph, options.outputf)
log.writeln('Done!')

log.stop()
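
# buildGraph and writeGraph are not shown. A plausible sketch, assuming
# each neighbor file lists "node<TAB>neighbor1,neighbor2,..." in
# nearest-first order; the file format is an assumption:
def buildGraph(neighbor_files, k):
    '''Add an edge from each node to its k nearest neighbors.'''
    graph = {}
    for neighbor_f in neighbor_files:
        with open(neighbor_f, 'r') as stream:
            for line in stream:
                (node, nbr_str) = line.rstrip('\n').split('\t')
                graph.setdefault(node, set()).update(nbr_str.split(',')[:k])
    return graph

def writeGraph(graph, outputf):
    '''Write the graph as one "source<TAB>target" edge per line.'''
    with open(outputf, 'w') as stream:
        for (node, neighbors) in graph.items():
            for nbr in sorted(neighbors):
                stream.write('%s\t%s\n' % (node, nbr))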
        help='name of file to write log contents to (empty for stdout)',
        default=None)
    (options, args) = parser.parse_args()
    if len(args) != 1:
        parser.print_help()
        exit()
    (embf,) = args
    return embf, options

embf, options = _cli()
log.start(options.logfile)
log.writeConfig([
    ('Input embedding file', embf),
    ('Input embedding file mode', options.embedding_mode),
    ('Output neighbor file', options.outputf),
    ('Ordered vocabulary file', options.vocabf),
    ('Number of nearest neighbors', options.k),
    ('Batch size', options.batch_size),
    ('Number of threads', options.threads),
    ('Partial nearest neighbors file for resuming',
        options.partial_neighbors_file),
], 'k Nearest Neighbor calculation with cosine similarity')

t_sub = log.startTimer('Reading embeddings from %s...' % embf)
emb = pyemblib.read(embf, mode=options.embedding_mode, errors='replace')
log.stopTimer(t_sub, message='Read {0:,} embeddings in {1}s.\n'.format(
    len(emb), '{0:.2f}'))

if not os.path.isfile(options.vocabf):
    log.writeln('Writing node ID <-> vocab map to %s...\n' % options.vocabf)
    writeNodeMap(emb, options.vocabf)
else:
    log.writeln('Reading node ID <-> vocab map from %s...\n' % options.vocabf)
    node_map = readNodeMap(options.vocabf)
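
# writeNodeMap/readNodeMap are defined elsewhere. A sketch consistent
# with the "node_ID<TAB>word" map consumed by the embedding-renaming
# script above; the ordering and exact format are assumptions:
def writeNodeMap(emb, vocabf):
    '''Assign a stable integer node ID to each embedding key.'''
    with open(vocabf, 'w') as stream:
        for (node_ID, key) in enumerate(sorted(emb.keys())):
            stream.write('%d\t%s\n' % (node_ID, key))

def readNodeMap(vocabf):
    '''Recover {node_ID: key} from the saved map.'''
    node_map = {}
    with open(vocabf, 'r') as stream:
        for line in stream:
            (node_ID, key) = line.rstrip('\n').split('\t')
            node_map[int(node_ID)] = key
    return node_map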
    (options, args) = parser.parse_args()
    if not options.input_f:
        parser.error('Must provide --input')
    elif not options.output_f:
        parser.error('Must provide --output')
    elif not options.key_f:
        parser.error('Must provide --keys')
    return options

options = _cli()
log.start(options.logfile)
log.writeConfig([
    ('HDF5 embeddings', options.input_f),
    ('HDF5 layer',
        ('Average' if options.layer == AVERAGE_LAYERS else options.layer)),
    ('Per-row keys', options.key_f),
    ('Mentions file', options.mentions_f),
    ('Using Action oracle', options.action_oracle),
    ('Output embedded mentions file', options.output_f),
], 'Embedded mentions file generation with pre-generated HDF5 features')

log.writeln('Reading keys from %s...' % options.key_f)
keys = readKeys(options.key_f)
log.writeln('Read {0:,} keys.\n'.format(len(keys)))

log.writeln('Reading textual mentions from %s...' % options.mentions_f)
mentions = mention_file.read(options.mentions_f)
mentions_by_id = {m.ID: m for m in mentions}
log.writeln('Read {0:,} mentions.\n'.format(len(mentions)))

log.writeln('Generating embedded mentions from HDF5 file %s...' %
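
# (Aside: readKeys is defined elsewhere; it pairs each HDF5 row with a
# mention. A minimal sketch assuming one key per line, in row order; the
# format is an assumption.)
def readKeys(key_f):
    '''Return the per-row keys, index-aligned with the HDF5 dataset rows.'''
    with open(key_f, 'r') as stream:
        return [line.rstrip('\n') for line in stream]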
    return options

options = _cli()

output_tokens = '%s.tokens' % options.output_file
output_subsequences = '%s.subsequences' % options.output_file
output_overlaps = '%s.overlaps' % options.output_file
output_log = '%s.log' % options.output_file

log.start(output_log)
log.writeConfig([
    ('Input file', options.input_file),
    ('Output settings', [
        ('Base path', options.output_file),
        ('Tokenized file', output_tokens),
        ('Subsequences file', output_subsequences),
        ('Overlaps file', output_overlaps),
        ('Log file', output_log),
    ]),
    ('Max subsequence length', options.max_sequence_length),
    ('Overlap fraction', options.overlap),
    ('BERT vocab file', options.vocab_file),
])

# reserve room for the [CLS] and [SEP] tokens BERT adds to each sequence
options.max_sequence_length -= 2

log.writeln('Tokenizing input file %s...' % options.input_file)
tokenizer = bert.tokenization.FullTokenizer(
    vocab_file=options.vocab_file,
    do_lower_case=True
)

num_lines = 0
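
# The splitting loop is cut off above. As an illustration, a minimal
# sketch of windowing a token list into subsequences of at most max_len
# tokens with a fractional overlap, matching the options logged above;
# illustrative only, not the script's exact loop:
def splitWithOverlap(tokens, max_len, overlap):
    '''Split tokens into windows of <= max_len tokens, each sharing
    int(max_len * overlap) tokens with the previous window.'''
    step = max(1, max_len - int(max_len * overlap))
    subsequences = []
    for start in range(0, len(tokens), step):
        subsequences.append(tokens[start:start + max_len])
        if start + max_len >= len(tokens):
            break
    return subsequences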