def wordnetFirstSenseBaseline(mentions, mention_map, predsf):
    # First-sense baseline: pick the first WordNet synset whose first lemma
    # matches the target lemma, falling back to the first synset overall.
    # Note: this does not reproduce the candidate ranking used by
    # Raganato et al., and it substantially underperforms their
    # FirstSense baseline.
    predictions, correct = [], 0
    for m in mentions:
        (_, _, lemma) = mention_map[m.ID]
        synsets = wn.synsets(lemma)
        j = 0
        for k in range(len(synsets)):
            if synsets[k].lemmas()[0].name() == lemma:
                j = k
                break
        guess = synsets[j].lemmas()[0].key()
        predictions.append((m.ID, guess))
        if guess == m.CUI:
            correct += 1

    writeWSDFrameworkPredictions(predictions, mention_map, predsf)

    log.writeln('-- WordNet first sense baseline --')
    log.writeln('Accuracy: {0:.4f} ({1:,}/{2:,})\n'.format(
        float(correct) / len(predictions), correct, len(predictions)))
def __init__(self, datadir=None, verbose=False):
    self._ambig_sets = {}
    if not datadir:
        datadir = _datadir
    for f in glob.glob(os.path.join(datadir, '*_pmids_tagged.arff')):
        if verbose:
            log.writeln(' >> Parsing %s' % f)
        ambig_set = parser.parseFile(f)
        concept = os.path.basename(f).split('_')[0]
        self._ambig_sets[concept] = ambig_set
def getAllMentions(dataset, window_size, word_filter, concept_filter, log=log):
    samples = []
    log.track(message=' >> Extracted features from {0}/%d documents...' % len(dataset),
              writeInterval=1)
    for ambig in dataset:
        for instance in ambig.instances:
            if concept_filter(instance.CUI):
                samples.append(getSingleMention(
                    instance, window_size, word_filter, ambig.labels))
        log.tick()
    log.writeln()
    return samples
def train(model, src_embs, trg_embs, train_keys, dev_keys, batch_size=5):
    train_keys = list(train_keys)
    dev_keys = list(dev_keys)

    training = True
    batch_start, iter_losses = 0, []
    prev_dev_loss = None
    cur_iter, new_iter = 0, True

    while training:
        if new_iter:
            if cur_iter > 0:
                # run on dev set
                dev_loss = evalOnDev(model, src_embs, trg_embs, dev_keys,
                                     batch_size=batch_size)
                log.writeln("  Iteration %d -- Dev MSE: %f" % (cur_iter, dev_loss))
                if cur_iter > 1 and dev_loss > prev_dev_loss:
                    training = False
                    log.writeln('  >> Reached dev-based convergence <<')
                else:
                    prev_dev_loss = dev_loss
                    # save checkpoint
                    model.checkpoint(cur_iter)
            # set up for next training batch
            random.shuffle(train_keys)
            cur_iter += 1
            batch_start = 0
            iter_losses = []
            new_iter = False

        if training:
            batch_keys = train_keys[batch_start:batch_start + batch_size]
            batch_src = np.array([src_embs[k] for k in batch_keys])
            batch_trg = np.array([trg_embs[k] for k in batch_keys])
            loss = model.train_batch(batch_src, batch_trg)
            iter_losses.append(loss)

            batch_start += batch_size
            if batch_start >= len(train_keys):
                new_iter = True

    model.rollback()
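# train() calls evalOnDev(), which is not shown in this excerpt. Below is a
# minimal sketch of what it is assumed to do: batched mean squared error of
# the model's projections over the dev keys, using the same project_batch()
# interface that crossfoldTrain() relies on. The real implementation may differ.
import numpy as np

def evalOnDev(model, src_embs, trg_embs, dev_keys, batch_size=5):
    losses, batch_start = [], 0
    while batch_start < len(dev_keys):
        batch_keys = dev_keys[batch_start:batch_start + batch_size]
        batch_src = np.array([src_embs[k] for k in batch_keys])
        batch_trg = np.array([trg_embs[k] for k in batch_keys])
        # assumes the model exposes project_batch(), as used in crossfoldTrain()
        projected = model.project_batch(batch_src)
        losses.append(np.mean(np.sum((projected - batch_trg) ** 2, axis=1) / 2))
        batch_start += batch_size
    return float(np.mean(losses))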
def twoModelEvaluate(dataset, ent_emb_wrapper, str_emb_wrapper, sim_metric,
                     log_predictions=False, use_cross=False, cross_only=False,
                     use_mean=False, skips_f=None):
    log.writeln('\n\n  Using cross: %s\n  Using cross only: %s\n  Using mean: %s\n' % (
        str(use_cross), str(cross_only), str(use_mean)))

    # check to see how many dataset items are comparable
    comparable, full_comparable = [], []
    prepared = prepare(dataset.full_data)

    if skips_f:
        skips = readSkips(skips_f)
        skips = skips.get(dataset.name, set())
    else:
        skips = set()

    for i in range(len(prepared)):
        if i in skips:
            continue
        full_datum = prepared[i]
        (e_1, str_1, e_2, str_2, _) = full_datum
        if ent_emb_wrapper.knows(e_1) and ent_emb_wrapper.knows(e_2) \
                and str_emb_wrapper.knows(str_1) and str_emb_wrapper.knows(str_2):
            comparable.append(full_datum)
            full_comparable.append(dataset.full_data[i])
        else:
            log.writeln('SKIPPING %d' % i)

    gold, pred = [], []
    for (e_1, str_1, e_2, str_2, gold_metric) in comparable:
        gold.append(gold_metric)
        scores = [
            sim_metric(ent_emb_wrapper[e_1], ent_emb_wrapper[e_2]),
            sim_metric(str_emb_wrapper[str_1], str_emb_wrapper[str_2])
        ]
        if use_cross:
            if cross_only:
                scores = []
            scores.append(sim_metric(ent_emb_wrapper[e_1], str_emb_wrapper[str_2]))
            scores.append(sim_metric(str_emb_wrapper[str_1], ent_emb_wrapper[e_2]))
        if use_mean:
            pred.append(np.mean(scores))
        else:
            pred.append(np.sum(scores))

    if log_predictions:
        logPredictions(full_comparable, pred, gold, dataset.name, log=log)

    (rho, _) = spearmanr(gold, pred)
    return rho, len(comparable), len(dataset.data)
def calculateNearestNeighbors(embeds, outf, top_k=100, batch_size=100, threads=1):
    log.writeln('Calculating nearest neighbors')
    keys = tuple(embeds.keys())
    emb_list = [embeds[k] for k in keys]

    all_ixes = range(len(keys))
    thread_chunks = util.prepareForParallel(all_ixes, threads, data_only=True)
    nn_q = mp.Queue()

    calc_threads = [
        mp.Process(target=_threadedNearestNeighbors,
                   args=(thread_chunks[i], batch_size, top_k, emb_list, nn_q))
        for i in range(threads)
    ]
    collator = mp.Process(target=_collate,
                          args=(keys, (len(keys) // batch_size) + 1, nn_q, outf))

    collator.start()
    util.parallelExecute(calc_threads)
    nn_q.put(_SIGNALS.HALT)
    collator.join()
def getAllMentions(datasets, log=log, mention_map_file=None):
    ds_map = {}

    # pre-generate the vocabulary of all datasets
    all_sentences = []
    for ds in datasets:
        all_sentences.extend(ds.sentences_words)
    prepVocabulary(all_sentences, datasets[0].config['Experiment']['TotalVocab'])

    params = ELMoParams(
        options_file=datasets[0].config['ELMo']['Options'],
        weights_file=datasets[0].config['ELMo']['Weights'],
        vocab_file=datasets[0].config['Experiment']['TotalVocab'],
        max_char_len=int(datasets[0].config['ELMo']['MaxCharLen']),
    )
    elmo_batch_size = int(datasets[0].config['ELMo']['BatchSize'])

    sess = tf.Session()
    elmo = ELMoRunner(sess, params)

    samples = []
    for ds in datasets:
        log.writeln('\nProcessing dataset %s...' % ds.name)
        _getELMoMentions(ds.sentences_words, ds.sentences_instances, ds.labels,
                         ds.name, samples, ds_map, elmo,
                         batch_size=elmo_batch_size)

    if mention_map_file:
        with open(mention_map_file, 'w') as stream:
            for (mention_ID, (ds_name, instance_ID, lemma)) in ds_map.items():
                stream.write('%d\t%s\t%s\t%s\n' % (mention_ID, ds_name, instance_ID, lemma))

    return samples
def enumerateWordNetPairs(vocab, outf, write_lemma=False):
    data = []
    in_vocab = lambda synset: synset.lemmas()[0].name() in vocab

    for pos in ['n', 'v', 'a', 'r']:
        n_pairs = 0
        log.writeln('Processing POS "%s"' % pos)
        log.track(message=' >> Processed {0:,} source synsets ({1:,} pairs)',
                  writeInterval=100)
        for synset in wn.all_synsets(pos):
            if in_vocab(synset):
                for (getter, lbl) in [
                    (synset.hyponyms, dataset.Hyponym),
                    (synset.hypernyms, dataset.Hypernym),
                    (synset.member_holonyms, dataset.Holonym),
                    (synset.substance_holonyms, dataset.Holonym),
                    (synset.part_holonyms, dataset.Holonym),
                    (synset.member_meronyms, dataset.Meronym),
                    (synset.substance_meronyms, dataset.Meronym),
                    (synset.part_meronyms, dataset.Meronym),
                ]:
                    for sink in getter():
                        if in_vocab(sink):
                            if write_lemma:
                                src = synset.lemmas()[0].name()
                                snk = sink.lemmas()[0].name()
                            else:
                                src = synset.name()
                                snk = sink.name()
                            data.append((len(data), src, snk, lbl))
                            n_pairs += 1
            log.tick(n_pairs)
        log.flushTracker(n_pairs)
        log.writeln('')

    dataset.write(data, outf)
def getELMoRepresentations(sentences_words, sentences_instances, semcor_labels,
                           unique_sense_IDs, bilm_params):
    sense_embeddings = {}
    for sense_ID in unique_sense_IDs:
        sense_embeddings[sense_ID] = []

    with tf.Session() as sess:
        log.writeln('  (1) Setting up ELMo')
        elmo = ELMoRunner(sess, bilm_params)

        # batch up the data
        sentence_ids = elmo.preprocess(sentences_words)
        batch_size = 25
        num_batches = math.ceil(sentence_ids.shape[0] / batch_size)
        batch_start = 0

        log.writeln('  (2) Extracting sense embeddings from sentences')
        log.track(message=' >> Processed {0}/{1:,} batches'.format('{0:,}', num_batches),
                  writeInterval=5)
        while batch_start < sentence_ids.shape[0]:
            batch_sentence_ids = sentence_ids[batch_start:batch_start + batch_size]
            elmo_sentence_input_ = elmo(batch_sentence_ids)

            for i in range(elmo_sentence_input_.shape[0]):
                sentence_indices = sentences_instances[batch_start + i]
                for (instance_ID, ix) in sentence_indices:
                    senses = semcor_labels[instance_ID]
                    for sense in senses:
                        sense_embeddings[sense].append(elmo_sentence_input_[i][ix])
            log.tick()
            batch_start += batch_size
        log.flushTracker()

    log.writeln('  (3) Calculating mean per-sense embeddings')
    mean_sense_embeddings = pyemblib.Embeddings()
    for (sense_ID, embedding_list) in sense_embeddings.items():
        if len(embedding_list) > 0:
            mean_sense_embeddings[sense_ID] = np.mean(embedding_list, axis=0)
        else:
            log.writeln('[WARNING] Sense ID "%s" found no embeddings' % sense_ID)

    return mean_sense_embeddings
def KNearestNeighbors(emb_arr, node_IDs, top_k, neighbor_file, threads=2,
                      batch_size=5, completed_neighbors=None):
    '''Compute the top_k nearest neighbors (by cosine similarity) of each
    embedding in emb_arr, writing results to neighbor_file as they are
    computed. Indices listed in completed_neighbors (e.g., from a partial
    previous run) are skipped.
    '''
    # set up threads
    log.writeln('1 | Thread initialization')
    all_indices = list(range(len(emb_arr)))
    if completed_neighbors:
        filtered_indices = []
        for ix in all_indices:
            if ix not in completed_neighbors:
                filtered_indices.append(ix)
        all_indices = filtered_indices
        log.writeln(' >> Filtered out {0:,} completed indices'.format(
            len(emb_arr) - len(filtered_indices)))
        log.writeln(' >> Filtered set size: {0:,}'.format(len(all_indices)))
    index_subsets = util.prepareForParallel(all_indices, threads - 1, data_only=True)
    nn_q = mp.Queue()
    nn_writer = mp.Process(target=_nn_writer, args=(neighbor_file, node_IDs, nn_q))
    computers = [
        mp.Process(target=_threadedNeighbors,
                   args=(index_subsets[i], emb_arr, batch_size, top_k, nn_q))
        for i in range(threads - 1)
    ]

    nn_writer.start()
    log.writeln('2 | Neighbor computation')
    util.parallelExecute(computers)
    nn_q.put(_SIGNALS.HALT)
    nn_writer.join()
def buildGraph(neighbor_files, k):
    log.writeln('Building neighborhood graph...')
    graph = {}

    # construct frequency-weighted edges
    log.track(message=' >> Loaded {0}/%d neighborhood files' % len(neighbor_files),
              writeInterval=1)
    for neighbor_file in neighbor_files:
        neighborhoods = readNeighbors(neighbor_file, k)
        for (source, neighbors) in neighborhoods.items():
            if graph.get(source, None) is None:
                graph[source] = {}
            for nbr in neighbors:
                graph[source][nbr] = graph[source].get(nbr, 0) + 1
        log.tick()
    log.flushTracker()

    log.writeln(' >> Normalizing edge weights...')
    max_count = float(len(neighbor_files))
    for (source, neighborhood) in graph.items():
        for (nbr, freq) in neighborhood.items():
            graph[source][nbr] = freq / max_count

    log.writeln('Graph complete!')
    return graph
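# buildGraph() relies on readNeighbors(), which is not shown in this excerpt.
# A minimal sketch under an assumed file format (one line per node:
# "node_id<TAB>neighbor1,neighbor2,..." in ranked order); the parser and file
# format actually used in this repo may differ.
def readNeighbors(neighbor_file, k):
    neighborhoods = {}
    with open(neighbor_file, 'r') as stream:
        for line in stream:
            (source, nbr_str) = line.rstrip('\n').split('\t')
            # keep only the k highest-ranked neighbors
            neighborhoods[source] = nbr_str.split(',')[:k]
    return neighborhoods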
        help='number of threads to use for parallel calculation (default: %default)',
        type='int', default=1)
    parser.add_option('--batch-size', dest='batch_size',
        help='number of samples to process in each batch (default: %default)',
        type='int', default=25)
    parser.add_option('--keys', dest='keysf',
        help='file listing keys to restrict NN analysis to')
    parser.add_option('-l', '--logfile', dest='logfile',
        help='name of file to write log contents to (empty for stdout)',
        default=None)
    (options, args) = parser.parse_args()
    if len(args) != 2:
        parser.print_help()
        exit()
    embf, outf = args
    return (embf, options.mode, options.keysf, outf, options.top_k,
            options.batch_size, options.threads, options.logfile)

embf, embf_mode, keysf, outf, top_k, batch_size, threads, logfile = _cli()

if keysf:
    keys = readKeys(keysf)
    print("Read %d keys to restrict to" % len(keys))
else:
    keys = None

t = log.startTimer('Reading embeddings...', newline=False)
embeds = pyemblib.read(embf, mode=embf_mode, filter_to=keys, lower_keys=True)
log.stopTimer(t, message='Done! Read %d embeddings ({0:.2f}s)' % len(embeds))

nearest_neighbors = calculateNearestNeighbors(embeds, outf, top_k=top_k,
                                              batch_size=batch_size, threads=threads)
log.writeln('Wrote nearest neighbors to %s.' % outf)
def _build(self):
    self._input = tf.placeholder(dtype=tf.float32,
                                 shape=[None, 2, self.p.embedding_dim],
                                 name='embedding_pair_input')
    self._labels = tf.placeholder(dtype=tf.int32, shape=[None], name='labels')

    if self._debug:
        log.writeln(str(self._input))
        log.writeln(str(self._labels))
        _input = tf.Print(self._input, [self._input], summarize=_SUMMARIZE,
                          message='Embedding input')
        labels = tf.Print(self._labels, [self._labels], summarize=_SUMMARIZE,
                          message='Labels')
    else:
        _input = self._input
        labels = self._labels

    conv_filters = tf.Variable(tf.truncated_normal(
        [
            2,                    # filter height is always 2
            self.p.filter_width,
            1,
            self.p.num_filters
        ],
        stddev=0.5))
    if self._debug:
        log.writeln(str(conv_filters))
        conv_filters = tf.Print(conv_filters, [conv_filters], summarize=_SUMMARIZE,
                                message='Convolutional filters')

    cnn = tf.nn.conv2d(
        input=tf.reshape(_input, [-1, 2, self.p.embedding_dim, 1]),
        filter=conv_filters,
        strides=[1, self.p.filter_vstride, self.p.filter_hstride, 1],
        padding="SAME",
        name='CNN_op')
    if self._debug:
        log.writeln(str(cnn))
        cnn = tf.Print(cnn, [cnn], summarize=_SUMMARIZE, message='CNN output')

    pooled = tf.nn.max_pool(
        value=cnn,
        ksize=[
            1,
            (self.p.filter_vstride % 2) + 1,  # pool height is determined
                                              # by filter vstride
            self.p.pool_width,
            1
        ],
        strides=[
            1,
            (self.p.filter_vstride % 2) + 1,  # always reduces to 1
            self.p.pool_hstride,
            1
        ],
        padding='SAME',
        name='max_pooled_CNN')
    if self._debug:
        log.writeln(str(pooled))
        pooled = tf.Print(pooled, [pooled], summarize=_SUMMARIZE,
                          message='Max pooled CNN output')

    pooled = tf.reshape(pooled, shape=[
        -1,  # batch_size
        ((self.p.embedding_dim // self.p.pool_hstride) * self.p.num_filters)
    ])
    if self._debug:
        log.writeln(str(pooled))
        pooled = tf.Print(pooled, [pooled], summarize=_SUMMARIZE,
                          message='Squeezed pooled')

    pooled = tf.nn.dropout(pooled, 0.6, name='pooled_with_dropout')
    if self._debug:
        log.writeln(str(pooled))
        pooled = tf.Print(pooled, [pooled], summarize=_SUMMARIZE,
                          message='Dropout pooled')

    full = tf.contrib.layers.fully_connected(pooled, self.p.fully_connected_dim,
                                             activation_fn=tf.nn.relu)
    if self._debug:
        log.writeln(str(full))
        full = tf.Print(full, [full], summarize=_SUMMARIZE,
                        message='Fully connected output')

    output_layer = tf.contrib.layers.fully_connected(full, self.p.num_classes,
                                                     activation_fn=None)
    if self._debug:
        log.writeln(str(output_layer))
        output_layer = tf.Print(output_layer, [output_layer], summarize=_SUMMARIZE,
                                message='Output layer')

    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels,
        logits=output_layer,
    )
    self._batch_loss = tf.reduce_sum(loss)

    self._scores = tf.nn.softmax(output_layer)
    self._predictions = tf.argmax(self._scores, axis=1)

    optimizer = tf.train.AdamOptimizer(learning_rate=0.001, beta1=0.9,
                                       beta2=0.999, epsilon=1e-08)
    self._train_step = optimizer.minimize(loss)
def getStatistics(f1, f2):
    preds1 = readPredictions(f1)
    preds2 = readPredictions(f2)

    for ds in preds1.keys():
        log.writeln(('\n\n{0}\n### %s\n{0}\n\n'.format('#' * 80)) % ds)

        (lbl_scores_1, gold_1) = preds1[ds]
        (lbl_scores_2, gold_2) = preds2[ds]

        (ab, ab_size) = correlation(lbl_scores_1, lbl_scores_2, '%s -- A vs B' % ds)
        (at, at_size) = correlation(lbl_scores_1, gold_1, '%s -- A vs GOLD' % ds)
        (bt, bt_size) = correlation(lbl_scores_2, gold_2, '%s -- B vs GOLD' % ds)

        log.writeln("\n  -- %s Agreement summary --" % ds)
        log.writeln("   |r_bt - r_at| = %f" % abs(at - bt))
        log.writeln("   r_ab = %f  (%d)" % (ab, ab_size))
        log.writeln("   r_at = %f  (%d)" % (at, at_size))
        log.writeln("   r_bt = %f  (%d)" % (bt, bt_size))
if __name__ == '__main__':
    def _cli():
        import optparse
        parser = optparse.OptionParser(usage='Usage: %prog LOG1 LOG2')
        parser.add_option('-l', '--logfile', dest='logfile')
        (options, args) = parser.parse_args()
        if len(args) != 2:
            parser.print_help()
            exit()
        return args, options.logfile

    (f1, f2), logfile = _cli()

    log.start(logfile=logfile, stdout_also=True)
    log.writeln('A: %s' % f1)
    log.writeln('B: %s' % f2)

    getStatistics(f1, f2)
def crossValidationSplits(dataset, n_folds, dev_size, persistent_path=None,
                          random_seed=1, log=log):
    if persistent_path and os.path.isfile('%s.fold-0.train' % persistent_path):
        log.writeln('Reading pre-existing cross validation splits from %s.' % persistent_path)
        splits = readSplits(persistent_path, n_folds, id_cast=int)
    else:
        log.writeln('Generating cross-validation splits...')
        np.random.seed(random_seed)

        ids_by_class, classes = stratifyByClass(dataset)

        total_size = 0
        for (lbl, ids) in ids_by_class.items():
            total_size += len(ids)
        log.writeln('  Dataset size: {0:,}'.format(total_size))
        log.writeln('  Number of classes: {0:,}'.format(len(classes)))

        # shuffle it
        for _class in classes:
            np.random.shuffle(ids_by_class[_class])

        # figure out how many points of each class per fold
        fold_size_by_class, dev_size_by_class = getFoldAndDevSizeByClass(
            ids_by_class, n_folds, dev_size)

        labeled_splits, id_splits = [], []
        for i in range(n_folds):
            train_by_class = {}
            for _class in classes:
                train_by_class[_class] = []

            for j in range(n_folds):
                fold_by_class = {}
                for _class in classes:
                    fold_size = fold_size_by_class[_class]
                    if j < (n_folds - 1):
                        fold_by_class[_class] = ids_by_class[_class][j*fold_size:(j+1)*fold_size]
                    else:
                        fold_by_class[_class] = ids_by_class[_class][j*fold_size:]
                if j == i:
                    test_by_class = fold_by_class.copy()
                else:
                    for (_class, subset) in fold_by_class.items():
                        train_by_class[_class].extend(subset)

            # sample out dev data
            train_by_class, dev_by_class = subsampleDevByClass(
                train_by_class, dev_size_by_class)

            # collapse train, dev, test to flat ID lists
            lbl_train, id_train = collapseFromByClass(train_by_class)
            lbl_dev, id_dev = collapseFromByClass(dev_by_class)
            lbl_test, id_test = collapseFromByClass(test_by_class)

            labeled_splits.append((lbl_train, lbl_dev, lbl_test))
            id_splits.append((id_train, id_dev, id_test))

            log.writeln('  Fold {0} -- Train: {1:,}  Dev: {2:,}  Test: {3:,}'.format(
                i+1, len(id_train), len(id_dev), len(id_test)))

        if persistent_path:
            log.writeln('Writing cross validation splits to %s.' % persistent_path)
            writeSplits(labeled_splits, persistent_path)

        splits = id_splits

    log.writeln()
    return splits
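# crossValidationSplits() relies on stratifyByClass(), not shown in this
# excerpt. A plausible minimal sketch, assuming the dataset yields
# (sample_id, label) pairs; the repo's actual data structures may differ.
def stratifyByClass(dataset):
    ids_by_class = {}
    for (sample_id, label) in dataset:
        # group sample IDs by their class label
        ids_by_class.setdefault(label, []).append(sample_id)
    classes = sorted(ids_by_class.keys())
    return ids_by_class, classes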
def runModel(mentions, entity_embeds, ctx_embeds, minibatch_size, preds_file,
             debug=False, secondary_entity_embeds=None, entity_combo_method=None,
             using_mention=False, preds_file_detailed=None, preferred_strings=None,
             preds_file_polysemy=None, polysemy=None):
    entity_vocab, entity_arr = entity_embeds.toarray()
    ctx_vocab, ctx_arr = ctx_embeds.toarray()
    if secondary_entity_embeds:
        secondary_entity_vocab, secondary_entity_arr = secondary_entity_embeds.toarray()
        secondary_entity_arr_2 = []
        for v in secondary_entity_vocab:
            secondary_entity_arr_2.append(np.array(secondary_entity_embeds[v]))
        secondary_entity_arr_2 = np.array(secondary_entity_arr_2)
    else:
        secondary_entity_vocab, secondary_entity_arr = None, None

    ent_ixer = Indexer(entity_vocab)
    ctx_ixer = Indexer(ctx_vocab)
    if secondary_entity_embeds:
        secondary_ent_ixer = Indexer(secondary_entity_vocab)
    else:
        secondary_ent_ixer = None

    max_num_entities = 0
    for m in mentions:
        if len(m.candidates) > max_num_entities:
            max_num_entities = len(m.candidates)

    max_mention_size = 0
    for m in mentions:
        n_tokens = len(m.mention_text.split())
        if n_tokens > max_mention_size:
            max_mention_size = n_tokens

    window_size = 5
    params = LLParams(
        ctx_vocab_size=len(ctx_vocab),
        ctx_dim=ctx_embeds.size,
        entity_vocab_size=len(entity_vocab),
        entity_dim=entity_embeds.size,
        secondary_entity_vocab_size=(0 if not secondary_entity_embeds
                                     else len(secondary_entity_vocab)),
        secondary_entity_dim=(0 if not secondary_entity_embeds
                              else secondary_entity_embeds.size),
        window_size=window_size,
        max_num_entities=max_num_entities,
        max_mention_size=max_mention_size,
        entity_combo_method=entity_combo_method,
        using_mention=using_mention)

    session = tf.Session()
    lll = LinearSabbirLinkerC(
        session,
        np.array(ctx_arr),
        np.array(entity_arr),
        params,
        debug=debug,
        secondary_entity_embed_arr=np.array(secondary_entity_arr))

    log.track(message=' >>> Processed {0} batches', writeInterval=10)

    if secondary_entity_embeds:
        ent_vs_sec = McNemars()
        ent_vs_joint = McNemars()
        sec_vs_joint = McNemars()
        joint_vs_oracle = McNemars()

    correct, total = 0., 0
    batch_start = 0
    oracle = {}
    while batch_start < len(mentions):
        next_batch_mentions = mentions[batch_start:batch_start + minibatch_size]
        next_batch = [
            prepSample(mention, ent_ixer, ctx_ixer, window_size,
                       max_mention_size, max_num_entities,
                       secondary_ent_ixer=secondary_ent_ixer)
            for mention in next_batch_mentions
        ]
        batch_ctx_window_ixes = [next_batch[i][0] for i in range(len(next_batch))]
        batch_ctx_window_masks = [next_batch[i][1] for i in range(len(next_batch))]
        batch_mention_ixes = [next_batch[i][2] for i in range(len(next_batch))]
        batch_mention_masks = [next_batch[i][3] for i in range(len(next_batch))]
        batch_entity_ixes = [next_batch[i][4] for i in range(len(next_batch))]
        batch_entity_masks = [next_batch[i][5] for i in range(len(next_batch))]
        if secondary_entity_embeds:
            batch_secondary_entity_ixes = [next_batch[i][6] for i in range(len(next_batch))]
        else:
            batch_secondary_entity_ixes = None

        results = lll.getPredictions(
            batch_ctx_window_ixes,
            batch_ctx_window_masks,
            batch_entity_ixes,
            batch_entity_masks,
            batch_secondary_entity_ixes=batch_secondary_entity_ixes,
            batch_mention_ixes=batch_mention_ixes,
            batch_mention_masks=batch_mention_masks,
            oracle=True)
        if secondary_entity_embeds:
            (preds, probs, ent_preds, secondary_ent_preds) = results
        else:
            (preds, probs, ent_preds) = results

        for i in range(len(next_batch)):
            (_, _, _, _, ent_ixes, _, _, correct_candidate, mention) = next_batch[i]

            # base accuracy eval
            predicted_ix = ent_ixes[preds[i]]
            if predicted_ix == correct_candidate:
                correct += 1
            total += 1

            # oracle eval
            joint_correct, entity_correct, secondary_correct, oracle_correct = \
                False, False, False, False
            if ent_ixes[ent_preds[i]] == correct_candidate:
                entity_correct = True
                oracle['entity_correct'] = oracle.get('entity_correct', 0) + 1
            if secondary_entity_embeds and ent_ixes[preds[i]] == correct_candidate:
                joint_correct = True
                oracle['joint_correct'] = oracle.get('joint_correct', 0) + 1
            if secondary_entity_embeds and \
                    ent_ixes[secondary_ent_preds[i]] == correct_candidate:
                secondary_correct = True
                oracle['secondary_correct'] = oracle.get('secondary_correct', 0) + 1
            if entity_correct or secondary_correct:
                oracle_correct = True
                oracle['oracle_correct'] = oracle.get('oracle_correct', 0) + 1

            # significance tracking
            if secondary_entity_embeds:
                # entity vs secondary
                if entity_correct and secondary_correct:
                    ent_vs_sec.a += 1
                elif entity_correct and (not secondary_correct):
                    ent_vs_sec.b += 1
                elif (not entity_correct) and secondary_correct:
                    ent_vs_sec.c += 1
                else:
                    ent_vs_sec.d += 1
                # entity vs joint
                if entity_correct and joint_correct:
                    ent_vs_joint.a += 1
                elif entity_correct and (not joint_correct):
                    ent_vs_joint.b += 1
                elif (not entity_correct) and joint_correct:
                    ent_vs_joint.c += 1
                else:
                    ent_vs_joint.d += 1
                # secondary vs joint
                if secondary_correct and joint_correct:
                    sec_vs_joint.a += 1
                elif secondary_correct and (not joint_correct):
                    sec_vs_joint.b += 1
                elif (not secondary_correct) and joint_correct:
                    sec_vs_joint.c += 1
                else:
                    sec_vs_joint.d += 1
                # joint vs oracle
                if joint_correct and oracle_correct:
                    joint_vs_oracle.a += 1
                elif joint_correct and (not oracle_correct):
                    joint_vs_oracle.b += 1
                elif (not joint_correct) and oracle_correct:
                    joint_vs_oracle.c += 1
                else:
                    joint_vs_oracle.d += 1

            # predictions + scores
            if preds_file:
                preds_file.write('Probs: [ %s ]  Pred: %d -> %d  Gold: %d\n' % (
                    ' '.join([str(p) for p in probs[i]]),
                    preds[i], ent_ixes[preds[i]], correct_candidate))

            # predictions + corpus polysemy of correct entity
            if preds_file_polysemy:
                try:
                    line = '%d\t%f\n' % (
                        (1 if predicted_ix == correct_candidate else 0),
                        polysemy[ent_ixer[predicted_ix]])
                    preds_file_polysemy.write(line)
                except KeyError:
                    pass

            # predictions, in detail
            if preds_file_detailed:
                keys = ['all']
                if secondary_entity_embeds:
                    pred_ixes = [('Pred (Joint)', ent_ixes[preds[i]]),
                                 ('Pred (Ent)', ent_ixes[ent_preds[i]]),
                                 ('Pred (Defn)', ent_ixes[secondary_ent_preds[i]])]
                    if entity_correct and secondary_correct:
                        comp_stream_key = 'both_correct'
                    elif entity_correct and (not secondary_correct):
                        comp_stream_key = 'entity_only_correct'
                    elif (not entity_correct) and secondary_correct:
                        comp_stream_key = 'secondary_only_correct'
                    else:
                        comp_stream_key = 'both_wrong'
                    keys.append(comp_stream_key)

                    if (not entity_correct) and joint_correct:
                        keys.append('ent_joint_help')
                    elif entity_correct and (not joint_correct):
                        keys.append('ent_joint_hurt')
                    if (not secondary_correct) and joint_correct:
                        keys.append('sec_joint_help')
                    if secondary_correct and (not joint_correct):
                        keys.append('sec_joint_hurt')
                else:
                    pred_ixes = [('Pred', predicted_ix)]
                    if entity_correct:
                        stream_key = 'entity_correct'
                    else:
                        stream_key = 'entity_wrong'
                    keys.append(stream_key)

                for k in keys:
                    _writeDetailedOutcome(preds_file_detailed[k], mention, probs,
                                          batch_entity_ixes, batch_entity_masks,
                                          ent_ixer, preferred_strings,
                                          correct_candidate, pred_ixes, i)

        batch_start += minibatch_size
        log.tick()
    log.flushTracker()

    if secondary_entity_embeds:
        for (msg, mcn) in [('Entity vs Defn', ent_vs_sec),
                           ('Entity vs Joint', ent_vs_joint),
                           ('Defn vs Joint', sec_vs_joint),
                           ('Joint vs Oracle', joint_vs_oracle)]:
            chi2, pval = mcn.run()
            log.writeln('\n%s\n'
                        '  | a = %5d | b = %5d |\n'
                        '  | c = %5d | d = %5d |\n'
                        '  Chi^2 = %f  P-value = %f\n' % (
                            msg, mcn.a, mcn.b, mcn.c, mcn.d, chi2, pval))

    return correct, total, oracle
log.start(logfile=options.logfile, stdout_also=True)

configlogger.writeConfig(output=log, settings=[
    ('Dataset', options.mode),
    ('Using skip indices', ('None' if not options.skips_f else options.skips_f)),
    ('Embedding settings', em.logCLIOptions(options)),
    ('Scoring settings', OrderedDict([
        ('Combination of entity and string', options.use_combo),
        ('Cross comparison of entity/string', options.use_cross),
        ('Cross comparison only', options.cross_only),
        ('Using mean of scores instead of sum', options.use_mean)
    ])),
], title='Similarity/Relatedness experiment')

if not options.use_combo:
    log.writeln('\nMode: %s  Method: %s\n' % (options.mode, em.name(options.repr_method)))
    separator = '\t' if options.tab_sep else ' '
    emb_wrapper = em.getEmbeddings(options, log=log, separator=separator)
else:
    log.writeln('\nMode: %s  Method: COMBO\n' % options.mode)
    ent_embf, word_embf = options.ent_embf, options.word_embf
    separator = '\t' if options.tab_sep else ' '

    options.repr_method = em.ENTITY
    options.word_embf = None
    ent_emb_wrapper = em.getEmbeddings(options, log=log, separator=separator)

    options.repr_method = em.WORD
    options.ent_embf = None
    options.word_embf = word_embf
    str_emb_wrapper = em.getEmbeddings(options, log=log, separator=separator)
configlogger.writeConfig(log, [
    ('SemCor', [
        ('XML', config['SemCor']['XML']),
        ('Labels', config['SemCor']['Labels']),
    ]),
    ('Output file', config['SemCor']['Lemmas']),
])

t_sub = log.startTimer('Pre-processing SemCor text from %s...' % config['SemCor']['XML'])
(sentences_words, sentences_instances) = wsd_parser.processSentences(
    config['SemCor']['XML'], get_lemmas=True)
log.stopTimer(t_sub, message='Read {0:,} sentences in {1}s.\n'.format(
    len(sentences_words), '{0:.2f}'))

log.writeln('Collecting set of SemCor lemmas...')
lemmas = set()
for sentence_instances in sentences_instances:
    for (instance_ID, ix, lemma) in sentence_instances:
        lemmas.add(lemma)
log.writeln('Found {0:,} distinct lemmas.\n'.format(len(lemmas)))

log.writeln('Writing list of lemmas to %s...' % config['SemCor']['Lemmas'])
with codecs.open(config['SemCor']['Lemmas'], 'w', 'utf-8') as stream:
    for lemma in lemmas:
        stream.write('%s\n' % lemma)
log.writeln('Done.\n')

log.stop()
def ELMoBaseline(mentions, mention_map, backoff_preds, training_lemmas,
                 semcor_embeddings, output_predsf):
    log.writeln('Running ELMo baseline\n')

    # pre-norm the SemCor embeddings
    log.writeln('Norming SemCor embeddings...')
    normed_semcor_embeddings = pyemblib.Embeddings()
    for (k, v) in semcor_embeddings.items():
        normed_semcor_embeddings[k] = (v / np.linalg.norm(v))
    ordered_vocab, semcor_embeddings = normed_semcor_embeddings.toarray()
    semcor_embeddings = np.transpose(semcor_embeddings)
    log.writeln('Done.\n')

    predictions, correct = [], 0
    num_elmo, num_backoff = 0, 0

    log.track(message=' >> Processed {0:,}/%s samples ({1:,} ELMo, {2:,} backoff)' % (
        '{0:,}'.format(len(mentions))), writeInterval=5)
    for m in mentions:
        (ds, instance_ID, lemma) = mention_map[m.ID]
        if lemma in training_lemmas:
            prediction = getNearestNeighborKey2(m.context_repr, semcor_embeddings,
                                                ordered_vocab)
            num_elmo += 1
        else:
            prediction = backoff_preds[predictionID(ds, instance_ID)]
            num_backoff += 1
        predictions.append((m.ID, prediction))
        if prediction == m.CUI:
            correct += 1
        log.tick(num_elmo, num_backoff)
    log.flushTracker(num_elmo, num_backoff)

    writeWSDFrameworkPredictions(predictions, mention_map, output_predsf)

    log.writeln('\n-- ELMo baseline --')
    log.writeln('Accuracy: {0:.4f} ({1:,}/{2:,})\n'.format(
        float(correct) / len(predictions), correct, len(predictions)))
    log.writeln('# ELMo: {0:,}\n# backoff: {1:,}\n'.format(num_elmo, num_backoff))
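# ELMoBaseline() pre-normalizes and transposes the SemCor sense embeddings so
# that nearest-neighbor lookup reduces to a single matrix-vector product.
# A minimal sketch of getNearestNeighborKey2 consistent with that preparation;
# the actual helper in this repo may differ.
import numpy as np

def getNearestNeighborKey2(query, normed_emb_matrix_T, ordered_vocab):
    # normed_emb_matrix_T: (dim, n_senses); columns are unit-length sense embeddings
    query = query / np.linalg.norm(query)
    sims = np.dot(query, normed_emb_matrix_T)  # cosine similarity to every sense
    return ordered_vocab[int(np.argmax(sims))]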
    if (not options.inputf) or (not options.outputf):
        parser.print_help()
        exit()
    return options

options = _cli()
log.start(logfile=options.logfile)
configlogger.writeConfig(log, [
    ('Input file', options.inputf),
    ('Output file', options.outputf),
    ('# samples per class', options.size),
    ('Random seed', options.random_seed),
], 'WordNet dataset subsampling')

log.writeln('Reading dataset from %s...' % options.inputf)
ds = dataset.load(options.inputf)
log.writeln('Read {0:,} samples.\n'.format(len(ds)))

log.writeln('Collating by class...')
collated = collateByClass(ds)
classes = list(collated.keys())
classes.sort()
for c in classes:
    log.writeln('  {0} --> {1:,}'.format(c, len(collated[c])))
    if len(collated[c]) < options.size:
        log.writeln('[WARNING] subsample size is too large for class "{0}"'.format(c))

log.writeln('\nSubsampling...')
        ('Output predictions file', options.elmo_baseline_eval_predictions),
        ('SemCor embeddings', options.semcor_embf),
        ('Training lemmas file', options.training_lemmasf),
        ('Pre-calculated WN first sense backoff predictions',
         options.wordnet_baseline_input_predictions),
    ]),
], title="ELMo WSD baselines replication")

t_sub = log.startTimer('Reading mentions from %s...' % mentionf, newline=False)
mentions = mention_file.read(mentionf)
log.stopTimer(t_sub, message='Read %d mentions ({0:.2f}s)' % len(mentions))

log.writeln('Reading mention dataset data from %s...' % options.mention_mapf)
mention_map = dataset_map_utils.readDatasetMap(options.mention_mapf,
                                               get_IDs=True, get_lemmas=True)
log.writeln('Mapped dataset info for {0:,} mentions.\n'.format(len(mention_map)))

if options.wordnet_baseline_eval_predictions:
    wordnetFirstSenseBaseline(mentions, mention_map,
                              options.wordnet_baseline_eval_predictions)

if options.elmo_baseline_eval_predictions:
    log.writeln('Reading set of training lemmas from %s...' % options.training_lemmasf)
    training_lemmas = readTrainingLemmas(options.training_lemmasf)
    log.writeln('Read {0:,} lemmas.\n'.format(len(training_lemmas)))
    parser.add_option('-k', dest='k',
        help='number of neighbors to use for edge construction (default: %default)',
        type='int', default=10)
    parser.add_option('-l', '--logfile', dest='logfile',
        help='name of file to write log contents to (empty for stdout)',
        default=None)
    (options, args) = parser.parse_args()
    if len(args) == 0:
        parser.print_help()
        exit()
    neighbor_files = args
    return neighbor_files, options

neighbor_files, options = _cli()
log.start(logfile=options.logfile)
configlogger.writeConfig(log, [
    *[
        ('Neighborhood sample file %d' % (i+1), neighbor_files[i])
        for i in range(len(neighbor_files))
    ],
    ('Output file', options.outputf),
    ('Number of neighbors to include in edge construction', options.k),
], 'Nearest neighborhood graph generation')

graph = buildGraph(neighbor_files, options.k)

log.write('Writing graph to %s...' % options.outputf)
writeGraph(graph, options.outputf)
log.writeln('Done!')

log.stop()
        '-l', '--logfile', dest='logfile',
        help='name of file to write log contents to (empty for stdout)',
        default=None)
    (options, args) = parser.parse_args()
    if len(args) != 3:
        parser.print_help()
        exit()
    if not options.logfile:
        options.logfile = '%s.analysis.log' % args[2]
    return args, options.logfile

(resultsf, polysemyf, outf), logfile = _cli()
log.start(logfile=logfile, stdout_also=True)

log.writeln('Running sim/rel error analysis')
log.writeln('  Results file: %s' % resultsf)
log.writeln('  Polysemy file: %s' % polysemyf)
log.writeln('  Output files: %s' % outf)

results = readResults(resultsf)
polysemy = readPolysemy(polysemyf)
addPolysemy(results, polysemy)

for (dataset, dataset_res) in results.items():
    log.writeln('\nDataset: %s' % dataset)

    outfile = '%s.%s.tsv' % (outf, dataset)
    writePolyErrors(dataset_res, outfile)
    log.writeln('  Wrote errors w/ polysemy to: %s' % outfile)

    (coefs, intercept, r_sq) = runRegression(dataset_res)
    return options

options = _cli()
log.start(logfile=options.logfile)
configlogger.writeConfig(log, [
    ('Input embeddings', options.inputf),
    ('Vocabulary file', options.vocabf),
    ('Output embeddings', options.outputf),
    ('Output embeddings format', options.output_format),
])

t_sub = log.startTimer('Reading node2vec embeddings from %s...' % options.inputf)
e = pyemblib.read(options.inputf, format=pyemblib.Format.Word2Vec,
                  mode=pyemblib.Mode.Text)
log.stopTimer(t_sub, message='Read {0:,} embeddings in {1}s.\n'.format(
    len(e), '{0:.2f}'))

log.writeln('Reading vocabulary mapping from %s...' % options.vocabf)
vocab = readVocab(options.vocabf)
log.writeln('Read {0:,} vocab mappings.\n'.format(len(vocab)))

e = {vocab[int(k)]: v for (k, v) in e.items()}

log.writeln('Writing remapped embeddings to %s...' % options.outputf)
(fmt, mode) = pyemblib.CLI_Formats.parse(options.output_format)
pyemblib.write(e, options.outputf, format=fmt, mode=mode, verbose=True)
log.writeln('Done!')

log.stop()
def crossfoldTrain(src_embs, trg_embs, pivot_keys, nfold, activation, num_layers,
                   batch_size=5, checkpoint_file='checkpoint', random_seed=None):
    project_batch_size = batch_size * 10

    pivot_keys = list(pivot_keys)
    if random_seed:
        random.seed(random_seed)
    random.shuffle(pivot_keys)
    fold_size = int(np.ceil(len(pivot_keys) / nfold))

    mapped_embs = {}
    src_keys = list(src_embs.keys())
    for k in src_keys:
        mapped_embs[k] = np.zeros([trg_embs.size])

    session = tf.Session()
    params = MapperParams(src_dim=src_embs.size,
                          trg_dim=trg_embs.size,
                          map_dim=trg_embs.size,
                          activation=activation,
                          num_layers=num_layers,
                          checkpoint_file=checkpoint_file)

    for i in range(nfold):
        log.writeln('  Starting fold %d/%d' % (i + 1, nfold))
        if random_seed:
            this_random = random_seed + i
        else:
            this_random = None
        model = ManifoldMapper(session, params, random_seed=this_random)

        fold_start, fold_end = (i * fold_size), ((i + 1) * fold_size)
        train_keys = pivot_keys[:fold_start]
        dev_keys = pivot_keys[fold_start:fold_end]
        train_keys.extend(pivot_keys[fold_end:])

        train(model, src_embs, trg_embs, train_keys, dev_keys, batch_size=batch_size)

        # get projections from this fold
        log.writeln('  Getting trained projections for fold %d' % (i + 1))
        log.track(message='   >> Projected {0}/%d keys' % len(src_keys),
                  writeInterval=10000)
        batch_start = 0
        while batch_start < len(src_keys):
            batch_keys = src_keys[batch_start:batch_start + project_batch_size]
            batch_src = np.array([src_embs[k] for k in batch_keys])
            batch_mapped = model.project_batch(batch_src)
            for j in range(batch_mapped.shape[0]):
                key = batch_keys[j]
                mapped_embs[key] += batch_mapped[j]
                log.tick()
            batch_start += project_batch_size
        log.flushTracker()

    # mean projections
    for k in src_keys:
        mapped_embs[k] /= nfold

    # get final MSE over full pivot set
    final_errors = []
    for k in pivot_keys:
        diff = mapped_embs[k] - trg_embs[k]
        final_errors.append(np.sum(diff**2) / 2)
    log.writeln('\nPivot error in final projections: %f' % np.mean(final_errors))

    return mapped_embs
log.stopTimer(t_sub, message='Read %d embeddings ({0:.2f}s)' % len(word_embeds))

t_sub = log.startTimer('Reading entity definitions from %s...' % options.entity_defnf,
                       newline=False)
definitions = readDefinitions(options.entity_defnf)
log.stopTimer(t_sub, message='Read %d definitions ({0:.2f}s)' % len(definitions))

log.write('Constructing entity definition representations...')
entity_defn_embeds = embedDefinitions(definitions, word_embeds)
log.writeln('Embedded %d entity definitions.' % len(entity_defn_embeds))

if options.entity_dualf:
    dual_embeds = pyemblib.Embeddings()
    for (k, v) in entity_defn_embeds.items():
        if k in entity_embeds:
            dual_embeds[k] = np.concatenate([entity_embeds[k], v])
    log.writeln('Writing both versions of entity embeddings to %s...' % options.entity_dualf)
    pyemblib.write(dual_embeds, options.entity_dualf)
    log.writeln('Wrote %d dual embeddings.' % len(dual_embeds))
else:
    entity_defn_embeds = None

if options.stringsf:
    t_sub = log.startTimer('Reading preferred strings from %s...' %
    ('Ordered vocabulary file', options.vocabf),
    ('Number of nearest neighbors', options.k),
    ('Batch size', options.batch_size),
    ('Number of threads', options.threads),
    ('Partial nearest neighbors file for resuming', options.partial_neighbors_file),
], 'k Nearest Neighbor calculation with cosine similarity')

t_sub = log.startTimer('Reading embeddings from %s...' % embf)
emb = pyemblib.read(embf, mode=options.embedding_mode, errors='replace')
log.stopTimer(t_sub, message='Read {0:,} embeddings in {1}s.\n'.format(
    len(emb), '{0:.2f}'))

if not os.path.isfile(options.vocabf):
    log.writeln('Writing node ID <-> vocab map to %s...\n' % options.vocabf)
    writeNodeMap(emb, options.vocabf)
log.writeln('Reading node ID <-> vocab map from %s...\n' % options.vocabf)
node_map = readNodeMap(options.vocabf)

# get the vocabulary in node ID order, and map index in emb_arr to node IDs
node_IDs = list(node_map.keys())
node_IDs.sort()
ordered_vocab = [node_map[node_ID] for node_ID in node_IDs]
emb_arr = np.array([emb[v] for v in ordered_vocab])

if options.partial_neighbors_file:
log.start(logfile=options.logfile)

config = configparser.ConfigParser()
config.read(options.config)

analogy_file = datasets.getpath(options.dataset, config, eval_mode.ALL_INFO)

configlogger.writeConfig(log, settings=[
    ('Config file', options.config),
    ('Dataset', options.dataset),
    ('Path to dataset', analogy_file),
    ('Lowercasing analogies', options.to_lower),
    ('Output vocab file', vocabf),
], title='Vocabulary extraction from analogy dataset')

log.writeln('Reading %s analogies from %s...' % (options.dataset, analogy_file))
analogies = parsers.parse(
    analogy_file, options.dataset, eval_mode.ALL_INFO, data_mode.String,
    to_lower=options.to_lower)
log.writeln('Read {0:,} analogies in {1:,} relations.\n'.format(
    sum([len(anlg_set) for anlg_set in analogies.values()]),
    len(analogies)))

log.writeln('Extracting vocabulary...')
vocab = set()
for (_, anlg_set) in analogies.items():
                       mode=options.src_embf_mode, lower_keys=True)
log.stopTimer(t_sub, message='Read %d embeddings in {0:.2f}s' % len(src_embs))

t_sub = log.startTimer('Reading target embeddings from %s...' % options.trg_embf,
                       newline=False)
trg_embs = pyemblib.read(options.trg_embf, mode=options.trg_embf_mode,
                         lower_keys=True)
log.stopTimer(t_sub, message='Read %d embeddings in {0:.2f}s' % len(trg_embs))

pivots = readPivotsFile(options.pivotf, tolower=True)
log.writeln('Loaded %d pivot terms.' % len(pivots))

# double check that pivots are present in both embedding files
validated_pivots = set()
for pivot in pivots:
    if (not pivot in src_embs) or (not pivot in trg_embs):
        log.writeln('[WARNING] Pivot term "%s" not found in at least one embedding set' % pivot)
    else:
        validated_pivots.add(pivot)

# write the experimental configuration
configlogger.writeConfig('%s.config' % options.checkpointf,
                         title='DNN embedding mapping experiment',
                         settings=[