def compute_nearest_neighbours(definitions, abstracts):
    """
    Compute nearest neighbours from abstracts to definitions.

    Parameters
    ----------
    definitions : dictionary of dictionaries.
        A dictionary of dictionaries containing vectors.
        The top key is the Ambiguous term, the bottom key is the CUI.
        Example: {AMBIGTERM: {CUI1: VECTOR, CUI2: VECTOR}}
    abstracts : dictionary of dictionaries
        Like definitions.

    Returns
    -------
    result : dict
        A dictionary, the keys of which are the ambiguous terms, and the
        values are lists of tuples. The first item of each tuple is the
        true class, the second item of each tuple is the predicted class.
        example: {AMBIGTERM1: [(y1, y_pred1), (y2, y_pred2), ...]}
        Terms that have no entry in `definitions` are left out.

    """
    output = {}

    for k, v in abstracts.items():

        labels, vectors = dict_to_tuple(v)

        try:
            targets, matrix = dict_to_tuple(definitions[k])
        except KeyError:
            # No definitions available for this term: nothing to predict.
            continue

        matrix = Reach.normalize(np.asarray(matrix))
        vectors = Reach.normalize(np.asarray(vectors))

        # The predicted class is the definition with the highest dot product
        # against the abstract vector. For finite scores, argmax picks the
        # same index as the first element of an ascending sort over the
        # negated scores, without paying for a full sort.
        results = [targets[np.argmax(vec.dot(matrix.T))] for vec in vectors]

        output[k] = list(zip(labels, results))

    return output
def extract_online_lstm_embeddings(self,
                                   prune=False,
                                   normalize=True,
                                   verbose=False,
                                   provided_names=(),
                                   preprocess=False):
    """
    Embed names with the LSTM network and return them as a Reach object.

    :param prune: only used when no names are provided; the copy of the
        pretrained name embeddings is then passed through ``prune`` with the
        union of the exemplar and validation reference names
    :param normalize: forwarded as ``norm`` to ``sampling.vectorize_string``
    :param verbose: show a progress bar over the batches
    :param provided_names: names to embed; when empty, the items of the
        pretrained name embeddings are embedded instead
    :param preprocess: apply ``self.preprocess`` to every provided name
    :return: Reach instance mapping the input items to their LSTM embeddings
    """
    self.model.eval()

    if provided_names:
        input_items = provided_names
        if preprocess:
            input_items = [self.preprocess(name) for name in input_items]
    else:
        # Work on a copy: prune mutates the Reach object in place.
        embeddings = deepcopy(self.sampling.pretrained_name_embeddings)
        if prune:
            names_to_prune = set(
                self.sampling.exemplar_to_concept.keys()).union(
                    self.sampling.validation_references.keys())
            embeddings.prune(names_to_prune)
        input_items = [x for _, x in sorted(embeddings.indices.items())]

    # batch input items to save up on memory...
    all_embeddings = []
    batch_size = 500 if self.hidden_size >= 9600 else 1000

    # Pure inference: disabling autograd avoids building a graph per batch.
    with torch.no_grad():
        for i in tqdm(range(0, len(input_items), batch_size),
                      disable=not verbose):
            input_batch = input_items[i:i + batch_size]
            input_vectors = []
            for item in input_batch:
                vector = self.sampling.vectorize_string(item, norm=normalize)
                input_vectors.append(
                    torch.FloatTensor(vector).to(self.device))

            # pass through LSTM network
            lstm_embeddings = self.forward_lstm(input_vectors)
            online_batch = lstm_embeddings.detach().cpu().numpy()

            # add batch
            all_embeddings.append(online_batch)

    # convert to embeddings
    all_embeddings = np.concatenate(all_embeddings)
    online_embeddings = Reach(all_embeddings, input_items)

    return online_embeddings
def __init__(self, detection_list, language, model, k, backoff,
             pathtofrequencies, pathtomodel, pathtovectors):
    """
    Validate the input, then load the resources used for correction.

    :param detection_list: list with tuples containing (misspelling, list of 10 left context tokens, list of 10 right context tokens)
    :param language: 1 if English, 0 if Dutch
    :param model: 1 if context-sensitive, 0 if noisy channel
    :param k: number of ranked corrections returned
    :param backoff: stored unchanged as ``self.backoff``
    :param pathtofrequencies: JSON file holding the corpus frequencies
    :param pathtomodel: file handed to ``fasttext.load_model``
    :param pathtovectors: vector file handed to ``Reach.load`` (with header)
    """
    # prepare model
    print('Initializing spelling correction model...')

    first_entry = detection_list[0]
    assert len(first_entry) == 3, 'Wrong input format'

    columns = tuple(zip(*detection_list))
    self.misspellings, self.left_contexts, self.right_contexts = columns
    n_misspellings = len(self.misspellings)
    assert n_misspellings == len(self.left_contexts) == len(
        self.right_contexts), 'Input data not properly synchronized'
    print(n_misspellings, 'misspellings to correct')

    self.ranking_model = model
    assert self.ranking_model in (0, 1), 'No valid correction model specified'

    assert k >= 1, 'No valid k specified'
    self.k = k
    self.backoff = backoff

    if language == 1:
        language_code = 'en'
    elif language == 0:
        language_code = 'nl'
    else:
        raise ValueError('No valid language input specified')
    self.language = language_code

    # load embedding model and corpus frequencies
    with open(pathtofrequencies, 'r') as frequency_file:
        self.frequency_dict = json.load(frequency_file)
    self.model = fasttext.load_model(pathtomodel)
    self.r = Reach.load(pathtovectors, header=True)

    # set parameters for correction: (window size, OOV penalty) per language
    per_language = {'en': (9, 1.7), 'nl': (10, 2.4)}
    self.window_size, self.oov_penalty = per_language[self.language]

    print('Model initialized')
def get_grouped_questions(self, trainSet, simThreshold):
    """
    Group the questions of the training set and count their answer types.

    Each question is vectorized, averaged and normalized; `get_grouped_qid`
    then decides whether it joins an existing group or opens a new one.

    :param trainSet: iterable of documents exposing `getTextObject()`
    :param simThreshold: threshold forwarded to `get_grouped_qid`
    :return: tuple (grouped_questions, questions_type, grouped_questions_cat)
    """
    # {id: [list of similar questions, where each item is a list of covered
    # tokens in the question]}
    grouped_questions = defaultdict(list)
    questions_type = defaultdict(lambda: defaultdict(int))
    grouped_questions_cat = defaultdict(set)

    for doc in trainSet:
        for qap in self.segmenter.segment(doc.getTextObject()):
            question_tokens = doc.getTextObject().get_covered_tokens(
                qap.begQue, qap.endQue)

            # An identical question was already stored in some group.
            if any(question_tokens in group
                   for group in grouped_questions.values()):
                continue

            qVec = Resources.getWordVectors().vectorize(question_tokens,
                                                        remove_oov=True)
            if not qVec:
                continue

            unit_vector = Reach.normalize(np.mean(qVec, axis=0))

            # Id a new group would get; fixed before the lookup below.
            fresh_qid = len(grouped_questions)
            matched_qid = self.get_grouped_qid(unit_vector, grouped_questions,
                                               simThreshold)
            qid = fresh_qid if matched_qid is None else matched_qid

            grouped_questions[qid].append(question_tokens)

            ansType, cat = self.get_ans_type(qap.answers)
            if not ansType:
                continue

            questions_type[qid][ansType] += 1
            if cat:
                grouped_questions_cat[qid].add(cat)

    return (grouped_questions, questions_type, grouped_questions_cat)
def create_concepts(concepts, embeddings, include_np=True, labels=None):
    """
    Create concept vectors by averaging over descriptions in embedding space.

    Each description is lowercased, split on whitespace and vectorized; the
    mean of its word vectors represents the description, and the mean over
    all usable descriptions represents the concept.

    Parameters
    ----------
    concepts : dict
        Maps a concept name to an iterable of description strings.
    embeddings : Reach
        The embeddings used to vectorize the descriptions.
    include_np : bool, optional, default True
        Only consulted when `labels` is given: if False, concepts whose
        label is "np" are skipped.
    labels : dict, optional, default None
        Gold standard labels, mapping a concept name to its label. When
        given, concepts without a label are skipped.

    Returns
    -------
    r : Reach
        A Reach instance with one vector per retained concept.

    """
    concept_names = []
    vectors = []

    for name, descriptions in tqdm(list(concepts.items())):

        if labels is not None:
            # Look the label up in the argument, not in a module-level global.
            try:
                label = labels[name]
            except KeyError:
                continue

            if not include_np and label == "np":
                continue

        concept = []

        for desc in descriptions:
            try:
                desc = desc.lower().split()
                vec = embeddings.vectorize(desc, remove_oov=True)
                if not np.any(vec):
                    continue
                concept.append(np.mean(vec, axis=0))
            except ValueError:
                # Presumably raised by vectorize for descriptions it cannot
                # embed (e.g. nothing left after OOV removal); skip them.
                pass

        if not concept:
            continue

        concept_names.append(name)
        vectors.append(np.array(concept).mean(axis=0))

    r = Reach(np.array(vectors), concept_names)

    return r
def extract_online_dan_embeddings(self,
                                  prune=False,
                                  normalize=True,
                                  verbose=False,
                                  provided_names=(),
                                  preprocess=False):
    """
    Pass name embeddings through the model and return them as a Reach object.

    :param prune: when no names are provided, pass the copy of the pretrained
        name embeddings through ``prune`` with the union of the exemplar and
        validation reference names
    :param normalize: feed ``norm_vectors`` to the model instead of ``vectors``
    :param verbose: show a progress bar over the batches
    :param provided_names: names to embed; a Reach object is built for them
        with ``sampling.create_reach_object``. When empty, the pretrained
        name embeddings are used
    :param preprocess: apply ``self.preprocess`` to every provided name
    :return: Reach instance mapping the input items to the model outputs
    """
    self.model.eval()

    if provided_names:
        input_items = provided_names
        if preprocess:
            input_items = [self.preprocess(name) for name in input_items]
        embeddings = self.sampling.create_reach_object(input_items)
    else:
        # Work on a copy: prune mutates the Reach object in place.
        embeddings = deepcopy(self.sampling.pretrained_name_embeddings)
        # NOTE(review): pruning is applied to the pretrained embeddings only,
        # as in extract_online_lstm_embeddings -- confirm this is intended.
        if prune:
            names_to_prune = set(
                self.sampling.exemplar_to_concept.keys()).union(
                    self.sampling.validation_references.keys())
            embeddings.prune(names_to_prune)

    input_vectors = embeddings.norm_vectors if normalize else embeddings.vectors
    # Items ordered by their index, so they line up with the vector rows.
    input_items = [x for _, x in sorted(embeddings.indices.items())]

    # batch input items to save up on memory...
    all_embeddings = []
    batch_size = 1000

    # Pure inference: disabling autograd avoids building a graph per batch.
    with torch.no_grad():
        for i in tqdm(range(0, len(input_items), batch_size),
                      disable=not verbose):
            input_batch = input_vectors[i:i + batch_size]
            input_tensor = torch.FloatTensor(input_batch).to(self.device)
            online_batch = self.model(input_tensor).detach().cpu().numpy()
            all_embeddings.append(online_batch)

    all_embeddings = np.concatenate(all_embeddings)
    online_embeddings = Reach(all_embeddings, input_items)

    return online_embeddings
def frequency_baseline(self, detection_list, candidates_list):
    """
    Majority frequency baseline

    Candidates that do not occur in ``self.frequency_dict`` are discarded;
    the remaining ones are ranked by corpus frequency. No vector
    representations are needed for this baseline, so none are loaded.

    :param detection_list: list of misspellings
    :param candidates_list: list of candidate list per misspelling
    :return: list with corrections (``self.k == 1``; '' when no candidate
        is known) or k-best corrections (``self.k > 1``)
    :raises ValueError: if ``self.k`` is smaller than 1
    """
    correction_list = []

    # The misspelling itself is not used; zip only keeps both lists aligned.
    for _, candidates in zip(detection_list, candidates_list):

        candidates = [
            candidate for candidate in candidates
            if candidate in self.frequency_dict
        ]

        frequencies = [self.frequency_dict[candidate] for candidate in candidates]

        if self.k == 1:
            try:
                correction_list.append(candidates[np.argmax(frequencies)])
            except ValueError:
                # argmax of an empty sequence: no known candidate left.
                correction_list.append('')
        elif self.k > 1:
            correction_list.append([
                candidates[i] for i in np.argsort(frequencies)[::-1][:self.k]
            ])
        else:
            raise ValueError('k must be positive natural number')

    return correction_list
def create_cluster_prototypes(self,
                              provided_embeddings=None,
                              total=False,
                              pretrained=True):
    """
    Build one prototype per cluster and store them in ``self.cluster_prototypes``.

    :param provided_embeddings: embeddings to build the prototypes from; when
        None, they are chosen according to ``pretrained``
    :param total: use ``self.clusters`` instead of ``self.training_clusters``
    :param pretrained: without provided embeddings, use the pretrained name
        embeddings (True) or freshly extracted online DAN embeddings (False)
    """
    # Identity check: `!= None` would invoke the object's own comparison.
    if provided_embeddings is not None:
        embeddings = provided_embeddings
    elif pretrained:
        embeddings = self.pretrained_name_embeddings
    else:
        embeddings = self.extract_online_dan_embeddings(prune=False)

    clusters = self.clusters if total else self.training_clusters

    print('Creating cluster prototypes...')
    cluster_prototypes = {}
    for label, strings in clusters.items():
        # Only training names contribute to a prototype.
        strings = set(strings).intersection(self.training_names)
        cluster_prototypes[label] = self.create_prototype(strings, embeddings)

    items, vectors = zip(*cluster_prototypes.items())
    self.cluster_prototypes = Reach(vectors, items)
def tune_oov(devcorpus, candidates_list, best_parameters, language):
    """
    Search for the best oov penalty for a corpus.

    The penalties 0.0, 0.1, ..., 2.9 are tried one after another.

    :param devcorpus: devcorpus generated with make_devcorpus.py
    :param candidates_list: list of candidate list per misspelling
    :param best_parameters: best parameters for the devcorpus
    :param language: language from ["en", "nl"]
    :return: dictionary with oov penalties as keys and their correction accuracy as values
    """
    dev = Development(best_parameters, language)

    print("Loading embeddings")
    r = Reach.load(dev.pathtovectors, header=True)
    print("Done")

    corrected_list = devcorpus[0]
    detection_list = devcorpus[1]
    detection_contexts = devcorpus[2]

    scores_dict = {}

    for penalty in [step / 10 for step in range(30)]:
        dev.oov_penalty = penalty
        correction_list = dev.ranking_experiment(
            detection_list, detection_contexts, candidates_list, r)

        # Share of corrections that equal the gold correction at that index.
        n_correct = sum(1 for position, correction in enumerate(correction_list)
                        if correction == corrected_list[position])
        scores_dict[penalty] = n_correct / len(correction_list)

    return scores_dict
if __name__ == "__main__": import logging import time import json # Setup # logging.basicConfig(level=logging.INFO) umls = "sample_data/umls_sample.json" msh = "sample_data/abstracts_example.json" path_to_embeddings = "" use_subset = False # Be sure to set add_unk to True, or to mark the UNK index. embeddings = Reach.load(path_to_embeddings, header=True, unk_word="UNK") logging.info("loaded embeddings.") start = time.time() y = Yarn(embeddings) umls = json.load(open(umls)) msh = json.load(open(msh)) if use_subset: subset = [ u'di', u'tat', u'erp', u'ori', u'crna', u'pep', u'de', u'hip', u'glycoside', u'sterilization', u'ra', u'don', u'ecg', u'cell',
# stores a list of row numbers and argument strings per verb. import re import numpy as np from reach import Reach import transformargs #pathnames rowspath = './cooccurrence/weighted_sm.rows' embeddingspath = './tulkens-embeddings/160/sonar-160.txt' logpath = './failedwords.txt' exportpath = './verbtrainingindex2' #import data rowsfile = open(rowspath, 'r', encoding='utf-8') r = Reach(embeddingspath, header=True) #holmatrix = np.load(holmatrixpath) #load output file log = open(logpath, 'w', encoding='utf-8') control = np.zeros(160) failedcount = 0 rowcount = 590408 t = transformargs.Transformer() verbarray = np.array( ['', np.array([np.array([0, ''], object)], object)], object ) #will contain line indexes and corresponding argument strings for each verb #dummy first row added to show structure verbindex = 0 oldkey = ''
"""Test with word embeddings.""" from reach import Reach from plate.plate import circular_convolution, decode if __name__ == "__main__": r = Reach.load("PATH_TO_EMBEDDINGS") # Encode "dog chase cat" a = circular_convolution(r["subject"], r["dog"]) b = circular_convolution(r["verb"], r["chase"]) c = circular_convolution(r["object"], r["cat"]) sentence = a + b + c vec = decode(r["subject"], sentence) result = r.nearest_neighbor(vec) # The top result should be dog
scores = {} gold = json.load(open("data/beth_gold.json")) gold = list(zip(*sorted(gold.items())))[1] txt, gold_chunks = zip(*gold) data = json.load(open("data/beth_uima.json")) data = list(zip(*sorted(data.items())))[1] # Sanity check for a, b in zip(data, gold): assert len(a[0]) == len(b[0]) embeddings = Reach.load("../../corpora/mimic_vecs_200_cbow.vec", unk_word="UNK") scores = {} focus = experiment(data, gold_chunks, np.mean, np.mean, embeddings, reciprocal, 0, k=100, use_focus=True) full = experiment(data, gold_chunks,
if __name__ == "__main__": # Set this flag to true to replicate the perfect chunking setting # in experiment 3. perfect = True gold = json.load(open("data/test_gold.json")) gold = list(zip(*sorted(gold.items())))[1] if perfect: data = json.load(open("data/test_gold.json")) data = list(zip(*sorted(data.items())))[1] txt, gold_bio = zip(*gold) r = Reach.load("../../corpora/mimiciii-min5-neg3-w5-100.vec", unk_word="<UNK>") r_concept = Reach.load_fast_format(f"data/concept_vectors") concept_labels = json.load(open("data/names2label.json")) grouped = defaultdict(list) for k, v in concept_labels.items(): grouped[v].append(r_concept[k]) grouped.pop("np") memory = {} for k, v in tqdm(grouped.items()): km = KMeans(10) km.fit(v)
import json
from cat.simple import get_scores, rbf_attention
from reach import Reach
from collections import defaultdict


GAMMA = .03
N_ASPECT_WORDS = 200

if __name__ == "__main__":

    scores = defaultdict(dict)
    r = Reach.load("embeddings/my_word_vectors.vec",
                   unk_word="<UNK>")

    # Read the aspect words inside a context manager so the file handle is
    # closed again; every aspect is wrapped in its own single-item list.
    with open("data/aspect_words.json") as aspect_file:
        aspects = [[x] for x in json.load(aspect_file)]
    aspects = aspects[:N_ASPECT_WORDS]

    # Placeholder instances (token lists) and labels for the example.
    instances = ["text_1".split(), "text_2".split()]
    label_set = {"label1", "label2", "label3"}

    s = get_scores(instances,
                   aspects,
                   r,
                   label_set,
                   gamma=GAMMA,
                   remove_oov=False,
                   attention_func=rbf_attention)

    # Index of the highest score along axis 1 for every instance.
    pred = s.argmax(1)
def load_test_vectors(self, embeddings_infile):
    """
    Load the test vectors and keep them on the instance.

    :param embeddings_infile: path handed to ``Reach.load_fast_format``; the
        result is stored as ``self.test_vectors``
    """
    # load vectors
    print('Loading vectors...')
    loaded_vectors = Reach.load_fast_format(embeddings_infile)
    self.test_vectors = loaded_vectors
def __init__(self, river, len_dang_arcs, fast):
    """
    Build the river skeleton and identify its junctions and reaches.

    Intermediate images are written to the ``temp/`` directory after each
    step.

    :param river: object whose ``getRiver()`` returns the river image as a
        numpy array
    :param len_dang_arcs: stored as ``self.length_dangling_arcs``
    :param fast: when true, the dangling-arc removal step (and its image)
        is skipped
    """
    # Initialize the variables
    self.array_skeleton = []
    self.row = 0
    self.col = 0
    self.length_dangling_arcs = len_dang_arcs
    self.array_Junction = []
    self.array_done = []
    self.list_Junction = []
    self.l_Reach = []
    self.flag_fast = fast

    # Find the skeleton and extract the river boundary
    # By the end of this constructor function we have a skeleton of the river with no dangling arcs, with junction points and reaches identified

    # 1. Get the river image in the form of numpy array -------------------
    array_Image = river.getRiver()

    # 2. Find skeleton for the river --------------------------------------
    print("Finding the skeleton")
    self.array_skeleton = morphology.skeletonize(array_Image > 0)
    self.array_skeleton = npy.array(self.array_skeleton, dtype=npy.uint8)
    (self.row, self.col) = self.array_skeleton.shape

    # ---------------------------------------------------------------------
    # image after taking the skeleton of river
    # NOTE(review): the file is named .png but written with format='jpg';
    # left unchanged because downstream consumers may rely on it -- confirm.
    plt.imshow(self.array_skeleton)
    plt.savefig('temp/10_Skeleton.png', format='jpg', dpi=1200)
    # ---------------------------------------------------------------------

    # 3. Remove dangling arcs ---------------------------------------------
    if not fast:
        print("Removing dangling arcs")
        self.RemoveDanglingArc()

        # -----------------------------------------------------------------
        # image after removing dangling arcs from the skeleton
        plt.imshow(self.array_skeleton)
        plt.savefig('temp/11_RemoveDanglingArc.png', format='jpg', dpi=1200)
        # -----------------------------------------------------------------

    # 4. Find the junction points -----------------------------------------
    print("Identifying all the junctions")
    # The builtin int replaces the npy.int alias, which NumPy 1.24 removed;
    # the resulting dtype is the same.
    self.array_done = npy.zeros((self.row, self.col), dtype=int)
    self.array_Junction = npy.zeros((self.row, self.col), dtype=int)
    self.MarkJunctions()

    # ---------------------------------------------------------------------
    # image after finding the junctions of the river
    plt.imshow(self.array_Junction)
    plt.savefig('temp/12_Junctions.png', format='jpg', dpi=1200)
    # ---------------------------------------------------------------------

    # 5. Identify all the reaches -----------------------------------------
    print("Identifying all the reaches")
    reach = Reach(0)  # create a new Reach type variable with ReachID = 0
    self.l_Reach.append(reach)  # append it to the list - l_Reach
    self.IdentifyReach()

    # self.MarkJunctionsAndNeighbourhood()
    # self.MarryReachJunction()

    return
if __name__ == "__main__": import logging import time import json # Setup # logging.basicConfig(level=logging.INFO) umls = "sample_data/umls_sample.json" msh = "sample_data/abstracts_example.json" path_to_embeddings = "" use_subset = False # Be sure to set add_unk to True, or to mark the UNK index. embeddings = Reach.load(path_to_embeddings, unk_word="UNK") logging.info("loaded embeddings.") start = time.time() y = Yarn(embeddings) umls = json.load(open(umls)) msh = json.load(open(msh)) if use_subset: subset = [u'di', u'tat', u'erp',
import sklearn.preprocessing import math import matplotlib.pyplot as plt from tqdm import tqdm_notebook as tqdm import pandas as pd #pathnames indexpath = './verbtrainingindex_withweights.npy' holmatrixpath = './cooccurrence/svd/newmatrix.npy' embeddingspath = './tulkens-embeddings/160/sonar-160.txt' outputpath = './verbmatrices/version3' #import data index = np.load(indexpath) holmatrix = np.load(holmatrixpath) arg_data = Reach(embeddingspath, header=True) #parameters n_dim = 160 s_dim = 200 alpha_value = 50 min_sample_size = 400 # note how in testing, a samplesize of N = 500 was deemed acceptable. here we # do not split in train an test data, so the min sample size can be 80% of the # one used when testing. variance_control = True mean_std = 0.08 matrices = dict() #loop through verbs
def noisychannel_ranking(self, detection_list, candidates_list):
    """
    An approximate implementation of the ranking method described in (Lai et al. 2015)

    Every candidate gets a score that combines its orthographic and phonetic
    edit distance to the misspelling with its corpus frequency from
    ``self.frequency_dict`` (unseen candidates count as frequency 1). Lower
    scores rank higher. No vector representations are needed for this
    ranking, so none are loaded.

    As a side effect, ``self.confidences`` is set to one value per
    misspelling: the relative gap between the two best scores, or 0 when
    there are fewer than two candidates.

    :param detection_list: list of misspellings
    :param candidates_list: list of candidate list per misspelling
    :return: list with corrections (``self.k == 1``; '' when there are no
        candidates) or k-best corrections (``self.k > 1``)
    :raises ValueError: if ``self.k`` is smaller than 1
    """
    correction_list = []
    confidences = []

    for misspelling, candidates in zip(detection_list, candidates_list):

        score_list = []
        for candidate in candidates:
            orthographic_edit_distance = damerau_levenshtein_distance(
                misspelling, candidate)
            phonetic_edit_distance = damerau_levenshtein_distance(
                dm(misspelling)[0], dm(candidate)[0])

            spell_score = (2 * orthographic_edit_distance +
                           phonetic_edit_distance)**2  # P(m|c)

            frequency = self.frequency_dict.get(candidate, 1)
            frequency_score = 1 / (1 + log(frequency))  # P(c)

            score = spell_score * frequency_score  # P(c|m) = P(m|c)*P(c)
            score_list.append(score)

        score_list = np.array(score_list)
        # Sort once; the order serves both the confidence and the k-best list.
        ranking = np.argsort(score_list)

        if len(score_list) > 1:
            top1 = score_list[ranking[0]]
            top2 = score_list[ranking[1]]
            # NOTE(review): a best score of 0 gives inf/nan here.
            confidence = abs(top1 - top2) / top1
            confidences.append(confidence)
        else:
            confidences.append(0)

        if self.k == 1:
            try:
                correction_list.append(candidates[np.argmin(score_list)])
            except ValueError:
                # argmin of an empty sequence: no candidates available.
                correction_list.append('')
        elif self.k > 1:
            correction_list.append(
                [candidates[i] for i in ranking[:self.k]])
        else:
            raise ValueError('k must be positive natural number')

    self.confidences = confidences

    return correction_list
import tensorflow as tf import re from reach import Reach r = Reach.load('./tulkens-embeddings/160/sonar-160.txt', header=True) objsize = 160 holisticsize = 10 #%% #import list of verbs verb = 'overleef' print('Verb:', verb) #create tensor for verb verbtens = tf.Variable(tf.random_uniform([objsize, holisticsize], 0.0, 1.0)) inp = tf.placeholder(tf.float32, [objsize]) outp = tf.matmul(verbtens, inp) sess = tf.Session() #get VO-combinations list combos = [] rowsfile = open('./cooccurrence/rows1.rows', 'r') done = False found = False while done == False: line = rowsfile.readline() if line.startswith(verb): found = True combos.append(line) else:
if __name__ == "__main__": import logging import time import json # Setup logging.basicConfig(level=logging.INFO) umls = "sample_data/umls_sample.json" msh = "sample_data/abstracts_example.json" path_to_embeddings = "" use_subset = False embeddings = Reach(path_to_embeddings, header=True, verbose=False) logging.info("loaded embeddings.") start = time.time() y = Yarn(embeddings) umls = json.load(open(umls)) msh = json.load(open(msh)) if use_subset: subset = [ u'di', u'tat', u'erp', u'ori', u'crna', u'pep', u'de', u'hip', u'glycoside', u'sterilization', u'ra', u'don', u'ecg', u'cell',
def compose(documents,
            f1,
            f2,
            embeddings,
            window,
            context_function,
            use_focus=True,
            norm=False):
    """
    Turn the chunked phrases of a set of documents into vectors.

    Parameters
    ==========
    documents : list of lists
        Every item holds two equally long lists: the tokens of a text and
        the BIO tags of the NP chunks of that text.
    f1 : function
        Composes the vectorized words into a single vector.
        Must take an axis parameter.
    f2 : function
        Composes the vectors produced by f1 into a second-order vector.
        Must also take an axis parameter.
    embeddings : Reach
        The reach instance used to vectorize.
    window : int
        The window size to use.
    context_function : function
        Weighs the contexts. Must take a 2D matrix and return a 2D matrix
        of the same shape.
    use_focus : bool, optional, default True
        Whether to vectorize the focus word.
    norm : bool, optional, default False
        Whether to use the unit vectors to compose.

    Returns
    =======
    phrases : Reach
        A reach instance containing the phrases and their vectors.

    """
    # One "B" followed by any number of "I" tags makes up one chunk.
    chunk_pattern = re.compile(r"BI*")

    phrases = []
    vectors = []

    for tokens, tags in documents:

        tokens = " ".join(tokens).lower().split()
        # Keep only the part of each tag before the first "-".
        tag_string = "".join(tag.split("-")[0] for tag in tags)

        for match in chunk_pattern.finditer(tag_string):
            start, end = match.span()
            phrase_string, vector = create_phrase_vector(
                tokens, start, end, window, embeddings, f1, f2,
                context_function, use_focus, norm)

            # A running index is appended so that identical phrase strings
            # do not overwrite each other in the dictionary mapping.
            phrases.append("{}-{}".format(phrase_string, len(phrases)))
            vectors.append(vector)

    return Reach(vectors, phrases)
# in experiment 3. perfect = True gold = json.load(open("data/test_gold.json")) gold = list(zip(*sorted(gold.items())))[1] if perfect: data = json.load(open("data/test_gold.json")) else: data = json.load(open("data/test_uima.json")) data = list(zip(*sorted(data.items())))[1] txt, gold_bio = zip(*gold) _, data_bio = zip(*data) embeddings = Reach.load("../../corpora/mimiciii-min5-neg3-w5-100.vec", unk_word="<UNK>") concept_reach = Reach.load_fast_format("data/concept_vectors") concept_labels = json.load(open("data/names2label.json")) gold_bio = list(chain.from_iterable(gold_bio)) results_bio = {} r_phrases = compose(data, f1=np.mean, f2=np.mean, window=0, embeddings=embeddings, context_function=reciprocal)
from cat.simple import get_scores, rbf_attention from cat.dataset import restaurants_train from reach import Reach from sklearn.metrics import precision_recall_fscore_support from collections import defaultdict, Counter GAMMA = .03 BEST_ATT = {"n_noun": 980} BEST_RBF = {"n_noun": 200} if __name__ == "__main__": scores = defaultdict(dict) r = Reach.load("embeddings/restaurant_vecs_w2v.vec", unk_word="<UNK>") att = rbf_attention datums = list(restaurants_train()) d = json.load(open("data/nouns_restaurant.json")) nouns = Counter() for k, v in d.items(): if k.lower() in r.items: nouns[k.lower()] += v if att == rbf_attention: r.vectors[r.items["<UNK>"]] = r.vectors.max() if att == rbf_attention: candidates, _ = zip(*nouns.most_common(BEST_RBF["n_noun"]))
scores = {} gold = json.load(open("data/beth_gold.json")) gold = list(zip(*sorted(gold.items())))[1] txt, gold_chunks = zip(*gold) data = json.load(open("data/beth_uima.json")) data = list(zip(*sorted(data.items())))[1] # Sanity check for a, b in zip(data, gold): assert len(a[0]) == len(b[0]) embeddings = Reach.load("", unk_word="UNK") scores = {} focus = experiment(data, gold_chunks, np.mean, np.mean, embeddings, reciprocal, 0, k=100, use_focus=True) full = experiment(data, gold_chunks,
parsed_train = json.load(open("data/partners_uima.json")) parsed_train = list(zip(*sorted(parsed_train.items())))[1] gold_train = json.load(open("data/partners_gold.json")) gold_train = list(zip(*sorted(gold_train.items())))[1] parsed_test = json.load(open("data/beth_uima.json")) parsed_test = list(zip(*sorted(parsed_test.items())))[1] gold_test = json.load(open("data/beth_gold.json")) gold_test = list(zip(*sorted(gold_test.items())))[1] txt, gold_chunks_train = zip(*gold_train) _, gold_chunks_test = zip(*gold_test) embeddings = Reach.load("") for a, b in zip(parsed_train, gold_train): assert len(a[0]) == len(b[0]) for a, b in zip(parsed_test, gold_test): assert len(a[0]) == len(b[0]) knn_focus = experiment(parsed_train, gold_chunks_train, parsed_test, gold_chunks_test, np.mean, np.mean, embeddings, reciprocal,
def run(options):
    """
    Embed the validation sentences and print the neighbors of each of them.

    The sentences are run through the network in eval mode; the last inside
    and outside vectors of each example are concatenated and indexed with
    Reach, after which `most_similar` is printed for every sentence string.
    """
    logger = get_logger()

    validation_dataset = get_validation_dataset(options)
    validation_iterator = get_validation_iterator(options, validation_dataset)
    word2idx = validation_dataset['word2idx']
    embeddings = validation_dataset['embeddings']

    idx2word = dict((index, word) for word, index in word2idx.items())

    logger.info('Initializing model.')
    trainer = build_net(options, embeddings, validation_iterator)
    diora = trainer.net.diora

    # 1. Get all relevant phrase vectors.

    dtype = dict(
        example_ids='list',
        labels='list',
        positions='list',
        sizes='list',
        phrases='list',
        inside='torch',
        outside='torch',
    )
    batch_recorder = BatchRecorder(dtype=dtype)

    # Eval mode.
    trainer.net.eval()

    batches = validation_iterator.get_iterator(random_seed=options.seed)

    logger.info('Beginning to embed phrases.')

    strings = []
    with torch.no_grad():
        for batch_map in batches:
            sentences = batch_map['sentences']

            # Skips very short examples.
            if sentences.shape[1] <= 2:
                continue

            for sentence in sentences.numpy():
                strings.append("".join([idx2word[idx] for idx in sentence]))

            trainer.step(batch_map, train=False, compute_loss=False)

            batch_recorder.record(inside=diora.inside_h[:, -1],
                                  outside=diora.outside_h[:, -1])

    result = batch_recorder.get_flattened_result()

    # 2. Build an index of nearest neighbors.

    vectors = np.concatenate([result['inside'], result['outside']], axis=1)
    print(len(strings), vectors.shape)
    r = Reach(vectors, strings)

    for sentence_string in strings:
        print(sentence_string)
        print(r.most_similar(sentence_string))
vec = embeddings.vectorize(desc, remove_oov=True) if not np.any(vec): continue concept.append(np.mean(vec, axis=0)) except ValueError: pass if not concept: continue concept_names.append(name) vectors.append(np.array(concept).mean(axis=0)) r = Reach(np.array(vectors), concept_names) return r if __name__ == "__main__": path_to_embeddings = "" r_1 = Reach.load(path_to_embeddings, unk_word="UNK") concepts = json.load(open("data/all_concepts.json")) sty = json.load(open("data/concept_label.json")) r = create_concepts(concepts, r_1, include_np=True, labels=sty) r.save_fast_format("data/concept_vectors") name2label = {k: sty[k.split("-")[0]] for k in r.items()} json.dump(name2label, open("data/names2label.json", 'w'))
def grid_search(devcorpus, candidates_list, language):
    """
    Grid search over the ranking parameters for a corpus containing only in-vector-vocabulary corrections

    Every combination of composition function, misspelling inclusion,
    window size, reciprocal weighting, stopword removal and edit distance
    is evaluated once.

    :param devcorpus: devcorpus generated with make_devcorpus.py
    :param candidates_list: list of candidate list per misspelling
    :param language: language from ["en", "nl"]
    :return: dictionary with parameter settings as keys and their correction accuracy as values
    """
    # default parameters
    default_parameters = {
        'comp_function': 'sum',
        'include_misspelling': False,
        'include_oov_candidates': False,
        'window_size': 6,
        'reciprocal': False,
        'remove_stopwords': False,
        'edit_distance': 1,
        'oov_penalty': 1.5,
        'ranking_method': 'context',
        'k-best': 1
    }

    dev = Development(default_parameters, language)

    print("Loading embeddings")
    r = Reach.load(dev.pathtovectors, header=True)
    print("Done")

    corrected_list = devcorpus[0]
    detection_list = devcorpus[1]
    detection_contexts = devcorpus[2]

    scores_dict = {}

    # Both start at 0, so the first reported run time is 0.
    start_time = 0
    end_time = 0

    for comp_function in ["sum", "mult", "max"]:
        print("New run")
        print(f"Last run took {end_time - start_time} seconds")
        start_time = time.time()

        dev.comp_function = comp_function
        for include_misspelling in [True, False]:
            dev.include_misspelling = include_misspelling
            for window_size in range(11):
                dev.window_size = window_size
                for reciprocal in [True, False]:
                    dev.reciprocal = reciprocal
                    for remove_stopwords in [True, False]:
                        dev.remove_stopwords = remove_stopwords
                        for edit_distance in range(1, 5):
                            dev.edit_distance = edit_distance

                            correction_list = dev.ranking_experiment(
                                detection_list, detection_contexts,
                                candidates_list, r)

                            # Share of corrections equal to the gold
                            # correction at the same index.
                            n_correct = sum(
                                1 for position, correction in enumerate(
                                    correction_list)
                                if correction == corrected_list[position])

                            setting = (comp_function, include_misspelling,
                                       window_size, reciprocal,
                                       remove_stopwords, edit_distance)
                            scores_dict[setting] = (n_correct /
                                                    len(correction_list))

        end_time = time.time()

    return scores_dict
# in experiment 3. perfect = False gold = json.load(open("data/test_gold.json")) gold = list(zip(*sorted(gold.items())))[1] if perfect: data = json.load(open("data/test_gold.json")) else: data = json.load(open("data/test_uima.json")) data = list(zip(*sorted(data.items())))[1] txt, gold_bio = zip(*gold) _, data_bio = zip(*data) embeddings = Reach.load("", unk_word="UNK") concept_reach = Reach.load_fast_format("data/concept_vectors") concept_labels = json.load(open("data/concept_names2label.json")) gold_bio = list(chain.from_iterable(gold_bio)) results_bio = {} r_phrases = compose(data, f1=np.mean, f2=np.mean, window=0, embeddings=embeddings, context_function=reciprocal) pred_bio_focus = eval_extrinsic(list(chain.from_iterable(data_bio)),
def synonym_retrieval_zeroshot(self,
                               zeroshot_pairs,
                               isolated=False,
                               verbose=False,
                               outfile=''):
    """
    Rank the synonyms of each zeroshot reference by similarity.

    For every (concept, reference, synonyms) instance, all items in the
    search space are ranked by the dot product of their normalized vector
    with the normalized reference vector; the reference itself is masked
    out. The rank of every synonym is then recorded.

    :param zeroshot_pairs: iterable of (concept, reference, synonyms) tuples
    :param isolated: search among the zeroshot items only (True) or among
        the train items plus the zeroshot items (False)
    :param verbose: print mean average precision, ranking accuracy and mean
        reciprocal rank of the rankings
    :param outfile: when non-empty, path the complete ranking is saved to
        as JSON
    :return: list of ((concept, reference, synonyms), ranks) tuples, with
        synonyms and ranks sorted by rank
    """
    assert self.train_vectors is not None, 'No train vectors are loaded yet!'
    assert self.test_vectors is not None, 'No test vectors are loaded yet!'

    # new setting: add ALL zeroshot data to train data to cause more confusion
    train_items = [x for _, x in sorted(self.train_vectors.indices.items())]
    train_vectors = self.train_vectors.vectors

    zeroshot_items = set()
    for concept, reference, synonyms in zeroshot_pairs:
        zeroshot_items.add(reference)
        zeroshot_items.update(synonyms)
    zeroshot_items = sorted(zeroshot_items)

    zeroshot_vectors = [self.test_vectors[item] for item in zeroshot_items]

    if isolated:
        fused_vectors = Reach(zeroshot_vectors, zeroshot_items)
    else:
        all_items = train_items + zeroshot_items
        zeroshot_vectors = np.array(zeroshot_vectors)
        all_vectors = np.concatenate((train_vectors, zeroshot_vectors), axis=0)
        fused_vectors = Reach(all_vectors, all_items)

    # now rank
    complete_ranking = []
    for instance in tqdm(zeroshot_pairs, disable=False):
        concept, reference, synonyms = instance

        synonym_idxs = [fused_vectors.items[syn] for syn in synonyms]
        reference_idx = fused_vectors.items[reference]

        # calculate distances
        reference_vector = fused_vectors.norm_vectors[reference_idx]
        scores = fused_vectors.norm_vectors.dot(reference_vector.T)

        # extract ranking; the reference itself is masked out
        mask = np.zeros(len(fused_vectors.items), dtype=bool)
        mask[reference_idx] = True
        scores = np.ma.array(scores, mask=mask)
        ranking = np.argsort(-scores)

        # Cast to int: numpy integers cannot be serialized by json.dump.
        ranks = [
            int(np.where(ranking == synonym_idx)[0][0])
            for synonym_idx in synonym_idxs
        ]
        assert ranks
        ranks, synonyms = zip(*sorted(zip(ranks, synonyms)))

        instance = (concept, reference, synonyms)
        complete_ranking.append((instance, ranks))

    if outfile:
        print('Saving...')
        with open(outfile, 'w') as f:
            json.dump(complete_ranking, f)

    if verbose:
        instances, rankings = zip(*complete_ranking)
        print(round(self.mean_average_precision(rankings), 2), '&',
              round(self.ranking_accuracy(rankings), 2), '&',
              round(self.mean_reciprocal_rank(rankings), 2), '&')

    return complete_ranking