def cardinality_embeddings():
    '''
    -------------------------------------------------------------------------------------------------------------
    The method gets no input and returns a dictionary with information about the cardinality of embeddings'
    vocabularies: {embedding_name: |V_emb|}. The outcome corresponds to the variable cardinality_vembs
    stored in Utilities.

    Embedding files are discovered under PATH_EMBEDDINGS/cuis and PATH_EMBEDDINGS/words (README.md excluded);
    '.bin' files are loaded as binary word2vec models.
    -------------------------------------------------------------------------------------------------------------
    '''
    a = datetime.datetime.now().replace(microsecond=0)
    # `and` (short-circuit boolean), not bitwise `&`, for the filter conditions
    cuis = ('/cuis/', [f.name for f in os.scandir(PATH_EMBEDDINGS + '/cuis')
                       if f.is_file() and f.name != 'README.md'])
    words = ('/words/', [f.name for f in os.scandir(PATH_EMBEDDINGS + '/words')
                         if f.is_file() and f.name != 'README.md'])
    embeddings = [cuis, words]
    cardinality_vemb = {}
    for type_emb in embeddings:
        for emb in type_emb[1]:
            model = KeyedVectors.load_word2vec_format(PATH_EMBEDDINGS + type_emb[0] + emb,
                                                      binary=emb.endswith('.bin'))
            name = os.path.splitext(emb)[0]
            vemb = utils.extract_w2v_vocab(model)
            cardinality_vemb[name] = len(vemb)
    #utils.inputs_save(cardinality_vemb, 'Utilities/cardinality_vembs')
    print(datetime.datetime.now().replace(microsecond=0) - a)
    return cardinality_vemb
def cardinality_kl(embeddings, useful_rela, L_umls, K_umls, dict_labels_for_L=None):
    '''
    ----------------------------------------------------------------------------------------------------------------
    The method returns (and saves as pickle) dictionaries reusable by other methods for plotting. It gets as
    input a list of (folder, [file names]) tuples for the analyzed embeddings, a list of analyzed relationships
    and two pairs-set dictionaries, K and L, keyed by relationship. The variable dict_labels_for_L is not
    compulsory: it is used only by the filtering k_n_l_iov method for word (w2v) embeddings.

    For every relationship the method collects the cardinality of the original UMLS sets ('L', 'K', and their
    deduplicated variants 'L wor', 'K wor') together with the cardinality of the filtered IoV sets, one entry
    per embedding name.

    Returns: (sets_relations_l, sets_relations_k), each {rela: {key: cardinality}}.
    ----------------------------------------------------------------------------------------------------------------
    '''
    a = datetime.datetime.now().replace(microsecond=0)
    sets_relations_k = {}
    sets_relations_l = {}
    # Loop over the embeddings and, inside, over the relations
    for type_emb in embeddings:
        for emb in type_emb[1]:
            model = KeyedVectors.load_word2vec_format(
                PATH_EMBEDDINGS + type_emb[0] + emb, binary=emb.endswith('.bin'))
            name = os.path.splitext(emb)[0]
            print('Embedding ' + str(name) + ' is analyzed')
            for rela in useful_rela:
                c = datetime.datetime.now().replace(microsecond=0)
                # Check type of embedding
                if type_emb[0] == '/cuis/':
                    dict_labels_inters_vemb = None
                    labs = 'cui'
                # For word embeddings the dictionary of labels per cui is required
                elif (type_emb[0] == '/words/') and (dict_labels_for_L is not None):
                    # Filter the dictionary of labels keeping only the labels-words present into the embedding
                    Vemb = utils.extract_w2v_vocab(model)
                    dict_labels_inters_vemb = umls_tables_processing.discarding_labels_oov(
                        Vemb, dict_labels_for_L)
                    labs = 'labels'
                # Filtering L and K sets for present labels inside the embedding
                l0, k0 = measures.k_n_l_iov(
                    L_umls[rela], K_umls[rela], model,
                    dict_labels_for_L=dict_labels_inters_vemb, emb_type=labs)
                # Store number of filtered pairs. setdefault + key assignment ACCUMULATES one
                # entry per embedding; the previous code rebound the whole per-relation dict,
                # losing every embedding's count but the last.
                sets_relations_l.setdefault(rela, {})[name] = np.shape(l0)[0]
                sets_relations_k.setdefault(rela, {})[name] = np.shape(k0)[0]
                print('Execution time for rela ' + rela + ' : ' +
                      str(datetime.datetime.now().replace(microsecond=0) - c) + '\n')
    # Baseline cardinalities of the raw UMLS sets, with and without repetitions
    for rela in useful_rela:
        sets_relations_l.setdefault(rela, {})['L'] = np.shape(L_umls[rela])[0]
        sets_relations_k.setdefault(rela, {})['K'] = np.shape(K_umls[rela])[0]
        # 'wor' = without repetitions (deduplicated pair counts)
        sets_relations_l[rela]['L wor'] = np.shape(list(set(L_umls[rela])))[0]
        sets_relations_k[rela]['K wor'] = np.shape(list(set(K_umls[rela])))[0]
    utils.inputs_save(
        sets_relations_l,
        SAVING_PATH + 'l_cardinality_per_rel' + str(datetime.datetime.now()))
    utils.inputs_save(
        sets_relations_k,
        SAVING_PATH + 'k_cardinality_per_rel' + str(datetime.datetime.now()))
    print('Execution time : ' +
          str(datetime.datetime.now().replace(microsecond=0) - a) + '\n')
    # Returned so callers can use the data directly (docstring promised a result;
    # previously nothing was returned). Backward-compatible: old callers ignored it.
    return sets_relations_l, sets_relations_k
def analog_loop(
        path,
        binary_bool,
        name,
        type_emb,
        L,
        K,
        K_type,
        logger,
        analog_comp_dict,
        #sets_relations,
        metrics,
        dict_labels_for_L=None,
        all_labels=False):
    '''
    -------------------------------------------------------------------------------------------------------------------
    The method implements the analogy logic: it recalls the chosen metrics for analogy computation. Two cases
    are handled, one for cuis and another for words; for words the dictionary of labels per cui is first filtered
    against the embedding vocabulary.

    Inputs: the loading model path (provided by analog_pipe), a boolean for the embedding file format (binary or
    not), the name of the embedding, the type of embedding ('/cuis/' or '/words/'), the two pair sets L and K.
    For further description about the sets, K_type, logger, metrics see analog_pipe. analog_comp_dict is the
    global per-embedding dictionary. Results are stored once per relationship, to avoid losing information on a
    sudden break of computation. dict_labels_for_L and all_labels are inherited from parent methods and are used
    for the label-processing logic.
    -------------------------------------------------------------------------------------------------------------------
    '''
    # Load the w2v model
    model = KeyedVectors.load_word2vec_format(path, binary=binary_bool)
    # Instantiation and log print
    # NOTE(review): analog_comp_dict[name] is initialized but results are stored in
    # dict_t below — presumably the caller fills analog_comp_dict later; confirm.
    analog_comp_dict[name] = {}
    logger.info('\n\n The name of embedding is: %s\n', name)
    dict_t = {}
    dict_t[name] = {}
    # Loop over the relations
    for rela in umls_tables_processing.USEFUL_RELA:
        logger.info('\n The RELA is: %s\n', rela)
        c = datetime.datetime.now().replace(microsecond=0)
        # Check type of embedding; the two branches only differ in how the label
        # dictionary and emb_type are prepared, so the common pipeline is shared below.
        if type_emb == '/cuis/':
            dict_labels_inters_vemb = None
            emb_kind = 'cui'
        elif (type_emb == '/words/') and (dict_labels_for_L is not None):
            # Filter the dictionary of labels keeping only the labels-words present into the embedding
            Vemb = utils.extract_w2v_vocab(model)
            dict_labels_inters_vemb = umls_tables_processing.discarding_labels_oov(
                Vemb, dict_labels_for_L, all_labels=all_labels)
            emb_kind = 'labels'
        else:
            # Unknown embedding type (or missing label dictionary): nothing to compute
            continue
        # Filtering L and K sets for pairs fully inside the embedding vocabulary
        l0, k0 = measures.k_n_l_iov(
            L[rela], K[rela], model,
            logger=logger,
            dict_labels_for_L=dict_labels_inters_vemb,
            emb_type=emb_kind)
        # Compute the analogy and store the results
        tmp = measures.analogy_compute(
            l0, k0, model, metrics,
            logger=logger,
            dict_labels_for_L=dict_labels_inters_vemb,
            emb_type=emb_kind)
        dict_t[name][rela] = tmp
        # Saved once per relation so a crash loses at most the current relation
        utils.inputs_save(
            dict_t, SAVING_PATH + name + K_type + str(datetime.datetime.now()))
        # Log of end of 'relation' operation
        logger.info(
            'The time for RELA %s, for embedding %s is %s', rela, name,
            str(datetime.datetime.now().replace(microsecond=0) - c))
def k_n_l_iov(L_umls_rel,
              K_umls_rel,
              model,
              logger=None,
              dict_labels_for_L=None,
              emb_type='cui'):
    '''
    ----------------------------------------------------------------------------------------------------
    Accessory method: it preprocesses the K and L sets, discarding the pairs with OOV elements, in both
    the sets. This allows a cut of computational cost, speeding up the computation of analogy.

    Inputs: a list of pairs, given a relation, for L; a list of pairs for the same relation for K; the
    model of the considered embedding. A logger is not compulsory: it is thought for background runs on
    a vm and for keeping track of errors. dict_labels_for_L is compulsory only for the
    emb_type = 'labels' case: it is the dictionary with all the unique concepts of the L set, mapping
    each key-concept to its list of in-vocabulary labels.

    The method returns the polished K and L sets of pairs, all IoV (as deduplicated lists of tuples).
    ----------------------------------------------------------------------------------------------------
    '''
    # Timer started
    ab = datetime.datetime.now().replace(microsecond=0)
    # Changing format to the two lists, K_umls and L_umls
    print(np.shape(np.array(list(zip(*L_umls_rel)))))
    l_x = np.array(list(zip(*L_umls_rel))[0])
    l_y = np.array(list(zip(*L_umls_rel))[1])
    l_stacked = np.stack((l_x, l_y))
    k_x = np.array(list(zip(*K_umls_rel))[0])
    k_y = np.array(list(zip(*K_umls_rel))[1])
    k_stacked = np.stack((k_x, k_y))
    stacked = [l_stacked, k_stacked]
    q = []
    # Optimization for the case where L and K are the same, for avoiding a doubled computation
    if L_umls_rel == K_umls_rel:
        var = [[l_x, l_y]]
        print('L=k')
    else:
        var = [[l_x, l_y], [k_x, k_y]]
        print('L!=k')
    if emb_type == 'cui':
        # Extraction Vemb
        Vemb = np.array(list(utils.extract_w2v_vocab(model)))
        # Extracting indices of Vemb, sorting values in growing way.
        index = np.argsort(Vemb)
        # Sorting Vemb lexicographically (numpy sorts string arrays lexicographically,
        # which is exactly what searchsorted below relies on).
        sorted_Vemb = Vemb[index]
        # Making presence masks for discarding pairs oov by the Vemb
        for j in var:
            temp = []
            # with j the two elements of pair
            for i in j:
                sorted_index_i = np.searchsorted(sorted_Vemb, i)
                yindex = np.take(index, sorted_index_i, mode="clip")
                # True for concepts OOV (the looked-up vocabulary entry differs from
                # the query), False for the ones inside the vocabulary
                mask = Vemb[yindex] != i
                array_ids = np.where(mask)
                # Mask matrix polished
                tmp = array_ids[0].tolist()
                # The indices of the first element of the pair are added to the indices of
                # the second element of the pair. A list of indices is obtained
                temp = temp + tmp
            # It avoids repetition of indices
            q.append(list(set(temp)))
    elif (emb_type == 'labels') and (dict_labels_for_L is not None):
        # Making presence masks for discarding pairs whose concept has no IoV label
        for j in var:
            temp = []
            for i in j:
                # True for concepts with NO label inside the vocabulary (to discard),
                # False for the ones with at least one IoV label
                mask = np.array([len(dict_labels_for_L[u]) == 0 for u in i])
                array_ids = np.where(mask)
                tmp = array_ids[0].tolist()
                temp = temp + tmp
            # It avoids repetition of indices inside the list
            q.append(list(set(temp)))
    # Applying the mask to the previous stacked arrays
    tu = []
    for k, s in zip(q, stacked):
        # The union of the indices from the two pair elements is not sorted.
        # For avoiding bugs, the indices are sorted before deleting the correspondent columns.
        # dtype=int so an EMPTY index list is a valid no-op for np.delete: previously an
        # empty list hit an `else: tu.append([])` branch that wrongly discarded the WHOLE
        # set when every pair was in-vocabulary.
        k = np.sort(np.array(k, dtype=int))
        # Deletion of stored indices (columns = pairs with at least one OOV element).
        polished_ = np.delete(s, k, 1)
        tu.append(list(map(tuple, polished_.transpose())))
    print(datetime.datetime.now().replace(microsecond=0) - ab)
    if logger:
        logger.info(str(datetime.datetime.now().replace(microsecond=0) - ab))
    # L == K case: one computation serves both returned sets
    if len(tu) == 1:
        print(len(tu[0]))
        return list(set(tu[0])), list(set(tu[0]))
    print(len(tu[0]))
    # Returning data with same format of input
    return list(set(tu[0])), list(set(tu[1]))
def max_ks_loop(big_g, seeds, type_emb, model, name, logger, all_labels=False, aggregation='max'):
    '''
    -------------------------------------------------------------------------------------------------------------
    The method represents a further loop inside the relatedness loop, for the case k_most_similar = k_max
    experimentation. Following a similar logic to the one implemented in regular_ks_loop, it is split into two
    sections, one for cuis and another one for words. A system of debugging prints is implemented via a logger.

    Inputs: the global variable big_g, used for storing the values (beside the ks chosen via parsing, the
    k = IoV entry is added under the 'max_k' key); a list of seeds; the type of embedding; the embedding model
    to be analyzed; the name of the analyzed embedding; the logger; a switch for choosing all the labels or
    only the first ranked by UMLS inside the embedding's vocabulary. Returns the updated big_g.
    -------------------------------------------------------------------------------------------------------------
    '''
    big_g[name]['max_k'] = {}
    # The vocabulary does not depend on the seed: hoisted out of the loop
    # (it was recomputed for every seed).
    Vemb = utils.extract_w2v_vocab(model)
    for seed in seeds:
        if type_emb[0] == '/cuis/':
            # k = number of seed concepts inside the vocabulary
            k = len(set(Vemb).intersection(set(seed[1].keys())))
            logger.info('\n k_value: %s\n', k)
            if k <= 0:
                k = 1
            d = measures.occurred_concept(model, seed[1].keys(), k_most_similar=k)
        elif type_emb[0] == '/words/':
            processed_seed = umls_tables_processing.discarding_labels_oov(
                Vemb, seed[1], all_labels=all_labels)
            # k = number of seed concepts with at least one IoV label
            k = sum(1 for v in processed_seed.values() if len(v) > 0)
            logger.info('\n k_value: %s\n', k)
            if k <= 0:
                k = 1
            d, _ = measures.occurred_labels(
                model, processed_seed, k_most_similar=k, heuristic_aggregation=aggregation)
        else:
            # Unknown embedding type: skip rather than KeyError on the log line below
            continue
        # Shared result assembly (identical in both branches before deduplication).
        # measures.max_dcg(k) assumed deterministic for a fixed k — TODO confirm.
        norm_fact = measures.max_dcg(k)
        big_g[name]['max_k'][seed[0]] = [
            measures.pos_dcg(d, normalization=True, norm_fact=norm_fact),
            measures.neg_dcg(d, normalization=True, norm_fact=norm_fact),
            measures.percentage_dcg(d, k=k),
            k,
            measures.oov(d),
            len(seed[1]),
            []]
        logger.info(
            '%s: pos_dcg: %.4f, neg_dcg: %.4f, perc_dcg: %.4f, iov/k-NN: %d, oov: %d, #seed: %d\n',
            seed[0],
            big_g[name]['max_k'][seed[0]][0],
            big_g[name]['max_k'][seed[0]][1],
            big_g[name]['max_k'][seed[0]][2],
            big_g[name]['max_k'][seed[0]][3],
            big_g[name]['max_k'][seed[0]][4],
            big_g[name]['max_k'][seed[0]][5])
    return big_g
def regular_ks_loop(embeddings, ks, seeds, logger, max_k_switch, all_labels=False, aggregation='max'):
    '''
    -------------------------------------------------------------------------------------------------------------
    The method implements the logic for relatedness and occurrence experimentation. It is split in two blocks:
    one for cuis and another one for labels. Iteratively it loads an embedding model, among the ones inside the
    folder indicated in PATH_EMBEDDINGS, and applies the occurrence and dcg measures. The experimentation is
    performed for the chosen ks got via parsing, in the variable ks, for the seeds in the seeds list. A logger
    is provided for debugging and the max_k_switch enables the max_ks_loop (run once, at the last k). The
    all_labels switch allows taking all the labels for a concept or only the best ranked by UMLS.

    The method returns the global variable big_g: {name: {k: {seed_name: [metrics...]}}}.
    -------------------------------------------------------------------------------------------------------------
    '''
    big_g = {}
    a = datetime.datetime.now().replace(microsecond=0)
    for type_emb in embeddings:
        for emb in type_emb[1]:
            model = KeyedVectors.load_word2vec_format(
                PATH_EMBEDDINGS + type_emb[0] + emb, binary=emb.endswith('.bin'))
            name = os.path.splitext(emb)[0]
            big_g[name] = {}
            logger.info('\n\n The name of embedding is: %s\n', name)
            # The vocabulary is invariant w.r.t. k and seed: extract it once per
            # embedding (it was recomputed inside the innermost seed loop).
            Vemb = utils.extract_w2v_vocab(model) if type_emb[0] == '/words/' else None
            for i, k in enumerate(ks):
                logger.info('\n k_value: %s\n', k)
                big_g[name][k] = {}
                for seed in seeds:
                    if type_emb[0] == '/cuis/':
                        d = measures.occurred_concept(model, seed[1].keys(), k_most_similar=k)
                        new_seed = []
                    elif type_emb[0] == '/words/':
                        processed_seed = umls_tables_processing.discarding_labels_oov(
                            Vemb, seed[1], all_labels=all_labels)
                        d, new_seed = measures.occurred_labels(
                            model, processed_seed, k_most_similar=k,
                            heuristic_aggregation=aggregation)
                    else:
                        continue
                    # Shared result assembly (the two branches only differed in d/new_seed)
                    norm_fact = measures.max_dcg(k)
                    big_g[name][k][seed[0]] = [
                        measures.pos_dcg(d, normalization=True, norm_fact=norm_fact),
                        measures.neg_dcg(d, normalization=True, norm_fact=norm_fact),
                        measures.percentage_dcg(d, k=k),
                        measures.iov(d),
                        measures.oov(d),
                        len(seed[1]),
                        new_seed]
                    logger.info(
                        '%s: pos_dcg: %.4f, neg_dcg: %.4f, perc_dcg: %.4f, iov: %d, oov: %d, #seed: %d\n',
                        seed[0],
                        big_g[name][k][seed[0]][0],
                        big_g[name][k][seed[0]][1],
                        big_g[name][k][seed[0]][2],
                        big_g[name][k][seed[0]][3],
                        big_g[name][k][seed[0]][4],
                        big_g[name][k][seed[0]][5])
                # The k = IoV (max-k) experimentation runs once, after the last regular k
                if max_k_switch and (i == len(ks) - 1):
                    big_g = max_ks_loop(big_g, seeds, type_emb, model, name, logger,
                                        all_labels=all_labels, aggregation=aggregation)
    logger.info('Time for relatedness pipeline computation is: %s',
                str(datetime.datetime.now().replace(microsecond=0) - a))
    return big_g
def relation_pipe(embeddings, L_umls, K_umls, dict_conso, logger, all_labels=False):
    '''
    -------------------------------------------------------------------------------------------------------------
    Pipeline for the relation-direction experimentation. For every embedding under PATH_EMBEDDINGS it computes
    the PCA directions of the relations (rela_direction) on both the L and K pair sets and their pairwise
    distances (distance_among_relations).

    Inputs: the (folder, [file names]) embedding tuples; the L and K pair-set dictionaries keyed by relation;
    dict_conso, the dictionary of labels per concept (used only for word embeddings, filtered against the
    vocabulary); the logger; the all_labels switch for taking all labels or only the best ranked by UMLS.

    Returns: full_relations, {embedding_file_name: distances among relations}.
    -------------------------------------------------------------------------------------------------------------
    '''
    full_relations = {}
    # Kept for the commented-out distance_pairs_components extension below
    full_pairs = {}
    timer_global = datetime.datetime.now().replace(microsecond=0)
    for type_emb in embeddings:
        for emb in type_emb[1]:
            timer_emb = datetime.datetime.now().replace(microsecond=0)
            # Embedding
            model = KeyedVectors.load_word2vec_format(
                PATH_EMBEDDINGS + type_emb[0] + emb, binary=emb.endswith('.bin'))
            logger.info('Embedding %s loaded\n', emb)
            # Defaults reset on EVERY iteration: previously `lab` and
            # dict_labels_inters_vemb were undefined on the first iteration (or stale
            # from a previous one) whenever the folder was neither /cuis/ nor /words/.
            lab = None
            dict_labels_inters_vemb = None
            if type_emb[0] == '/cuis/':
                lab = 'cui'
            elif type_emb[0] == '/words/':
                lab = 'labels'
                # Filter the dictionary of labels keeping only the labels-words present into the embedding
                Vemb = utils.extract_w2v_vocab(model)
                dict_labels_inters_vemb = umls_tables_processing.discarding_labels_oov(
                    Vemb, dict_conso, all_labels=all_labels)
            pcas_l = rela_direction(model, lab, ALL_RELAS, L_umls, logger,
                                    dict_=dict_labels_inters_vemb, all_labels=all_labels)
            pcas_k = rela_direction(model, lab, ALL_RELAS, K_umls, logger,
                                    dict_=dict_labels_inters_vemb, all_labels=all_labels)
            # Distance among the relation directions
            temp = distance_among_relations(pcas_l, pcas_k, RELAS,
                                            logger=logger, all_labels=all_labels)
            #temp_dict = distance_pairs_components(K_umls, pcas_l, model, ALL_RELAS, lab,
            #                                      dict_labels=dict_labels_inters_vemb,
            #                                      logger=logger, all_labels=all_labels)
            full_relations[emb] = temp
            #full_pairs[emb] = temp_dict
            logger.info('The time for computation of %s is %s\n', emb,
                        str(datetime.datetime.now().replace(microsecond=0) - timer_emb))
    logger.info('The time for global pcas distance computation is %s\n',
                str(datetime.datetime.now().replace(microsecond=0) - timer_global))
    return full_relations  #, full_pairs