def cardinality_embeddings():
    '''
    -------------------------------------------------------------------------------------------------------------
    The method takes no input and returns a dictionary with the vocabulary cardinality of each embedding.
    
    The outcome corresponds to the variable cardinality_vembs stored in Utilities.
    -------------------------------------------------------------------------------------------------------------    
    '''
    a = datetime.datetime.now().replace(microsecond=0)
    cuis = ('/cuis/', [f.name for f in os.scandir(PATH_EMBEDDINGS + '/cuis')
                       if f.is_file() and f.name != 'README.md'])
    words = ('/words/', [f.name for f in os.scandir(PATH_EMBEDDINGS + '/words')
                         if f.is_file() and f.name != 'README.md'])
    embeddings = [cuis, words]
    cardinality_vemb = {}
    for type_emb in embeddings:
        for emb in type_emb[1]:
            model = KeyedVectors.load_word2vec_format(PATH_EMBEDDINGS+type_emb[0]+emb, binary=emb.endswith('.bin'))
            name = os.path.splitext(emb)[0]
            vemb = utils.extract_w2v_vocab(model)
            cardinality_vemb[name] = len(vemb)

    #utils.inputs_save(cardinality_vemb, 'Utilities/cardinality_vembs')
    print(datetime.datetime.now().replace(microsecond=0)-a)
    return cardinality_vemb
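# Hypothetical usage sketch (not part of the original pipeline): it only illustrates
# the folder layout the scan above expects and the shape of the returned dictionary.
# The embedding file names and cardinalities below are invented for illustration.
def _example_cardinality_embeddings():
    # Expected layout under PATH_EMBEDDINGS:
    #     cuis/cui2vec.bin      -> loaded with binary=True
    #     words/pubmed_w2v.txt  -> loaded with binary=False
    cardinality = cardinality_embeddings()
    # Expected result: one entry per embedding file, keyed by the file name
    # without extension, e.g. {'cui2vec': 109053, 'pubmed_w2v': 2351706}
    for name, size in cardinality.items():
        print(name + ': |Vemb| = ' + str(size))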
def cardinality_kl(embeddings,
                   useful_rela,
                   L_umls,
                   K_umls,
                   dict_labels_for_L=None):
    '''
    ----------------------------------------------------------------------------------------------------------------
    The method saves a pickled dictionary reusable by other methods for plotting.
    
    It gets as input a list of strings with the names of the analyzed embeddings, a list of analyzed relationships
    and two pair sets, K and L. The variable dict_labels_for_L is not compulsory: it is used only by the filtering
    k_n_l_iov method for w2v word embeddings.
    
    The saved dictionaries record the cardinality of the original UMLS sets and of the filtered IoV ones, given
    an embedding.
    ----------------------------------------------------------------------------------------------------------------
    '''
    a = datetime.datetime.now().replace(microsecond=0)
    sets_relations_k = {}
    sets_relations_l = {}
    # Loop over the relations
    for type_emb in embeddings:
        for emb in type_emb[1]:
            model = KeyedVectors.load_word2vec_format(
                PATH_EMBEDDINGS + type_emb[0] + emb,
                binary=emb.endswith('.bin'))
            name = os.path.splitext(emb)[0]
            print('Embedding ' + str(name) + ' is analyzed')
            for rela in useful_rela:
                c = datetime.datetime.now().replace(microsecond=0)
                # Check type of embedding
                if type_emb[0] == '/cuis/':
                    dict_labels_inters_vemb = None
                    labs = 'cui'

                # Check type of embedding: for word embeddings the dictionary of labels per cui is required
                elif (type_emb[0] == '/words/') and (dict_labels_for_L is not None):
                    # Filter the dictionary of labels keeping only the labels-words present into the embedding
                    Vemb = utils.extract_w2v_vocab(model)
                    dict_labels_inters_vemb = umls_tables_processing.discarding_labels_oov(
                        Vemb, dict_labels_for_L)
                    labs = 'labels'

                # Filtering L and K sets for present labels inside the embedding
                l0, k0 = measures.k_n_l_iov(
                    L_umls[rela],
                    K_umls[rela],
                    model,
                    dict_labels_for_L=dict_labels_inters_vemb,
                    emb_type=labs)

                # Store number of filtered pairs (setdefault keeps entries from previously analyzed embeddings)
                sets_relations_l.setdefault(rela, {})[name] = np.shape(l0)[0]
                sets_relations_k.setdefault(rela, {})[name] = np.shape(k0)[0]
                print('Execution time for rela ' + rela + ' : ' +
                      str(datetime.datetime.now().replace(microsecond=0) - c) +
                      '\n')

        # Store the unfiltered UMLS cardinalities per relation, with and without repetitions ('wor')
        for rela in useful_rela:
            sets_relations_l.setdefault(rela, {})['L'] = np.shape(L_umls[rela])[0]
            sets_relations_k.setdefault(rela, {})['K'] = np.shape(K_umls[rela])[0]
            sets_relations_l[rela]['L wor'] = len(set(L_umls[rela]))
            sets_relations_k[rela]['K wor'] = len(set(K_umls[rela]))

    utils.inputs_save(
        sets_relations_l,
        SAVING_PATH + 'l_cardinality_per_rel' + str(datetime.datetime.now()))
    utils.inputs_save(
        sets_relations_k,
        SAVING_PATH + 'k_cardinality_per_rel' + str(datetime.datetime.now()))
    print('Execution time : ' +
          str(datetime.datetime.now().replace(microsecond=0) - a) + '\n')
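# A minimal sketch of the accumulation pattern used above: setdefault keeps the
# per-relation dictionary growing across embeddings instead of overwriting it.
# The relation name, embedding names and counts are invented for illustration.
def _example_cardinality_accumulation():
    sets_relations_l = {}
    for name, count in [('cui2vec', 1200), ('pubmed_w2v', 950)]:
        sets_relations_l.setdefault('may_treat', {})[name] = count
    sets_relations_l['may_treat']['L'] = 5000      # unfiltered cardinality
    sets_relations_l['may_treat']['L wor'] = 4800  # without repetitions
    # -> {'may_treat': {'cui2vec': 1200, 'pubmed_w2v': 950, 'L': 5000, 'L wor': 4800}}
    return sets_relations_l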
def analog_loop(
        path,
        binary_bool,
        name,
        type_emb,
        L,
        K,
        K_type,
        logger,
        analog_comp_dict,
        #sets_relations,
        metrics,
        dict_labels_for_L=None,
        all_labels=False):
    '''
    -------------------------------------------------------------------------------------------------------------------
    The method implements the logic: it calls the chosen metrics for analogy computation.
    It presents two different logics, one for cuis and another for words: for the latter, the dictionary of
    labels per cui is required and is first filtered against the embedding vocabulary.
    
    The method gets as inputs the loading model path, provided by analog_pipe, with a boolean representing the
    extension of the embedding file model (binary or not), a string representing the name of the embedding, the
    type of embedding ('/cuis/' or '/words/') and the two sets coming from analog_pipe. For further description
    of the sets, K_type, logger and metrics see there.
    
    analog_comp_dict is the global dictionary, one per embedding: it is the outcome of the method. It is stored
    once per relationship, so that a sudden break of the computation does not lose information.
    
    dict_labels_for_L and all_labels are inherited from the parent methods; they are variables used by the label
    processing logic.
    -------------------------------------------------------------------------------------------------------------------
    '''

    # Load the w2v model
    model = KeyedVectors.load_word2vec_format(path, binary=binary_bool)

    # Instantiation and log print
    analog_comp_dict[name] = {}
    logger.info('\n\n The name of embedding is: %s\n', name)
    dict_t = {}
    dict_t[name] = {}

    # Loop over the relations
    for rela in umls_tables_processing.USEFUL_RELA:
        logger.info('\n The RELA is: %s\n', rela)

        # Check type of embedding
        if type_emb == '/cuis/':
            c = datetime.datetime.now().replace(microsecond=0)
            l0, k0 = measures.k_n_l_iov(L[rela],
                                        K[rela],
                                        model,
                                        logger=logger,
                                        emb_type='cui')

            # Compute the analogy and store the results
            tmp = measures.analogy_compute(l0,
                                           k0,
                                           model,
                                           metrics,
                                           logger=logger,
                                           emb_type='cui')
            dict_t[name][rela] = tmp

            utils.inputs_save(
                dict_t,
                SAVING_PATH + name + K_type + str(datetime.datetime.now()))

            # Log of end of 'relation' operation
            logger.info(
                'The time for RELA %s, for embedding %s is %s', rela, name,
                str(datetime.datetime.now().replace(microsecond=0) - c))

        # Check type of embedding: for word embeddings the dictionary of labels per cui is required
        elif (type_emb == '/words/') and (dict_labels_for_L is not None):
            c = datetime.datetime.now().replace(microsecond=0)

            # Filter the dictionary of labels keeping only the labels-words present into the embedding
            Vemb = utils.extract_w2v_vocab(model)
            dict_labels_inters_vemb = umls_tables_processing.discarding_labels_oov(
                Vemb, dict_labels_for_L, all_labels=all_labels)
            # Filtering L and K sets for present labels inside the embedding
            l0, k0 = measures.k_n_l_iov(
                L[rela],
                K[rela],
                model,
                logger=logger,
                dict_labels_for_L=dict_labels_inters_vemb,
                emb_type='labels')

            tmp = measures.analogy_compute(
                l0,
                k0,
                model,
                metrics,
                logger=logger,
                dict_labels_for_L=dict_labels_inters_vemb,
                emb_type='labels')

            dict_t[name][rela] = tmp

            utils.inputs_save(
                dict_t,
                SAVING_PATH + name + K_type + str(datetime.datetime.now()))

            logger.info(
                'The time for RELA %s, for embedding %s is %s', rela, name,
                str(datetime.datetime.now().replace(microsecond=0) - c))
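# measures.analogy_compute is defined elsewhere in the project; as a hedged point
# of reference, this is a minimal sketch of the classic vector-offset analogy it
# presumably builds on, using only gensim's public API. The query terms are
# invented placeholders, not identifiers from the original code.
def _example_vector_offset_analogy(model):
    # For a source pair (a, b) and a query term c, the offset method ranks
    # candidates d by cosine similarity to the vector b - a + c.
    candidates = model.most_similar(positive=['b_term', 'c_term'],
                                    negative=['a_term'],
                                    topn=10)
    return [word for word, _ in candidates]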
def k_n_l_iov(L_umls_rel,
              K_umls_rel,
              model,
              logger=None,
              dict_labels_for_L=None,
              emb_type='cui'):
    '''
    ----------------------------------------------------------------------------------------------------
    Accessory method: it preprocesses the K and L sets, discarding the pairs with OOV elements in
    both sets. This cuts the computational cost, speeding up the analogy computation.
    
    The method gets as input a list of pairs for L, given a relation, and a list of pairs for the same
    relation for K, together with the model of the considered embedding.
    A logger is not compulsory: it is intended for background runs on a VM and for keeping track of
    errors. dict_labels_for_L is compulsory only in the emb_type = 'labels' case. It is the dictionary
    with all the unique concepts of the L set; for each key-concept it has as value a list of IoV labels.
    
    The method returns the polished K and L sets of pairs, all IoV.
    ----------------------------------------------------------------------------------------------------
    '''
    # Timer started
    ab = datetime.datetime.now().replace(microsecond=0)
    # Changing format to the two lists, K_umls and L_umls
    print(np.shape(np.array(list(zip(*L_umls_rel)))))
    l_x = np.array(list(zip(*L_umls_rel))[0])
    l_y = np.array(list(zip(*L_umls_rel))[1])
    l_stacked = np.stack((l_x, l_y))

    k_x = np.array(list(zip(*K_umls_rel))[0])
    k_y = np.array(list(zip(*K_umls_rel))[1])
    k_stacked = np.stack((k_x, k_y))

    stacked = [l_stacked, k_stacked]

    q = []

    # Optimization for the case where L and K are the same, avoiding a doubled computation
    if L_umls_rel == K_umls_rel:
        var = [[l_x, l_y]]
        print('L == K')
    else:
        var = [[l_x, l_y], [k_x, k_y]]
        print('L != K')

    if emb_type == 'cui':
        # Extraction of Vemb
        Vemb = np.array(list(utils.extract_w2v_vocab(model)))
        # Indices that sort Vemb in ascending order
        index = np.argsort(Vemb)
        # Vemb sorted in ascending (lexicographic, since entries are strings) order
        sorted_Vemb = Vemb[index]
        # Build presence masks to discard pairs with elements OOV w.r.t. Vemb
        for j in var:
            temp = []
            # i iterates over the two elements (columns) of the pairs
            for i in j:
                sorted_index_i = np.searchsorted(sorted_Vemb, i)
                yindex = np.take(index, sorted_index_i, mode="clip")
                # Mark the elements that fail the condition: False for the ones
                # inside the vocabulary, True for the OOV ones
                mask = Vemb[yindex] != i
                array_ids = np.where(mask)
                # Indices where the mask is True
                tmp = array_ids[0].tolist()
                # The indices of the first element of the pair are added to the indices of
                # the second element of the pair, yielding a single list of indices
                temp = temp + tmp
            # Avoid repeated indices
            q.append(list(set(temp)))

    elif (emb_type == 'labels') and (dict_labels_for_L is not None):
        # Polishing the set L keeping only concepts having labels IoV
        #dict_labels_iov = umls_tables_processing.discarding_labels_oov(Vemb, dict_labels_for_L)
        # Build presence masks to discard pairs with elements OOV w.r.t. Vemb
        for j in var:
            temp = []
            for i in j:
                # Mark the concepts that fail the condition: False for the ones
                # with labels inside the vocabulary, True for the OOV ones
                mask = np.array(
                    [len(dict_labels_for_L[u]) == 0 for u in i])
                array_ids = np.where(mask)
                tmp = array_ids[0].tolist()
                # The indices of the first element of the pair are added to the indices of
                # the second element of the pair, yielding a single list of indices
                temp = temp + tmp
            # Avoid repeated indices inside the list
            q.append(list(set(temp)))
    # Applying the mask to the previous stacked arrays
    tu = []
    for k, s in zip(q, stacked):
        # Concatenating the indices from the two elements of the pairs does not keep them sorted;
        # to avoid bugs, the indices are sorted before deleting the corresponding elements.
        k = np.sort(k)
        # Check for avoiding empty-list processing
        if len(k) > 0:
            # Deletion of the stored indices
            polished_ = np.delete(s, np.array(k), 1)
            new_k_umls = map(tuple, polished_.transpose())
            new_k_umls = list(new_k_umls)
            tu.append(new_k_umls)
        else:
            tu.append([])

    print(datetime.datetime.now().replace(microsecond=0) - ab)
    if logger:
        logger.info(str(datetime.datetime.now().replace(microsecond=0) - ab))

    if len(tu) == 1:
        print(len(tu[0]))
        return list(set(tu[0])), list(set(tu[0]))

    print(len(tu[0]))
    # Returning data with same format of input
    return list(set(tu[0])), list(set(tu[1]))
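# Self-contained numpy demonstration of the sorted-vocabulary membership test used
# in the 'cui' branch above: searchsorted finds a candidate position for each query
# in the sorted vocabulary, and comparing the value found there with the query
# yields True exactly for the out-of-vocabulary elements. Toy data only.
def _example_searchsorted_oov_mask():
    Vemb = np.array(['C03', 'C01', 'C02'])     # toy vocabulary
    index = np.argsort(Vemb)                   # [1, 2, 0]
    sorted_Vemb = Vemb[index]                  # ['C01', 'C02', 'C03']
    queries = np.array(['C02', 'C99', 'C01'])
    pos = np.searchsorted(sorted_Vemb, queries)
    yindex = np.take(index, pos, mode='clip')  # clip keeps positions in range
    mask = Vemb[yindex] != queries             # True where OOV
    assert mask.tolist() == [False, True, False]
    return np.where(mask)[0]                   # indices of the OOV queries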
def max_ks_loop(big_g, seeds, type_emb, model, name, logger, all_labels = False, aggregation = 'max'):
    '''
    -------------------------------------------------------------------------------------------------------------
    The method represents a further loop inside the relatedness loop, for the k_most_similar = k_max
    experimentation.
    
    Following a logic similar to the one implemented in regular_ks_loop, it is split into two sections, one
    for cuis and another one for words. Debugging output is provided via a logger.
    
    The method gets as input a global variable, big_g, which is used for storing the values: besides the ks
    chosen via parsing in the ks list variable, the value k = IoV is added. The method is called through a
    switch in the parent methods.
    
    The method gets a list of seeds, the type of embedding, the embedding model to analyze, the name of the
    analyzed embedding, the logger and a switch for choosing either all the labels inside the embedding's
    vocabulary at the same time or only the first one ranked by UMLS.
    -------------------------------------------------------------------------------------------------------------
    '''
    big_g[name]['max_k'] = {}    
    for seed in seeds:
        Vemb = utils.extract_w2v_vocab(model)
        if type_emb[0] == '/cuis/':
            k = len(set(Vemb).intersection(seed[1].keys()))
            logger.info('\n k_value: %s\n', k)
            if k <= 0:
                k = 1
            d = measures.occurred_concept(model, seed[1].keys(), k_most_similar=k)
            big_g[name]['max_k'][seed[0]] = [measures.pos_dcg(d, 
                                                        normalization = True, 
                                                        norm_fact = measures.max_dcg(k)),
                                       measures.neg_dcg(d, 
                                                        normalization = True,
                                                        norm_fact = measures.max_dcg(k)),
                                       measures.percentage_dcg(d, k=k),
                                       k,
                                       measures.oov(d),
                                       len(seed[1]), []]

        elif type_emb[0] == '/words/':
            processed_seed = umls_tables_processing.discarding_labels_oov(Vemb, seed[1], all_labels = all_labels)
            k = sum(1 for v in processed_seed.values() if len(v) > 0)
            logger.info('\n k_value: %s\n', k)
            if k <= 0:
                k = 1
            d, _ = measures.occurred_labels(model, 
                                            processed_seed,
                                            k_most_similar=k,
                                            heuristic_aggregation = aggregation)

            big_g[name]['max_k'][seed[0]] = [measures.pos_dcg(d, 
                                                              normalization = True,
                                                              norm_fact = measures.max_dcg(k)),
                                             measures.neg_dcg(d,
                                                              normalization = True,
                                                              norm_fact = measures.max_dcg(k)),
                                             measures.percentage_dcg(d, k=k),
                                             k,
                                             measures.oov(d),
                                             len(seed[1]), []]
            
        logger.info('%s: pos_dcg: %.4f, neg_dcg: %.4f, perc_dcg: %.4f, iov/k-NN: %d, oov: %d, #seed: %d\n', 
                    seed[0],
                    big_g[name]['max_k'][seed[0]][0],
                    big_g[name]['max_k'][seed[0]][1],
                    big_g[name]['max_k'][seed[0]][2],
                    big_g[name]['max_k'][seed[0]][3],
                    big_g[name]['max_k'][seed[0]][4],
                    big_g[name]['max_k'][seed[0]][5])  
    return big_g
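# measures.max_dcg is project-specific; as a hedged reference, the ideal DCG used
# as normalization factor for k retrieved items is commonly computed, assuming
# unit relevance per hit, as sum_{i=1..k} 1 / log2(i + 1). A minimal sketch:
def _example_max_dcg(k):
    return float(np.sum(1.0 / np.log2(np.arange(1, k + 1) + 1)))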
def regular_ks_loop(embeddings, ks, seeds, logger, max_k_switch, all_labels = False, aggregation = 'max'):
    '''
    -------------------------------------------------------------------------------------------------------------
    The method implements the logic for the relatedness and occurrence experimentation. It is split into two
    blocks: one for cuis and another one for labels. It iteratively loads an embedding model, among the ones
    inside the folder indicated in PATH_EMBEDDINGS, and applies the occurrence and dcg measures.
    
    The experimentation is performed for the chosen ks obtained via parsing, in the variable ks, for the seeds
    in the seeds list. A logger is provided for debugging and the max_k_switch enables the max_ks_loop.
    The all_labels switch allows taking either all the labels for a concept or only the one best ranked by UMLS.
    
    The method returns the global variable big_g.
    -------------------------------------------------------------------------------------------------------------    
    '''
    big_g = {}
    a = datetime.datetime.now().replace(microsecond=0)

    for type_emb in embeddings:
        for emb in type_emb[1]:
            model = KeyedVectors.load_word2vec_format(PATH_EMBEDDINGS+type_emb[0]+emb, binary=emb.endswith('.bin'))
            name = os.path.splitext(emb)[0]
            big_g[name] = {}
            logger.info('\n\n The name of embedding is: %s\n', name)
            for i, k in enumerate(ks):
                logger.info('\n k_value: %s\n', k)
                big_g[name][k] = {}
                for seed in seeds:
                    if type_emb[0] == '/cuis/':
                        d = measures.occurred_concept(model, seed[1].keys(), k_most_similar=k)
                        big_g[name][k][seed[0]] = [measures.pos_dcg(d, normalization = True, norm_fact = measures.max_dcg(k)),
                                                   measures.neg_dcg(d, normalization = True, norm_fact = measures.max_dcg(k)),
                                                   measures.percentage_dcg(d, k=k),
                                                   measures.iov(d),
                                                   measures.oov(d),
                                                   len(seed[1]), []]
                        
                    elif type_emb[0] == '/words/':
                        Vemb = utils.extract_w2v_vocab(model)
                        processed_seed = umls_tables_processing.discarding_labels_oov(Vemb, seed[1], all_labels = all_labels)
                        d, new_seed = measures.occurred_labels(model, 
                                                               processed_seed,
                                                               k_most_similar=k,
                                                               heuristic_aggregation = aggregation)
                        big_g[name][k][seed[0]] = [measures.pos_dcg(d, normalization = True, norm_fact = measures.max_dcg(k)),
                                                   measures.neg_dcg(d, normalization = True, norm_fact = measures.max_dcg(k)),
                                                   measures.percentage_dcg(d, k=k),
                                                   measures.iov(d),
                                                   measures.oov(d),
                                                   len(seed[1]),
                                                   new_seed]
                        
                    logger.info('%s: pos_dcg: %.4f, neg_dcg: %.4f, perc_dcg: %.4f, iov: %d, oov: %d, #seed: %d\n', 
                                seed[0],
                                big_g[name][k][seed[0]][0],
                                big_g[name][k][seed[0]][1],
                                big_g[name][k][seed[0]][2],
                                big_g[name][k][seed[0]][3],
                                big_g[name][k][seed[0]][4],
                                big_g[name][k][seed[0]][5])
                    
            # Run the k = IoV experimentation after the last parsed k, if enabled
            if max_k_switch and (i == len(ks) - 1):
                big_g = max_ks_loop(big_g, 
                                    seeds,
                                    type_emb,
                                    model,
                                    name,
                                    logger,
                                    all_labels = all_labels,
                                    aggregation = aggregation)
                    
    logger.info('Time for relatedness pipeline computation is: %s', str(datetime.datetime.now().replace(microsecond=0)-a))
    return big_g
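# Hypothetical driver sketch showing the data structures regular_ks_loop expects:
# `embeddings` as (subfolder, [file names]) tuples mirroring the PATH_EMBEDDINGS
# layout, `seeds` as (seed name, {cui: [labels]}) tuples. All names are invented.
def _example_run_regular_ks_loop(logger):
    embeddings = [('/cuis/', ['cui2vec.bin']),
                  ('/words/', ['pubmed_w2v.txt'])]
    ks = [1, 5, 10]
    seeds = [('diabetes_seed', {'C0011849': ['diabetes', 'diabetes mellitus']})]
    return regular_ks_loop(embeddings, ks, seeds, logger,
                           max_k_switch=True, all_labels=False)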
def relation_pipe(embeddings,
                  L_umls,
                  K_umls,
                  dict_conso,
                  logger,
                  all_labels=False):
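    '''
    -------------------------------------------------------------------------------------------------------------
    The method loads every embedding listed in embeddings, computes the principal directions of each relation
    for both the L and K pair sets via rela_direction, and measures the distances among relations via
    distance_among_relations.
    
    It gets as input the embeddings list, the two UMLS pair sets L and K, the dictionary of labels per concept
    dict_conso (used only for word embeddings), a logger and the all_labels switch.
    
    It returns a dictionary mapping each embedding file name to its relation-distance results.
    -------------------------------------------------------------------------------------------------------------
    '''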
    full_relations = {}
    full_pairs = {}
    timer_global = datetime.datetime.now().replace(microsecond=0)
    for type_emb in embeddings:
        for emb in type_emb[1]:
            timer_emb = datetime.datetime.now().replace(microsecond=0)
            # Embedding
            model = KeyedVectors.load_word2vec_format(
                PATH_EMBEDDINGS + type_emb[0] + emb,
                binary=emb.endswith('.bin'))
            logger.info('Embedding %s loaded\n', emb)

            if type_emb[0] == '/cuis/':
                lab = 'cui'
                dict_labels_inters_vemb = None

            if type_emb[0] == '/words/':
                lab = 'labels'
                # Filter the dictionary of labels keeping only the labels-words present into the embedding
                Vemb = utils.extract_w2v_vocab(model)
                dict_labels_inters_vemb = umls_tables_processing.discarding_labels_oov(
                    Vemb, dict_conso, all_labels=all_labels)

            pcas_l = rela_direction(model,
                                    lab,
                                    ALL_RELAS,
                                    L_umls,
                                    logger,
                                    dict_=dict_labels_inters_vemb,
                                    all_labels=all_labels)
            pcas_k = rela_direction(model,
                                    lab,
                                    ALL_RELAS,
                                    K_umls,
                                    logger,
                                    dict_=dict_labels_inters_vemb,
                                    all_labels=all_labels)

            temp = distance_among_relations(pcas_l,
                                            pcas_k,
                                            RELAS,
                                            logger=logger,
                                            all_labels=all_labels)

            # Distance
            #temp_dict = distance_pairs_components(K_umls,
            #                                      pcas_l,
            #                                      model,
            #                                      ALL_RELAS,
            #                                      lab,
            #                                      dict_labels = dict_labels_inters_vemb,
            #                                      logger = logger,
            #                                      all_labels = all_labels)

            full_relations[emb] = temp
            #full_pairs[emb] = temp_dict

            # The distance_pairs_components method could be enabled here

            logger.info(
                'The time for computation of %s is %s\n', emb,
                str(datetime.datetime.now().replace(microsecond=0) - timer_emb))

    logger.info(
        'The time for global pcas distance computation is %s\n',
        str(datetime.datetime.now().replace(microsecond=0) - timer_global))
    return full_relations  #, full_pairs
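# rela_direction is defined elsewhere in the module; a minimal sketch of the
# presumable core idea, under the assumption that the direction of a relation is
# taken as the first principal component of the pair difference vectors. The
# helper name is hypothetical.
def _example_relation_direction(model, pairs):
    # Difference vectors b - a for every in-vocabulary pair (a, b)
    diffs = np.array([model[b] - model[a]
                      for a, b in pairs
                      if a in model and b in model])
    # First principal component via SVD of the centered difference matrix
    centered = diffs - diffs.mean(axis=0)
    _, _, vt = np.linalg.svd(centered, full_matrices=False)
    return vt[0]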