    def __init__(self,
                 path_source_language,
                 path_target_language,
                 train_translation_dict_path,
                 number_tokens=5000):
        """Proc and Proc-b Method Class.

        Args:
            path_source_language (path): Path to the source language embeddings.
            path_target_language (path): Path to the target language embeddings.
            train_translation_dict_path (path): Path to train translation dictionary.
            number_tokens (int): Number of tokens per language.
        """
        # Build embeddings
        self.source_embedding_word, self.source_embedding_matrix = load_embedding(
            path_source_language, number_tokens)
        self.target_embedding_word, self.target_embedding_matrix = load_embedding(
            path_target_language, number_tokens)

        # Build train/test dictionary
        self.train_translation_source, self.train_translation_target = load_translation_dict(
            train_translation_dict_path)

        # Build word-to-index maps
        self.src_word2ind = {
            word: i
            for i, word in enumerate(self.source_embedding_word)
        }
        self.trg_word2ind = {
            word: i
            for i, word in enumerate(self.target_embedding_word)
        }

        # Build index-to-word maps
        self.src_ind2word = {
            i: word
            for i, word in enumerate(self.source_embedding_word)
        }
        self.trg_ind2word = {
            i: word
            for i, word in enumerate(self.target_embedding_word)
        }

        # Normalize Embeddings
        self.norm_src_embedding_matrix = normalize_matrix(
            self.source_embedding_matrix)
        self.norm_trg_embedding_matrix = normalize_matrix(
            self.target_embedding_matrix)
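Throughout this example, normalize_matrix is applied to whole embedding matrices so that nearest-neighbour retrieval can use dot products as cosine similarity. A minimal sketch of such a row-wise L2 normalization, assuming that is what the helper does (name and implementation are assumptions, not the project's code):

import numpy as np

def normalize_matrix_l2(matrix, eps=1e-12):
    # Hypothetical helper: scale every row (one embedding per row) to unit
    # L2 norm, so row dot products become cosine similarities.
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    return matrix / np.maximum(norms, eps)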
    def test_simulated_gene_data(self):
        """ Test DE on a simulated gene expression matrix (w/ no biological variance) """
        np.random.seed(0)

        sim_mat, cell_type, sim_de = simulate_matrix()

        # get scale
        scale = np.array(sim_mat.sum(axis=0)).squeeze()
        depth = (scale + 1) / np.median(scale)
        cov = [np.log(depth)]

        # precompute distribution params
        ntfmatrix = normalize_matrix(sim_mat, scale)
        alpha = atac_de.empirical_dispersion(ntfmatrix)

        # sseq_params = cr_de.compute_sseq_params(sim_mat)
        # alpha = sseq_params['phi_g']

        de_res = atac_de.NBGLM_differential_expression(
            sim_mat,
            np.flatnonzero(cell_type == 0),
            np.flatnonzero(cell_type == 1),
            model='nb',
            test_params={
                'cov': cov,
                'alpha': alpha
            },
            verbose=False)

        sensitivity, ppv = evaluate_de_res(de_res, sim_de)

        assert sensitivity >= 0.94
        assert ppv >= 0.94
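Here normalize_matrix(sim_mat, scale) is fed the per-cell count totals computed above. A minimal sketch of a depth normalization consistent with that call, assuming a features-by-cells matrix divided column-wise by the scale factors (the actual atac_de/cellranger helper may differ):

import numpy as np

def normalize_by_scale(matrix, scale):
    # Hypothetical depth normalization: divide each cell's column of counts
    # by that cell's scale factor (one scale entry per column).
    dense = matrix.toarray() if hasattr(matrix, "toarray") else np.asarray(matrix, dtype=float)
    return dense / np.asarray(scale)[np.newaxis, :]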
    def __init__(self,
                 inputs,
                 load_from=None,
                 rand_init_params=None,
                 gensim_w2v=None,
                 dic=None):
        '''rand_init_params: (rng, (voc_dim, emb_dim))
        '''
        self.inputs = inputs

        if load_from is not None:
            W_values = pickle.load(load_from)
        elif rand_init_params is not None:
            rng, (voc_dim, emb_dim) = rand_init_params
            W_values = rand_matrix(rng, 1, (voc_dim, emb_dim))

            if gensim_w2v is not None and dic is not None:
                assert gensim_w2v.vector_size == emb_dim

                n_sub = 0
                for idx, word in dic._idx2word.items():
                    if word in gensim_w2v.wv:
                        W_values[idx] = gensim_w2v.wv[word]
                        n_sub += 1
                print('Words initialized from word2vec: %d/%d' %
                      (n_sub, voc_dim))

            W_values = normalize_matrix(W_values)
        else:
            raise ValueError('Invalid initialization: provide load_from or rand_init_params.')

        self.W = theano.shared(value=W_values, name='emb_W', borrow=True)

        self.params = [self.W]
        self.outputs = self.W[inputs]
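The random branch above relies on a rand_matrix(rng, magnitude, shape) helper that is not shown; a minimal sketch of a uniform initializer with that assumed signature:

import numpy as np

def rand_matrix(rng, magnitude, shape):
    # Hypothetical initializer: uniform values in [-magnitude, magnitude),
    # cast to float32 so they can back a Theano shared variable.
    return np.asarray(rng.uniform(low=-magnitude, high=magnitude, size=shape),
                      dtype=np.float32)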
    def test_normalize_matrix_standard(self):
        """
        Implements unit tests related to the normalization of a matrix to the standard form.
        """
        matrix = numpy.array([[1.0, 2.0, 3.0, 4.0], [3.0, 4.0, 5.0, 6.0],
                              [5.0, 6.0, 7.0, 8.0]])
        matrix_array = numpy.ravel(matrix)
        normalized_matrix_array = preprocessing.scale(matrix_array)
        normalized_matrix = normalized_matrix_array.reshape(3, 4)
        new_matrix = utils.normalize_matrix(matrix, 0, 1)
        self.assertSequenceEqual(new_matrix.tolist(),
                                 normalized_matrix.tolist())

    def test_normalize_matrix_generic(self):
        """
        Implements unit tests related to the normalization of a matrix
        given a certain mean and standard deviation.
        """
        matrix = numpy.array([[1.0, 2.0, 3.0, 4.0], [3.0, 4.0, 5.0, 6.0],
                              [5.0, 6.0, 7.0, 8.0]])
        new_matrix = utils.normalize_matrix(matrix, 2, 4)
        new_mean = numpy.mean(numpy.ravel(new_matrix))
        new_std = numpy.std(numpy.ravel(new_matrix))
        self.assertAlmostEqual(new_mean, 2)
        self.assertAlmostEqual(new_std, 4)
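Both tests expect utils.normalize_matrix(matrix, mean, std) to rescale the values so that the flattened result has the requested mean and standard deviation; a minimal sketch consistent with those assertions (an assumption, not the project's code):

import numpy

def normalize_matrix_to(matrix, mean, std):
    # Hypothetical standardization: z-score the values, then shift/scale to
    # the requested mean and standard deviation.
    centered = (matrix - numpy.mean(matrix)) / numpy.std(matrix)
    return centered * std + mean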
Example No. 6
def word2vec():
    """Computes Word2vec embeddings, retrieving corpus from positive and negative tweet files.
    # Configs
        :dataset_version        - choose preprocessing
        :emb_dataset            - choose full or small dataset
        :embedding_dim          - size of embeddings
        :emb_context_window     - context window size
        :emb_word_min_count     - minimum word count for a word to appear in vocab
    """
    if verbose > 0:
        print_header_str('WORD2VEC')

    if (reuse_computed 
        and os.path.isfile(embeddings_dir+selected_embeddings_file+'.npy')
        and os.path.isfile(vocab_dir+vocab_file+'.pkl')):
        if verbose > 0:
            print('Reusing word2vec vocab:', vocab_file)
            print('Reusing word2vec embeddings:', selected_embeddings_file)
            print_header_str('DONE')
            print()
        return

    dataset = []

    for fn in [tweet_dir+emb_train_tweets_pos, tweet_dir+emb_train_tweets_neg,tweet_dir+emb_test_tweets]:
        with open(fn) as f:
            for line in f:
                tokens = line.strip().split()
                dataset.append(tokens)
    
    model = Word2Vec(dataset, 
                size=embedding_dim, window=emb_context_window, 
                min_count=emb_word_min_count, workers=6, 
                iter=embedding_epochs, sg=1, compute_loss=True)
    
    X = model.wv.vectors
    if embedding_norm:
        X = normalize_matrix(X)

    np.save(embeddings_dir+selected_embeddings_file, X)

    vocab = dict()
    for idx, line in enumerate(model.wv.vocab):
        vocab[line.strip()] = idx
        
    with open(vocab_dir+vocab_file+'.pkl', 'wb') as f:
        pickle.dump(vocab, f, pickle.HIGHEST_PROTOCOL)

    if verbose > 0:
        print('Vocabulary size:', len(vocab))
        print('Training loss:', model.get_latest_training_loss())
        print_header_str('DONE')
    print()
Example No. 7
def get_topology(drones, mean, std, nrows, ncolumns):
    """
    Computes a matrix topology given the locations of the drones 
    and the mean and standard deviation of the elements in the matrix
    """
    topology = np.zeros((nrows, ncolumns))
    for drone in drones:
        topology[int(drone[0])][int(drone[1])] = 1
    if settings.DISTANCE_ENCODING:
        topology = utils.sparse_to_distance(topology)
    if settings.NORMALIZE_DATA:
        topology = utils.normalize_matrix(topology, mean, std)
    return topology
Example No. 8
def save_model_embeddings(model, opts):
    """Save model embeddings."""

    dict_dir = opts.dict_dir
    embeddings = get_model_embeddings(model)
    if opts.normalize_embeddings:
        tf.logging.info("Normalize embeddings.")
        embeddings = utils.normalize_matrix(embeddings)
    embeddings[model_keys.PADDING_ID] = 0.0
    tf.logging.debug('save embeddings = \n{}'.format(embeddings))
    save_embeddings_path = os.path.join(dict_dir,
                                        model_keys.SAVE_EMBEDDINGS_NAME)
    np.save(save_embeddings_path, embeddings)
    def augment_dictionary(self, growth_rate, limit):
        """Augment the Dictionary based on Proc-B method.

        Args:
            growth_rate (float): Growth rate of the augmented dictionary.
            limit (int): Maximum size of the augmented dictionary.

        Returns:

        """
        # Find NN from projected source to (original) target embedding
        neighbors_projected_src_trg = find_nearest_neighbor(
            normalize_matrix(self.proj_embedding_source_target),
            self.norm_trg_embedding_matrix,
            use_batch=True)
        # Find NN from projected target embedding to (original) source embedding
        neighbors_projected_trg_src = find_nearest_neighbor(
            normalize_matrix(self.proj_embedding_target_source),
            self.norm_src_embedding_matrix,
            use_batch=True)
        # Find Matches
        matching = check_if_neighbors_match(neighbors_projected_src_trg,
                                            neighbors_projected_trg_src)
        # Make sure the dictionary does not grow too fast
        rank_pairs = [[key, value] for key, value in matching.items()]
        cnt = min(int(growth_rate * len(self.train_translation_source)), limit)
        if cnt < len(rank_pairs):
            rank_pairs = rank_pairs[:cnt]
        # Update the original dictionary
        self.train_translation_source = [
            self.src_ind2word[source_index]
            for source_index in [pair[0] for pair in rank_pairs]
        ]
        self.train_translation_target = [
            self.trg_ind2word[target_index]
            for target_index in [pair[1] for pair in rank_pairs]
        ]
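The bootstrapping step above keeps only mutual nearest neighbours before growing the dictionary. A minimal sketch of what check_if_neighbors_match could do under that reading, assuming both inputs map each index to its nearest-neighbour index in the other space (the helper's name and contract are assumptions):

import numpy as np

def mutual_nearest_neighbors(src_to_trg, trg_to_src):
    # Hypothetical mutual-NN filter: keep (source, target) pairs that point
    # back at each other, returned as a {source_index: target_index} dict.
    src_to_trg = np.asarray(src_to_trg)
    trg_to_src = np.asarray(trg_to_src)
    return {s: int(t) for s, t in enumerate(src_to_trg) if trg_to_src[t] == s}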
    def create_source_target_embedding(self,
                                       test_translation_dict_path,
                                       use_layer=11):
        """Create Embeddings for each word in the given dictonary (single words).

        Args:
            test_translation_dict_path: path to dictionary
            use_layer: Layer to take embeddings from

        Returns:

        """

        # Load Dictionary
        source_word_translation, target_word_translation = load_translation_dict(
            test_translation_dict_path)

        for source_index, source_word in tq.tqdm(
                enumerate(source_word_translation),
                total=len(source_word_translation)):
            # Word to index map
            self.src_word2ind[source_word] = source_index
            self.src_ind2word[source_index] = source_word
            embedding_each_term, _, _ = self.create_embedding_for_each_term(
                source_word, use_layer=use_layer)
            self.proj_embedding_source_target.append(
                embedding_each_term.squeeze())
            del embedding_each_term
            torch.cuda.empty_cache()

        for target_index, target_word in tq.tqdm(
                enumerate(target_word_translation),
                total=len(target_word_translation)):
            # Word to index map
            self.trg_word2ind[target_word] = target_index
            self.trg_ind2word[target_index] = target_word
            embedding_each_term, _, _ = self.create_embedding_for_each_term(
                target_word, use_layer=use_layer)
            self.target_embedding_matrix.append(embedding_each_term.squeeze())
            del embedding_each_term
            torch.cuda.empty_cache()

        self.proj_embedding_source_target = np.array(
            self.proj_embedding_source_target)
        self.target_embedding_matrix = np.array(self.target_embedding_matrix)
        self.norm_trg_embedding_matrix = normalize_matrix(
            self.target_embedding_matrix)
Example No. 11
def save_model_nce_params(model, opts):
    """Save model nce weights and biases variables."""

    dict_dir = opts.dict_dir
    nce_weights, nce_biases = get_model_nce_weights_and_biases(model)

    if opts.normalize_nce_weights:
        tf.logging.info("Normalize nce weihts.")
        nce_weights = utils.normalize_matrix(nce_weights)

    tf.logging.debug('save nce_weights = \n{}'.format(nce_weights))
    tf.logging.debug('save nce_biases = \n{}'.format(nce_biases))
    save_weights_path = os.path.join(dict_dir,
                                     model_keys.SAVE_NCE_WEIGHTS_NAME)
    save_biases_path = os.path.join(dict_dir, model_keys.SAVE_NCE_BIASES_NAME)
    np.save(save_weights_path, nce_weights)
    np.save(save_biases_path, nce_biases)
Example No. 12
def main(args, outs):
    """Run this for each method x clustering key combination from split"""
    ctg_mgr = ReferenceManager(args.reference_path)
    species = ctg_mgr.list_species()
    if args.filtered_peak_bc_matrix is None or len(species) > 1:
        return

    # Load the peak-BC matrix and a clustering and perform DE
    peak_matrix = cr_matrix.CountMatrix.load_h5_file(args.filtered_peak_bc_matrix)
    clustering_h5 = args.clustering_summary['h5'][args.method]
    clustering = SingleGenomeAnalysis.load_clustering_from_h5(clustering_h5, args.clustering_key)
    mask = clustering.clusters == args.cluster
    clustering.clusters[mask] = 1
    clustering.clusters[np.logical_not(mask)] = 2

    # find depth using peak matrix and normalize
    scale = np.array(peak_matrix.m.sum(axis=0)).squeeze()
    depth = (scale + 1) / np.median(scale)

    cov_peak = [np.log(depth)]
    diffexp_peak = nb2_diffexp.run_differential_expression(peak_matrix.m, clustering.clusters, model='poisson',
                                                           impute_rest=True, test_params={'cov': cov_peak}, verbose=True)

    # find empirical estimates of alpha
    tf_matrix = None
    diffexp_tf = None
    # do DE on tf-BC matrix
    if args.filtered_tf_bc_matrix is not None:
        tf_matrix = cr_matrix.CountMatrix.load_h5_file(args.filtered_tf_bc_matrix)
        ntfmatrix = normalize_matrix(tf_matrix.m, scale)
        alpha_tf = nb2_diffexp.empirical_dispersion(ntfmatrix)
        barcode_GC = get_barcode_gc(args.reference_path, args.peaks, peak_matrix)
        cov_tf = [barcode_GC, np.log(depth)]
        diffexp_tf = nb2_diffexp.run_differential_expression(tf_matrix.m, clustering.clusters, model='nb', impute_rest=True,
                                                             test_params={'cov': cov_tf, 'alpha': alpha_tf}, verbose=True)

    # vstack
    diffexp = diffexp_peak if tf_matrix is None else cr_diffexp.DIFFERENTIAL_EXPRESSION(np.vstack([diffexp_peak.data, diffexp_tf.data]))

    # write out temp file
    np.savetxt(outs.tmp_diffexp, diffexp.data, delimiter=',')
    outs.enrichment_analysis = None
    outs.enrichment_analysis_summary = None
Example No. 13
def build_input_structure(scenarios, results, list_scenarios, list_topologies):
    """
    Constructs the structure that is given as input to our machine learning model.
    """
    model_struct_orig = []
    model_prediction = []
    # First, the topologies without any transformation
    index_scenario = 0
    for scenario_id in list_scenarios:
        scenario_begin_index = (scenario_id - 1) * settings.SCENARIO_TOPOLOGIES_NO
        scenario_matrix = datarate_matrix(scenarios[scenario_id - 1])
        if(settings.NORMALIZE_DATA):
            mean, std = utils.stats_matrix(scenario_matrix)
        for topology_id in list_topologies[index_scenario]:
            index_results = scenario_begin_index + topology_id - 1
            topology_matrix, qualities_list = drones_matrix(results.loc[index_results])
            if(settings.DISTANCE_ENCODING):
                topology_matrix = utils.sparse_to_distance(topology_matrix)
            if(settings.NORMALIZE_DATA):
                topology_matrix = utils.normalize_matrix(topology_matrix, mean, std)
            model_struct_orig.append([scenario_matrix, topology_matrix])
            model_prediction.append(qualities_list)
        index_scenario += 1
    if(settings.USE_TRANSFORMATIONS):
        # Then the topologies with a 90-degree rotation
        model_struct_rot1, model_pred_rot1 = build_input_structure_transformation(model_struct_orig, model_prediction, list_scenarios, list_topologies, utils.rotate, 90)
        # Then the topologies with a 180-degree rotation
        model_struct_rot2, model_pred_rot2 = build_input_structure_transformation(model_struct_orig, model_prediction, list_scenarios, list_topologies, utils.rotate, 180)
        # Then the topologies with a 270-degree rotation
        model_struct_rot3, model_pred_rot3 = build_input_structure_transformation(model_struct_orig, model_prediction, list_scenarios, list_topologies, utils.rotate, 270)
        # Then the topologies with a symmetry over the 0 axis
        model_struct_sym1, model_pred_sym1 = build_input_structure_transformation(model_struct_orig, model_prediction, list_scenarios, list_topologies, utils.symmetric, 0)
        # Then the topologies with a symmetry over the 45 axis
        model_struct_sym2, model_pred_sym2 = build_input_structure_transformation(model_struct_orig, model_prediction, list_scenarios, list_topologies, utils.symmetric, 45)
        # Then the topologies with a symmetry over the 90 axis
        model_struct_sym3, model_pred_sym3 = build_input_structure_transformation(model_struct_orig, model_prediction, list_scenarios, list_topologies, utils.symmetric, 90)
        # Then the topologies with a symmetry over the 135 axis
        model_struct_sym4, model_pred_sym4 = build_input_structure_transformation(model_struct_orig, model_prediction, list_scenarios, list_topologies, utils.symmetric, 135)

        return np.array(model_struct_orig + model_struct_rot1 + model_struct_rot2 + model_struct_rot3 + model_struct_sym1 + model_struct_sym2 + model_struct_sym3 + model_struct_sym4),(model_prediction + model_pred_rot1 + model_pred_rot2 + model_pred_rot3 + model_pred_sym1 + model_pred_sym2 + model_pred_sym3 + model_pred_sym4)
    else:
        return np.array(model_struct_orig), model_prediction      
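The augmentation above passes utils.rotate and utils.symmetric as transformation callbacks. A minimal sketch of a right-angle rotation helper matching the (matrix, angle) signature used here (an assumption about what utils.rotate does):

import numpy as np

def rotate(matrix, angle):
    # Hypothetical helper: rotate a 2D matrix by a multiple of 90 degrees.
    assert angle % 90 == 0, "only right-angle rotations are supported"
    return np.rot90(np.asarray(matrix), k=(angle // 90) % 4)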
Example No. 14
    def normalize_matrix(self):
        utils.normalize_matrix(self.matrix)
Example No. 15
def auto_overlap(prex=None,
                 graph_name=None,
                 emb_method_name1=None,
                 emb_method_name2=None,
                 binNum=None):
    time_start = time.time()
    print('----------------------------------------------------------')
    print("dataset: " + graph_name + '\n' + "baselines:" + emb_method_name1 +
          "," + emb_method_name2)

    results_base_dir = 'D:\hybridrec//results//'
    all_file_dir = 'D:\hybridrec\dataset\split_train_test//' + prex
    results_dir = 'D:\hybridrec/results//' + prex
    graph_results_dir = results_dir + graph_name + '//'

    path_scores_method1 = results_base_dir + prex + graph_name + "//" + graph_name + "_" + emb_method_name1 + "_scores.mat"
    path_scores_method2 = results_base_dir + prex + graph_name + "//" + graph_name + "_" + emb_method_name2 + "_scores.mat"

    if not (os.path.exists(path_scores_method1)
            and os.path.exists(path_scores_method2)):
        print("dataset: " + graph_name + '----' + "baselines:" +
              emb_method_name1 + "," + emb_method_name2 + ': 分数未完全计算')

    if os.path.exists(path_scores_method1) and os.path.exists(
            path_scores_method2):
        # Get the normalized scores
        scores_matrix_one_dict = (loadmat(path_scores_method1))
        scores_matrix_two_dict = (loadmat(path_scores_method2))
        scores_matrix_one = scores_matrix_one_dict['scores']
        scores_matrix_two = scores_matrix_two_dict['scores']
        if emb_method_name1 not in all_embedding_methods:
            scores_matrix_one = csr_matrix(np.triu(scores_matrix_one.A,
                                                   k=1))  # k=1 excludes the diagonal
        if emb_method_name2 not in all_embedding_methods:
            scores_matrix_two = csr_matrix(np.triu(scores_matrix_two.A, k=1))
        scores_matrix_one_norm = normalize_matrix(
            csr_matrix1=csr_matrix(scores_matrix_one))  # dropping the csr_matrix() wrapper from the argument would ...
        scores_matrix_two_norm = normalize_matrix(
            csr_matrix1=csr_matrix(scores_matrix_two))

        # Get train_binary and test_binary
        graph_train_path = get_trainset_path(base_dir=all_file_dir,
                                             graph_name=graph_name,
                                             connected_pattern='undirected',
                                             from_zeros_one='0')
        graph_test_path = get_testset_path(base_dir=all_file_dir,
                                           graph_name=graph_name)
        G = read_graph(weighted=0, input=graph_train_path, directed=0)
        train_binary = csr_matrix(nx.convert_matrix.to_scipy_sparse_matrix(G))
        train_binary = csr_matrix(np.triu(train_binary.A, k=1))
        test_binary = get_test_matrix_binary(graph_test_path=graph_test_path,
                                             N=train_binary.shape[0])

        # Load the raw (unnormalized) plus scores
        plus_scores_name = 'plus_' + graph_name + '_' + emb_method_name1 + '_' + emb_method_name2 + '_scores.mat'
        plus_scores_path = graph_results_dir + plus_scores_name
        scores_matrix_plus_dict = (loadmat(plus_scores_path))
        scores_matrix_plus = scores_matrix_plus_dict['scores']

        # Load the raw (unnormalized) multiply scores
        multiply_scores_name = 'multiply_' + graph_name + '_' + emb_method_name1 + '_' + emb_method_name2 + '_scores.mat'
        multiply_scores_path = graph_results_dir + multiply_scores_name
        scores_matrix_multiply_dict = (loadmat(multiply_scores_path))
        scores_matrix_multiply = scores_matrix_multiply_dict['scores']

        # Load the raw (unnormalized) MLP scores
        mlp_scores_name = 'mlp_' + graph_name + '_' + emb_method_name1 + '_' + emb_method_name2 + '_scores.mat'
        mlp_scores_path = graph_results_dir + mlp_scores_name
        scores_matrix_mlp_dict = (loadmat(mlp_scores_path))
        scores_matrix_mlp = scores_matrix_mlp_dict['scores']

        # Normalize the hybrid scores
        scores_matrix_plus_norm = normalize_matrix(
            csr_matrix1=scores_matrix_plus)
        scores_matrix_multiply_norm = normalize_matrix(
            csr_matrix1=scores_matrix_multiply)
        scores_matrix_mlp_norm = normalize_matrix(
            csr_matrix1=scores_matrix_mlp)

        # Compute the rasterization grids for plus, multiply, mlp, and PNR
        mlp_path = results_base_dir + prex + graph_name + "//" + "mlp_" + graph_name + "_" + emb_method_name1 + "_" + emb_method_name2 + "_50_count.mat"
        mlp_dict = (loadmat(mlp_path))
        mlp_raster_grids = mlp_dict["count"]
        multiply_path = results_base_dir + prex + graph_name + "//" + "multiply_" + graph_name + "_" + emb_method_name1 + "_" + emb_method_name2 + "_50_count.mat"
        multiply_dict = (loadmat(multiply_path))
        multiply_raster_grids = multiply_dict["count"]
        plus_path = results_base_dir + prex + graph_name + "//" + "plus_" + graph_name + "_" + emb_method_name1 + "_" + emb_method_name2 + "_50_count.mat"
        plus_dict = (loadmat(plus_path))
        plus_raster_grids = plus_dict["count"]

        # plus_raster_grids = rasterization_grids(binNum=binNum,
        #                                        train_binary=train_binary,
        #                                        scores_matrix_DNN=scores_matrix_plus_norm,
        #                                        scores_matrix_one_norm=scores_matrix_one_norm,
        #                                        scores_matrix_two_norm=scores_matrix_two_norm)
        # multiply_raster_grids = rasterization_grids(binNum=binNum,
        #                                        train_binary=train_binary,
        #                                        scores_matrix_DNN=scores_matrix_multiply_norm,
        #                                        scores_matrix_one_norm=scores_matrix_one_norm,
        #                                        scores_matrix_two_norm=scores_matrix_two_norm)
        # mlp_raster_grids = rasterization_grids(binNum=binNum,
        #                                        train_binary=train_binary,
        #                                        scores_matrix_DNN=scores_matrix_mlp_norm,
        #                                        scores_matrix_one_norm=scores_matrix_one_norm,
        #                                        scores_matrix_two_norm=scores_matrix_two_norm)
        PNR_path = results_base_dir + prex + graph_name + "//" + "PNR2_" + graph_name + "_" + emb_method_name1 + "_" + emb_method_name2 + "_50_count.mat"
        PNR_dict = (loadmat(PNR_path))
        PNR_raster_grids = PNR_dict["count"]

        exist_binary = csr_matrix(np.triu(train_binary.A, k=1))  # k=1 excludes the diagonal
        nonexist_binary = csr_matrix(
            np.triu(np.ones(exist_binary.shape), k=1) - exist_binary.A)
        # Get the nonexist_scores_list for plus
        nonexist_scores_plus_list = transfer_scores_PNR(
            scores_matrix_one_norm=scores_matrix_one_norm,
            scores_matrix_two_norm=scores_matrix_two_norm,
            train_binary=train_binary,
            PNR=plus_raster_grids,
            interval=float((1.0 - 0.0) / binNum),
            binNum=binNum)
        # Get the nonexist_scores_list for multiply
        nonexist_scores_multiply_list = transfer_scores_PNR(
            scores_matrix_one_norm=scores_matrix_one_norm,
            scores_matrix_two_norm=scores_matrix_two_norm,
            train_binary=train_binary,
            PNR=multiply_raster_grids,
            interval=float((1.0 - 0.0) / binNum),
            binNum=binNum)
        # Get the nonexist_scores_list for mlp
        nonexist_scores_mlp_list = transfer_scores_PNR(
            scores_matrix_one_norm=scores_matrix_one_norm,
            scores_matrix_two_norm=scores_matrix_two_norm,
            train_binary=train_binary,
            PNR=mlp_raster_grids,
            interval=float((1.0 - 0.0) / binNum),
            binNum=binNum)
        # Get the nonexist_scores_list for PNR
        nonexist_scores_PNR_list = transfer_scores_PNR(
            scores_matrix_one_norm=scores_matrix_one_norm,
            scores_matrix_two_norm=scores_matrix_two_norm,
            train_binary=train_binary,
            PNR=PNR_raster_grids,
            interval=float((1.0 - 0.0) / binNum),
            binNum=binNum)

        # Get the thresholds
        E_test = np.sum(test_binary.A)
        thresold_plus = get_list_thresold(nonexist_scores_plus_list, L=E_test)
        thresold_multiply = get_list_thresold(nonexist_scores_multiply_list,
                                              L=E_test)
        thresold_mlp = get_list_thresold(nonexist_scores_mlp_list, L=E_test)
        thresold_PNR = get_list_thresold(nonexist_scores_PNR_list, L=E_test)

        # Trick here: use L = 1/2 |E_test|
        # thresold_plus = int(thresold_plus*0.5)
        # thresold_multiply = int(thresold_multiply * 0.5)
        # thresold_mlp = int(thresold_mlp * 0.5)
        # thresold_PNR = int(thresold_PNR * 0.5)

        # Modify the grids
        plus_raster_grids = plus_raster_grids.A
        multiply_raster_grids = multiply_raster_grids.A
        mlp_raster_grids = mlp_raster_grids.A
        PNR_raster_grids = PNR_raster_grids.A
        # np.where(plus_raster_grids > thresold_plus, plus_raster_grids, 0)
        # np.where(multiply_raster_grids > thresold_multiply, multiply_raster_grids, 0)
        # np.where(mlp_raster_grids > thresold_mlp, mlp_raster_grids, 0)
        # np.where(PNR_raster_grids > thresold_PNR, PNR_raster_grids, 0)
        plus_raster_grids[plus_raster_grids <= thresold_plus] = 0.0
        multiply_raster_grids[multiply_raster_grids <= thresold_multiply] = 0.0
        mlp_raster_grids[mlp_raster_grids <= thresold_mlp] = 0.0
        PNR_raster_grids[PNR_raster_grids <= thresold_PNR] = 0.0

        plus_raster_grids[plus_raster_grids >= thresold_plus] = 1.0
        multiply_raster_grids[multiply_raster_grids >= thresold_multiply] = 1.0
        mlp_raster_grids[mlp_raster_grids >= thresold_mlp] = 1.0
        PNR_raster_grids[PNR_raster_grids >= thresold_PNR] = 1.0

        # Plotting
        # colors = ['OrangeRed', 'darkseagreen', 'dodgerblue', 'blueviolet']
        colors = ['Red', 'green', 'blue', 'purple']
        result = np.float32(PNR_raster_grids)
        result = cv2.GaussianBlur(result, (5, 5),
                                  0)  # (5, 5) is the Gaussian kernel size; sigma is 0
        title = graph_name + '-PNR-' + emb_method_name1 + '-' + emb_method_name2
        plot_contourf_overlap(result=result, title=title, color=colors[0])

        result = np.float32(plus_raster_grids)
        result = cv2.GaussianBlur(result, (5, 5),
                                  0)  # (5, 5) is the Gaussian kernel size; sigma is 0
        title = graph_name + '-plus-' + emb_method_name1 + '-' + emb_method_name2
        plot_contourf_overlap(result=result, title=title, color=colors[1])

        result = np.float32(multiply_raster_grids)
        result = cv2.GaussianBlur(result, (5, 5),
                                  0)  # (5, 5) is the Gaussian kernel size; sigma is 0
        title = graph_name + '-multiply-' + emb_method_name1 + '-' + emb_method_name2
        plot_contourf_overlap(result=result, title=title, color=colors[2])

        result = np.float32(mlp_raster_grids)
        result = cv2.GaussianBlur(result, (5, 5),
                                  0)  # (5, 5) is the Gaussian kernel size; sigma is 0
        title = graph_name + '-mlp-' + emb_method_name1 + '-' + emb_method_name2
        plot_contourf_overlap(result=result, title=title, color=colors[3])

        # # Compute the rasterization grids for plus
        # plus_raster_grids = rasterization_grids(binNum=plus_binNum,
        #                                        train_binary=train_binary,
        #                                        scores_matrix_DNN=scores_matrix_plus_norm,
        #                                        scores_matrix_one_norm=scores_matrix_one_norm,
        #                                        scores_matrix_two_norm=scores_matrix_two_norm)
        # plus_raster_grids = np.log10(plus_raster_grids) # raises an error due to -inf
        # plus_raster_grids = normalize_matrix_full(csr_matrix1=csr_matrix(plus_raster_grids))
        # plus_raster_grids = better_show_grids(csr_matrix1=plus_raster_grids)
        #
        # source = np.float32(plus_raster_grids.A)
        # result = cv2.GaussianBlur(source, (5, 5), 0)
        # title = graph_name + '-' + 'plus' +'-' + emb_method_name1 + '-' + emb_method_name2
        # plot_contourf(result=result, title=title, binNum=10)
        #

        time_end = time.time()
        print("It takes : " + str((time_end - time_start) / 60.0) + "  mins.")
        pass
def clew_induction(path_source_language,
                   path_target_language,
                   train_translation_dict_path,
                   train_translation_dict_1k_path,
                   test_translation_dict_path,
                   new_test_translation_path,
                   name_translation,
                   number_tokens=100000,
                   save_embedding=False):
    """Induce Cross Lingual Word Embeddings (Proc, Proc-B, VecMap) and Evaluate them on BLI task.

    Args:
        path_source_language (path): Path to Source Embedding.
        path_target_language (path): Path to Target Embedding.
        train_translation_dict_path (path): Path to the 5k training translation dictionary.
        train_translation_dict_1k_path (path): Path to the 1k training translation dictionary.
        test_translation_dict_path (path): Path to the translation test dictionary.
        new_test_translation_path (path): Path where the test dictionary cut to the vocabularies is written.
        name_translation (str): Name used for the saved files.
        number_tokens (int): Number of tokens used for the monolingual word embeddings.
        save_embedding (boolean): Whether to save the created CLWE.

    Returns:

    """

    print(
        "\nFirst, we cut the test dictionaries to the monolingual vocabularies:"
    )
    cut_dictionary_to_vocabulary(path_source_language,
                                 path_target_language,
                                 test_translation_dict_path,
                                 new_test_translation_path,
                                 number_tokens=number_tokens)

    test_translation_dict_path = new_test_translation_path

    # PROC - 5K dictionary
    print("--------------------------------")
    print("\nCreate procrustes model with 5000 translation pairs")
    proc_algorithm = Projection_based_clwe(path_source_language,
                                           path_target_language,
                                           train_translation_dict_path,
                                           number_tokens=number_tokens)

    proc_algorithm.proc(source_to_target=True)
    Evaluator(proc_algorithm, test_translation_dict_path).evaluation_on_BLI()
    if save_embedding:
        save_clew(proc_algorithm, name_translation + "_proc_5k")
    del proc_algorithm

    # PROC - 1K dictionary
    print("--------------------------------")
    print("\nCreate procrustes model with 1000 translation pairs")
    proc_algorithm = Projection_based_clwe(path_source_language,
                                           path_target_language,
                                           train_translation_dict_1k_path,
                                           number_tokens=number_tokens)

    proc_algorithm.proc(source_to_target=True)
    Evaluator(proc_algorithm, test_translation_dict_path).evaluation_on_BLI()
    if save_embedding:
        save_clew(proc_algorithm, name_translation + "_proc_1k")
    del proc_algorithm

    # PROC-B - 1K dictionary
    print("--------------------------------")
    print(
        "\nCreate procrustes bootstrapping model with 1000 translation pairs")
    proc_b_algorithm = Projection_based_clwe(path_source_language,
                                             path_target_language,
                                             train_translation_dict_1k_path,
                                             number_tokens=number_tokens)

    proc_b_algorithm.proc_bootstrapping(growth_rate=1.5, limit=10000)
    Evaluator(proc_b_algorithm, test_translation_dict_path).evaluation_on_BLI()
    if save_embedding:
        save_clew(proc_b_algorithm, name_translation + "_proc_b_1k")
    del proc_b_algorithm

    # Unsupervised VecMap
    print("--------------------------------")
    print("\nCreate VecMap model")
    vec_map = VecMap(path_source_language,
                     path_target_language,
                     number_tokens=number_tokens)
    # Please use GPU if available and install cupy
    use_gpu = True
    vec_map.build_seed_dictionary(use_gpu)
    vec_map.training_loop(use_gpu)
    Evaluator(vec_map, test_translation_dict_path).evaluation_on_BLI()
    if save_embedding:
        vec_map.proj_embedding_source_target = normalize_matrix(
            vec_map.proj_embedding_source_target)
        vec_map.target_embedding_matrix = vec_map.norm_trg_embedding_matrix
        save_clew(vec_map, name_translation + "_vecmap")
    del vec_map

    # Text Encoder First Layer
    print("--------------------------------")
    print("\nCreate  Text Encoder First Layer model")
    xlm_r = TextEncoders("xlm-r")
    xlm_r.create_source_target_embedding(test_translation_dict_path,
                                         use_layer=1)
    Evaluator(xlm_r, test_translation_dict_path).evaluation_on_BLI()
    del xlm_r

    # Text Encoder Last Layer
    print("--------------------------------")
    print("\nCreate  Text Encoder Last Layer model")
    xlm_r_last_layer = TextEncoders("xlm-r")
    xlm_r_last_layer.create_source_target_embedding(test_translation_dict_path,
                                                    use_layer=12)
    Evaluator(xlm_r_last_layer, test_translation_dict_path).evaluation_on_BLI()
    del xlm_r_last_layer
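A hedged sketch of how this pipeline might be invoked; the paths and the language pair below are placeholders, not the repository's actual data layout:

clew_induction(
    path_source_language="embeddings/wiki.en.vec",          # placeholder path
    path_target_language="embeddings/wiki.de.vec",          # placeholder path
    train_translation_dict_path="dicts/en-de.train.5k.txt",
    train_translation_dict_1k_path="dicts/en-de.train.1k.txt",
    test_translation_dict_path="dicts/en-de.test.txt",
    new_test_translation_path="dicts/en-de.test.cut.txt",
    name_translation="en_de",
    number_tokens=100000,
    save_embedding=False)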
Example No. 17
                                         connected_pattern='undirected',
                                         from_zeros_one='0')
    G = read_graph(weighted=0, input=graph_train_path, directed=0)
    train_binary = sp.csr_matrix(nx.convert_matrix.to_scipy_sparse_matrix(G))
    train_binary = sp.csr_matrix(np.triu(train_binary.A, k=1))
    # train_binary_full = train_binary.A
    # or train_binary = sp.csr_matrix(np.array(nx.to_numpy_matrix(G)))



    # Build the exist and nonexist binaries
    exist_binary = sp.csr_matrix(np.triu(train_binary.A, k=1))  # k=1 excludes the diagonal
    nonexist_binary = sp.csr_matrix(np.triu(np.ones(exist_binary.shape), k=1) - exist_binary.A)

    # Normalize scores to [0.0, 1.0]
    scores_matrix_one_norm = normalize_matrix(csr_matrix1=scores_matrix_one)
    scores_matrix_two_norm = normalize_matrix(csr_matrix1=scores_matrix_two)
    # plot_matrix(scores_matrix_one_norm.A)
    # plot_matrix(scores_matrix_two_norm.A)

    del scores_matrix_one, scores_matrix_two
    gc.collect()



    # Divide into bins
    val_max = 1.0
    val_min = 0.0
    # bin_array = sorted(divide_bin(val_max = val_max, val_min = val_min, binNum = binNum))
    interval = float((val_max - val_min) / binNum)
Example No. 18
def glove():
    """Computes GloVe embeddings given a vocabulary and a corresponding cooccurrence matrix.
    # Configs
        :dataset_version        - choose preprocessing
        :emb_dataset            - choose full or small dataset
        :embedding_dim          - size of embeddings
        :emb_context_window     - context window size
        :emb_word_min_count     - minimum word count for a word to appear in vocab
        :glove_polarization     - polarization factor for embedding initialization (with rel. freq)
    """
    if verbose > 0:
        print_header_str('EMBEDDINGS')
    
    if reuse_computed and os.path.isfile(embeddings_dir+selected_embeddings_file+'.npy'):
        if verbose > 0:
            print('Reusing embeddings:', selected_embeddings_file)
            print_header_str('DONE')
            print()
        return

    if verbose > 0:
        print("Loading cooccurrence matrix.")

    with open(vocab_dir+cooc_file+'.pkl', 'rb') as f:
        cooc = pickle.load(f)

    nmax = 100
    
    if verbose > 0:
        print("\tUsing nmax =", nmax, ", with cooc.max() =", cooc.max(),end='\n\n')

        print("Initializing embeddings with U~[-.5,.5] distribution: ", 
            (cooc.shape[0], embedding_dim+1),
            (cooc.shape[1], embedding_dim+1), flush=True, end='\n\n')
    
    xs = np.random.uniform(size=(cooc.shape[0], embedding_dim+1)) - .5
    ys = np.random.uniform(size=(cooc.shape[1], embedding_dim+1)) - .5

    xs /= (embedding_dim+1)
    ys /= (embedding_dim+1)
    # Bias term is incorporated in word embedding
    xs[:,embedding_dim] = 1
    ys[:,embedding_dim-1] = 1
    
    if glove_polarization > 0:
        if verbose > 0:
            print('Adding polarization to random initial embeddings. Factor:', glove_polarization, end='\n\n')
        ### Get bias for positive and negative words ###
        vocab_pos = pickle.load(open(tweet_dir+emb_polar_vocab.format('pos'), 'rb'))
        vocab_neg = pickle.load(open(tweet_dir+emb_polar_vocab.format('neg'), 'rb'))
        polarization = sentiment_polarization(vocab_pos, vocab_neg)

        vocab = pickle.load(open(vocab_dir+vocab_file+'.pkl', 'rb'))

        ############### Add polarization ################
        split = (embedding_dim-1)//2
        for word,id in vocab.items():
            if word in polarization:
                polar = polarization[word]
            else:
                polar = .5
            xs[id,:split] += glove_polarization*polar / (embedding_dim+1)
            xs[id,split:embedding_dim-1] -= glove_polarization*(1-polar) / (embedding_dim+1)
            ys[id,:split] += glove_polarization*polar / (embedding_dim+1)
            ys[id,split:embedding_dim-1] -= glove_polarization*(1-polar) / (embedding_dim+1)
        #################################################
    
    eta = 0.05
    alpha = 3 / 4

    prev_loss = 0.0

    data = [(i,j,n) for i,j,n in zip(cooc.row,cooc.col, cooc.data)]

    for ix, jy, n in data:
            w = min( 1., (n/nmax)**alpha )
            x,y = xs[ix], ys[jy]
            increase_mul = 2*eta*w * ( log(n) - np.dot(x, y) )

            x_upd = xs[ix] + increase_mul*y
            y_upd = ys[jy] + increase_mul*x

            prev_loss += w * ( log(n) - np.dot(x_upd, y_upd) )**2
    
    for epoch in range(embedding_epochs):
        loss = 0.0
        random.shuffle(data)

        if verbose == 1:
            print_progress_bar(0,len(data), prefix='Epoch {:2d}/{:2d}:'.format(epoch+1,embedding_epochs),suffix='- loss difference {:8.2f}'.format(loss-prev_loss))
        counter,missed_updates=0,0
        for ix, jy, n in data:
            counter+=1
            w = min( 1., (n/nmax)**alpha )
            x,y = xs[ix], ys[jy]
            increase_mul = 2*eta*w * ( log(n) - np.dot(x, y) )

            x_upd = xs[ix] + increase_mul*y
            y_upd = ys[jy] + increase_mul*x

            loss_delta = w * ( log(n) - np.dot(x_upd, y_upd) )**2

            # Undo the current update
            if (np.isnan(x_upd).any() or np.isinf(x_upd).any() or
                    np.isnan(y_upd).any() or np.isinf(y_upd).any() or
                    np.isnan(loss+loss_delta) or np.isinf(loss+loss_delta)):
                missed_updates += 1
                loss += w * ( log(n) - np.dot(xs[ix], ys[jy]) )**2
                if (counter % 5000 == 0 or counter == len(data)) and verbose == 1:
                    print_progress_bar(counter,len(data), prefix='Epoch {:2d}/{:2d}:'.format(epoch+1,embedding_epochs),suffix='- loss difference {:8.2f}'.format(loss-prev_loss))
                continue

            xs[ix] = x_upd
            ys[jy] = y_upd

            # Reset bias
            xs[ix,embedding_dim] = 1
            ys[jy,embedding_dim-1] = 1

            loss += loss_delta
            if (counter % 50000 == 0 or counter == len(data)) and verbose == 1:
                print_progress_bar(counter,len(data), prefix='Epoch {:2d}/{:2d}:'.format(epoch+1,embedding_epochs),suffix='- loss difference {:8.2f}'.format(loss-prev_loss))

        ### BOLD DRIVER LEARNING RATE ###
        if prev_loss > loss or epoch==0:
            eta += 0.01*eta
        else:
            eta /= 2
        prev_loss = loss
        #################################

        if verbose > 0:
            print("Epoch {:2d} loss : {:10.2f}".format(epoch+1, loss))
            print('Missed {:4d} updates due to overflow prevention'.format(missed_updates))
            print('Current learning rate: {:1.3f}'.format(eta), end='\n\n', flush=True)

        if (epoch+1) % 10 == 0 and epoch+1 != embedding_epochs:
            X = xs[:,:embedding_dim]
            if embedding_norm:
                X = normalize_matrix(X)
            np.save(embeddings_dir+glove_embedding_file_suffix(epoch+1), X)
    
    # Note: the bias for xs is in position embedding_dim-1
    X = xs[:,:embedding_dim]
    if embedding_norm:
        X = normalize_matrix(X)
    np.save(embeddings_dir+glove_embedding_file_suffix(embedding_epochs), X)

    if verbose > 0:
        print_header_str('DONE')
        print()
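For reference, the quantity the SGD loop above minimizes per cooccurrence pair is the weighted GloVe loss; this small helper simply restates the expressions already used in the code:

import numpy as np

def glove_pair_loss(x, y, n, nmax=100, alpha=0.75):
    # Weighted least-squares term for one cooccurrence count n:
    # w * (log(n) - x . y)**2 with w = min(1, (n / nmax)**alpha).
    w = min(1.0, (n / nmax) ** alpha)
    return w * (np.log(n) - np.dot(x, y)) ** 2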
Example No. 19
def auto_PNR(prex=None,
             graph_name=None,
             emb_method_name1=None,
             emb_method_name2=None):

    print('----------------------------------------------------------')
    time_start = time.time()
    # Initialize the training and test set paths
    # prex = 'preprocessing_code2//'  # change here
    all_file_dir = 'D:\hybridrec\dataset\split_train_test//' + prex

    binNum = 50  # change here

    emb_method_name1 = emb_method_name1.lower()  # change here
    emb_method_name2 = emb_method_name2.lower()  # change here
    print("dataset: " + graph_name + '\n' + "baselines:" + emb_method_name1 +
          "," + emb_method_name2)
    conf_method1 = None
    conf_method2 = None
    if emb_method_name1 in all_embedding_methods:
        config_path_method1 = 'conf/' + emb_method_name1 + '.properties'
        config_method1 = configparser.ConfigParser()
        config_method1.read(config_path_method1)
        conf_method1 = dict(config_method1.items("hyperparameters"))
    if emb_method_name2 in all_embedding_methods:
        config_path_method2 = 'conf/' + emb_method_name2 + '.properties'
        config_method2 = configparser.ConfigParser()
        config_method2.read(config_path_method2)
        conf_method2 = dict(config_method2.items("hyperparameters"))

    # Initialize the embedding and score paths
    results_dir = 'D:\hybridrec/results//' + prex
    graph_results_dir = results_dir + graph_name + '//'

    # Run emb method 1
    if not ((emb_method_name1 == 'arope') or
            (emb_method_name1 == 'graph2gauss') or
            (is_heuristic_method(emb_method_name1) == True)):
        graph_train_path = get_trainset_path(
            base_dir=all_file_dir,
            graph_name=graph_name,
            connected_pattern=get_connp(emb_method_name1),
            from_zeros_one=get_from_zeros_one(emb_method_name1))
        graph_results_path = graph_results_dir + graph_name + '_' + emb_method_name1 + '.emb'
        if not os.path.isfile(graph_results_path):
            run_emb_method(input=graph_train_path,
                           output=graph_results_path,
                           emb_method_name=emb_method_name1)

    # Run emb method 2
    if not ((emb_method_name2 == 'arope') or
            (emb_method_name2 == 'graph2gauss') or
            (is_heuristic_method(emb_method_name2) == True)):
        graph_train_path = get_trainset_path(
            base_dir=all_file_dir,
            graph_name=graph_name,
            connected_pattern=get_connp(emb_method_name2),
            from_zeros_one=get_from_zeros_one(emb_method_name2))
        graph_results_path = graph_results_dir + graph_name + '_' + emb_method_name2 + '.emb'
        if not os.path.isfile(graph_results_path):
            run_emb_method(input=graph_train_path,
                           output=graph_results_path,
                           emb_method_name=emb_method_name2)

    # Compute scores1
    if conf_method1 is not None:
        embedding_size_method1 = int(conf_method1['embedding_size'])
    if emb_method_name1 == 'splitter':
        scores_matrix_one = inner_product_scores_splitter(
            graph_results_dir=graph_results_dir,
            dataset_name=graph_name,
            emb_method_name=emb_method_name1,
            col_start=0,
            col_end=embedding_size_method1 + 1,
            skiprows=1,
            delimiter=',')
    elif (emb_method_name1 == 'attentionwalk') or (emb_method_name1
                                                   == 'grarep'):
        scores_matrix_one = inner_product_scores(
            graph_results_dir=graph_results_dir,
            dataset_name=graph_name,
            emb_method_name=emb_method_name1,
            col_start=0,
            col_end=embedding_size_method1 + 1,
            skiprows=1,
            delimiter=',')
    elif (emb_method_name1 == 'drne') or (emb_method_name1 == 'prune'):
        scores_matrix_one = inner_product_scores(
            graph_results_dir=graph_results_dir,
            dataset_name=graph_name,
            emb_method_name=emb_method_name1,
            col_start=0,
            col_end=embedding_size_method1,
            skiprows=0,
            delimiter=' ')  # some embedding_size_method values need +1, others do not
    elif (emb_method_name1 == 'arope'):
        scores_matrix_one = inner_product_scores_arope(
            all_file_dir=all_file_dir,
            graph_name=graph_name,
            graph_results_dir=graph_results_dir)
    elif (emb_method_name1 == 'graph2gauss'):
        scores_matrix_one = energy_kl_scores_graph2gauss(
            all_file_dir=all_file_dir,
            graph_name=graph_name,
            graph_results_dir=graph_results_dir)
    elif is_heuristic_method(emb_method_name1):
        scores_matrix_one = heuristic_scores(
            all_file_dir=all_file_dir,
            graph_name=graph_name,
            graph_results_dir=graph_results_dir,
            heuristic_method=emb_method_name1)
    else:
        scores_matrix_one = inner_product_scores(
            graph_results_dir=graph_results_dir,
            dataset_name=graph_name,
            emb_method_name=emb_method_name1,
            col_start=0,
            col_end=embedding_size_method1 + 1,
            skiprows=1,
            delimiter=' ')

    # Compute scores2
    if conf_method2 is not None:
        embedding_size_method2 = int(conf_method2['embedding_size'])
    if emb_method_name2 == 'splitter':
        scores_matrix_two = inner_product_scores_splitter(
            graph_results_dir=graph_results_dir,
            dataset_name=graph_name,
            emb_method_name=emb_method_name2,
            col_start=0,
            col_end=embedding_size_method2 + 1,
            skiprows=1,
            delimiter=',')
    elif (emb_method_name2 == 'attentionwalk') or (emb_method_name2
                                                   == 'grarep'):
        scores_matrix_two = inner_product_scores(
            graph_results_dir=graph_results_dir,
            dataset_name=graph_name,
            emb_method_name=emb_method_name2,
            col_start=0,
            col_end=embedding_size_method2 + 1,
            skiprows=1,
            delimiter=',')
    elif (emb_method_name2 == 'drne') or (emb_method_name2 == 'prune'):
        scores_matrix_two = inner_product_scores(
            graph_results_dir=graph_results_dir,
            dataset_name=graph_name,
            emb_method_name=emb_method_name2,
            col_start=0,
            col_end=embedding_size_method2,
            skiprows=0,
            delimiter=' ')
    elif (emb_method_name2 == 'arope'):
        scores_matrix_two = inner_product_scores_arope(
            all_file_dir=all_file_dir,
            graph_name=graph_name,
            graph_results_dir=graph_results_dir)
    elif (emb_method_name2 == 'graph2gauss'):
        scores_matrix_two = energy_kl_scores_graph2gauss(
            all_file_dir=all_file_dir,
            graph_name=graph_name,
            graph_results_dir=graph_results_dir)
    elif is_heuristic_method(emb_method_name2):
        scores_matrix_two = heuristic_scores(
            all_file_dir=all_file_dir,
            graph_name=graph_name,
            graph_results_dir=graph_results_dir,
            heuristic_method=emb_method_name2)
    else:
        scores_matrix_two = inner_product_scores(
            graph_results_dir=graph_results_dir,
            dataset_name=graph_name,
            emb_method_name=emb_method_name2,
            col_start=0,
            col_end=embedding_size_method2 + 1,
            skiprows=1,
            delimiter=' ')

    # Take the upper triangle of the scores (note: 1. the steps above must ensure all scores are in the upper triangle or fill the whole matrix; 2. some of the above are upper-triangular, others fill the whole matrix)
    # scores_matrix_one_full = scores_matrix_one.A
    # scores_matrix_two_full = scores_matrix_two.A
    # plot_matrix(matrix = scores_matrix_one_full)
    # plot_matrix(matrix = scores_matrix_two_full)
    scores_matrix_one = sp.csr_matrix(np.triu(scores_matrix_one.A,
                                              k=1))  # k=1 excludes the diagonal
    scores_matrix_two = sp.csr_matrix(np.triu(scores_matrix_two.A, k=1))

    # Load the binary training data
    graph_train_path = get_trainset_path(base_dir=all_file_dir,
                                         graph_name=graph_name,
                                         connected_pattern='undirected',
                                         from_zeros_one='0')
    G = read_graph(weighted=0, input=graph_train_path, directed=0)
    train_binary = sp.csr_matrix(nx.convert_matrix.to_scipy_sparse_matrix(G))
    train_binary = sp.csr_matrix(np.triu(train_binary.A, k=1))
    # train_binary_full = train_binary.A
    # or train_binary = sp.csr_matrix(np.array(nx.to_numpy_matrix(G)))

    # Build the exist and nonexist binaries
    exist_binary = sp.csr_matrix(np.triu(train_binary.A, k=1))  # k=1 excludes the diagonal
    nonexist_binary = sp.csr_matrix(
        np.triu(np.ones(exist_binary.shape), k=1) - exist_binary.A)

    # Normalize scores to [0.0, 1.0]
    scores_matrix_one_norm = normalize_matrix(csr_matrix1=scores_matrix_one)
    scores_matrix_two_norm = normalize_matrix(csr_matrix1=scores_matrix_two)
    # plot_matrix(scores_matrix_one_norm.A)
    # plot_matrix(scores_matrix_two_norm.A)

    del scores_matrix_one, scores_matrix_two
    gc.collect()

    # Divide into bins
    val_max = 1.0
    val_min = 0.0
    # bin_array = sorted(divide_bin(val_max = val_max, val_min = val_min, binNum = binNum))
    interval = float((val_max - val_min) / binNum)

    # Get the scores for exist_binary and nonexist_binary
    exist_scores_one_list = (np.array(scores_matrix_one_norm[exist_binary > 0],
                                      dtype=float))[0]
    nonexist_scores_one_list = (np.array(
        scores_matrix_one_norm[nonexist_binary > 0], dtype=float))[0]
    exist_scores_two_list = (np.array(scores_matrix_two_norm[exist_binary > 0],
                                      dtype=float))[0]
    nonexist_scores_two_list = (np.array(
        scores_matrix_two_norm[nonexist_binary > 0], dtype=float))[0]
    # # Convert to sparse matrices
    # exist_scores_one_list_csr = sp.csr_matrix(exist_scores_one_list)
    # nonexist_scores_one_list_csr = sp.csr_matrix(nonexist_scores_one_list)
    # exist_scores_two_list_csr = sp.csr_matrix(exist_scores_two_list)
    # nonexist_scores_two_list_csr = sp.csr_matrix(nonexist_scores_two_list)

    # temp = scores_matrix_one_norm[exist_binary > 0][0]  # in case converting the scores to a list goes wrong

    # Initialize two binNum x binNum 2D raster grids
    exist_raster_grids = np.zeros((binNum, binNum))
    nonexist_raster_grids = np.zeros((binNum, binNum))

    # Count the existing links falling into each exist_raster_grids cell
    exist_links_num = len(exist_scores_one_list)
    exist_row_col_zero_num = 0  # entries whose scores are 0 in both matrices are not counted
    for i in range(exist_links_num):
        # row_index and col_index range from 0 to binNum-1
        if (exist_scores_one_list[i] == 0.0) & (exist_scores_two_list[i]
                                                == 0.0):
            exist_row_col_zero_num = exist_row_col_zero_num + 1
            continue
        row_index = int(
            get_row_col_index(score=exist_scores_one_list[i],
                              interval=interval,
                              binNum=binNum))
        col_index = int(
            get_row_col_index(score=exist_scores_two_list[i],
                              interval=interval,
                              binNum=binNum))
        exist_raster_grids[row_index,
                           col_index] = exist_raster_grids[row_index,
                                                           col_index] + 1

    print("exist_row_col_zero_num:" + str(exist_row_col_zero_num))
    print('sum  exist_raster_grids:' + str(np.sum(exist_raster_grids)))

    # Count the non-existing links falling into each nonexist_raster_grids cell
    nonexist_links_num = len(nonexist_scores_one_list)
    nonexist_row_col_zero_num = 0  # entries whose scores are 0 in both matrices are not counted
    for i in range(nonexist_links_num):
        # row_index and col_index range from 0 to binNum-1
        if (nonexist_scores_one_list[i] <= 0.0) & (nonexist_scores_two_list[i]
                                                   <= 0.0):
            nonexist_row_col_zero_num = nonexist_row_col_zero_num + 1
            continue
        row_index = int(
            get_row_col_index(score=nonexist_scores_one_list[i],
                              interval=interval,
                              binNum=binNum))
        col_index = int(
            get_row_col_index(score=nonexist_scores_two_list[i],
                              interval=interval,
                              binNum=binNum))

        nonexist_raster_grids[row_index,
                              col_index] = nonexist_raster_grids[row_index,
                                                                 col_index] + 1

    print("nonexist_row_col_zero_num:" + str(nonexist_row_col_zero_num))
    print('sum  nonexist_raster_grids:' + str(np.sum(nonexist_raster_grids)))

    # Compute the PNR scores
    N = train_binary.shape[0]
    print("Graph size:" + str(N) + '\n')
    L_T = np.sum(train_binary.A)
    O = N * (N - 1) / 2
    coefficient = (O - L_T) / L_T
    PNR1 = coefficient * (exist_raster_grids / (nonexist_raster_grids + 1)
                          )  # add 1 to the denominator to avoid inf/nan; does not affect evaluation but looks nicer
    PNR2 = (exist_raster_grids / nonexist_raster_grids)  # inf and nan are set to 0 below
    PNR2[np.isnan(PNR2)] = 0
    PNR2[np.isinf(PNR2)] = 0
    PNR2 = coefficient * PNR2

    # Plotting (note: the axes start from the top-left corner rather than the bottom-left you might expect)
    # sns.heatmap(PNR1, cmap='Reds')
    # plt.savefig(graph_results_dir + emb_method_name1 +'_'+ emb_method_name2 + '_' +'bin_' + str(binNum) + "_PNR1.jpg")
    # plt.show()
    # sns.heatmap(PNR2, cmap='Reds')
    # plt.savefig(graph_results_dir + emb_method_name1 +'_'+ emb_method_name2 + '_'+ 'bin_' + str(binNum) + "_PNR2.jpg")
    # plt.show()
    # plt.matshow(PNR1) # 好丑
    # plt.show()

    # Save exist_raster_grids, nonexist_raster_grids, PNR1, and PNR2
    save_ndarray_to_mat(exist_raster_grids, 'exist_raster_grids',
                        graph_results_dir, graph_name, emb_method_name1,
                        emb_method_name2, binNum)
    save_ndarray_to_mat(nonexist_raster_grids, 'nonexist_raster_grids',
                        graph_results_dir, graph_name, emb_method_name1,
                        emb_method_name2, binNum)
    save_ndarray_to_mat(PNR1, 'PNR1', graph_results_dir, graph_name,
                        emb_method_name1, emb_method_name2, binNum)
    save_ndarray_to_mat(PNR2, 'PNR2', graph_results_dir, graph_name,
                        emb_method_name1, emb_method_name2, binNum)

    # PNR-adjusted scores (only the non-existing links are adjusted)
    nonexist_scores_PNR_list = transfer_scores_PNR(
        scores_matrix_one_norm=scores_matrix_one_norm,
        scores_matrix_two_norm=scores_matrix_two_norm,
        train_binary=train_binary,
        PNR=PNR2,
        interval=interval,
        binNum=binNum)
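    # Reading of the step above (the actual transfer_scores_PNR may differ in detail):
    # each non-existing pair is mapped to a grid cell via its two normalized scores,
    # and the PNR2 value of that cell becomes its adjusted score, so pairs landing in
    # cells dominated by existing links are ranked first.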

    # Scores of the weighted hybrid method: equal weights of 0.5, added directly
    scores_matrix_hybrid_norm = 0.5 * scores_matrix_one_norm + 0.5 * scores_matrix_two_norm
    nonexist_scores_hybrid_list = (np.array(
        scores_matrix_hybrid_norm[nonexist_binary > 0], dtype=float))[0]
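    # A more general convex combination would use a tunable weight w in [0, 1]
    # (illustrative only, not part of the original experiments):
    #   scores_matrix_hybrid_norm = w * scores_matrix_one_norm + (1 - w) * scores_matrix_two_norm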

    # Evaluation
    graph_test_path = get_testset_path(base_dir=all_file_dir,
                                       graph_name=graph_name)
    test_binary = get_test_matrix_binary(graph_test_path=graph_test_path, N=N)
    L_full = int(np.sum(test_binary))
    L_array = np.array([
        int(L_full / 20),
        int(L_full / 10),
        int(L_full / 5),
        int(L_full / 2), L_full
    ])
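    # L_array holds top-L cutoffs at 5%, 10%, 20%, 50% and 100% of the number of test links.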

    del scores_matrix_one_norm, scores_matrix_two_norm, exist_scores_one_list, exist_scores_two_list, scores_matrix_hybrid_norm
    gc.collect()


    AP_PNR, AUC_PNR, Precision_PNR, Recall_PNR, F1score_PNR=\
        evaluators(train_binary=train_binary,
                   test_binary=test_binary,
                   scores_list=nonexist_scores_PNR_list,
                   L_array=L_array)
    AP_method1, AUC_method1, Precision_method1, Recall_method1, F1score_method1=\
        evaluators(train_binary=train_binary,
                   test_binary=test_binary,
                   scores_list=nonexist_scores_one_list,
                   L_array=L_array)
    AP_method2, AUC_method2, Precision_method2, Recall_method2, F1score_method2=\
        evaluators(train_binary=train_binary,
                   test_binary=test_binary,
                   scores_list=nonexist_scores_two_list,
                   L_array=L_array)
    AP_weighted, AUC_weighted, Precision_weighted, Recall_weighted, F1score_weighted=\
        evaluators(train_binary=train_binary,
                   test_binary=test_binary,
                   scores_list=nonexist_scores_hybrid_list,
                   L_array=L_array)

    print('AP_PNR:  ' + str(AP_PNR))
    print('AP_method1:  ' + str(AP_method1))
    print('AP_method2:  ' + str(AP_method2))
    print('AP_weighted:  ' + str(AP_weighted))
    print('\n')
    print('AUC_PNR:  ' + str(AUC_PNR))
    print('AUC_method1:  ' + str(AUC_method1))
    print('AUC_method2:  ' + str(AUC_method2))
    print('AUC_weighted:  ' + str(AUC_weighted))
    print('\n')
    print('Precision_PNR:  ' + str(Precision_PNR))
    print('Precision_method1:  ' + str(Precision_method1))
    print('Precision_method2:  ' + str(Precision_method2))
    print('Precision_weighted:  ' + str(Precision_weighted))
    print('\n')
    print('Recall_PNR:  ' + str(Recall_PNR))
    print('Recall_method1:  ' + str(Recall_method1))
    print('Recall_method2:  ' + str(Recall_method2))
    print('Recall_weighted:  ' + str(Recall_weighted))
    print('\n')
    print('F1score_PNR:  ' + str(F1score_PNR))
    print('F1score_method1:  ' + str(F1score_method1))
    print('F1score_method2:  ' + str(F1score_method2))
    print('F1score_weighted:  ' + str(F1score_weighted))
    print('\n')

    write_to_excel(graph_name, emb_method_name1, emb_method_name2,
                   Precision_PNR, Precision_method1, Precision_method2,
                   Precision_weighted, Recall_PNR, Recall_method1,
                   Recall_method2, Recall_weighted, F1score_PNR,
                   F1score_method1, F1score_method2, F1score_weighted, AP_PNR,
                   AP_method1, AP_method2, AP_weighted, AUC_PNR, AUC_method1,
                   AUC_method2, AUC_weighted)

    time_end = time.time()
    print("time span:  " + str((time_end - time_start) / 60.00) + "  mins")
    # facebook_combined: bin=5 took 1.5 minutes
    # facebook_combined: cn and pearson / aa and cn took 3.5 minutes
    # facebook_combined: graphdistance and cn took 11 minutes
    # facebook_combined: the PNR matrix of graphdistance and cn is all zeros
    # facebook_combined: attentionwalk and prone took 7.5 minutes
    # facebook_combined: all combinations involving rootedpagerank perform very poorly;
    # arope is slightly better than PNR; SDNE and PRUE are very poor; drne and graph2gauss are also extremely poor on their own but perform very well after PNR fusion;

    # blogcatalog: aa and ja took 3 hours
    # (path-based methods such as katz and graphdistance are very slow; neighbor-based and rank-based methods are fast)

    # google 15000 nodes: 2.5 hours
    print(
        '--------------------------------------------------------------------------------'
    )
Example No. 20
def auto_DNN(prex=None,
             graph_name=None,
             emb_method_name1=None,
             emb_method_name2=None,
             model_name=None,
             DNN_binNum=None):
    print('----------------------------------------------------------')
    print("dataset: " + graph_name + '\n' + "baselines:" + emb_method_name1 +
          "," + emb_method_name2)

    results_base_dir = 'D:/hybridrec/results/'
    all_file_dir = 'D:/hybridrec/dataset/split_train_test/' + prex
    results_dir = 'D:/hybridrec/results/' + prex
    graph_results_dir = results_dir + graph_name + '//'
    # (Pattern on facebook_combined: the smaller the ratio, the higher the prediction
    # accuracy on positive and negative samples, and the less time it takes)
    ratio = 1  # the number of negative samples is ratio times the number of positive samples  # edit here

    path_scores_method1 = results_base_dir + prex + graph_name + "//" + graph_name + "_" + emb_method_name1 + "_scores.mat"
    path_scores_method2 = results_base_dir + prex + graph_name + "//" + graph_name + "_" + emb_method_name2 + "_scores.mat"

    # Initialize the model (edit here to switch classifiers)
    model = None  # set by one of the branches below

    # hidden_layer_sizes=(10, 20): two hidden layers with 10 and 20 neurons
    # (a tuple such as (10, 20, 10) would give three hidden layers)
    if model_name == "mlp":
        model = MLPClassifier(hidden_layer_sizes=(10, 20),
                              activation='relu',
                              solver='adam',
                              max_iter=200,
                              alpha=0.01,
                              batch_size=256,
                              learning_rate='constant',
                              learning_rate_init=0.001,
                              shuffle=False,
                              random_state=2020,
                              early_stopping=True,
                              validation_fraction=0.2,
                              beta_1=0.9,
                              beta_2=0.999,
                              epsilon=1e-08,
                              n_iter_no_change=10)

    if model_name == "svm":
        model = SVC(C=5, random_state=42)  # this configuration ran into problems

    if model_name == "lr":
        model = LogisticRegression(C=5,
                                   penalty='l1',
                                   solver='liblinear',  # 'l1' requires liblinear or saga
                                   tol=1e-6,
                                   random_state=42)  # penalty can be 'l1' or 'l2'

    if model_name == "lgbm":
        model = LGBMClassifier(num_leaves=31,
                               learning_rate=0.1,
                               n_estimators=64,
                               random_state=42,
                               n_jobs=-1)

    if model_name == "xgb":
        model = XGBClassifier(max_depth=5,
                              learning_rate=0.1,
                              n_jobs=-1,
                              nthread=-1,
                              gamma=0.06,
                              min_child_weight=5,
                              subsample=1,
                              colsample_bytree=0.9,
                              reg_alpha=0,
                              reg_lambda=0.5,
                              random_state=42)

    if model_name == "ld":
        model = LinearDiscriminantAnalysis(solver='lsqr')

    if model_name == "rf":
        model = RandomForestClassifier(n_estimators=50,
                                       max_depth=20,
                                       min_samples_split=2,
                                       min_samples_leaf=5,
                                       max_features="log2",
                                       random_state=12)
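
    # Guard (not in the original code): fail fast with a clear error if model_name
    # did not match any of the branches above.
    if model is None:
        raise ValueError('Unknown model_name: {}'.format(model_name))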

    if not (os.path.exists(path_scores_method1)
            and os.path.exists(path_scores_method2)):
        print("dataset: " + graph_name + '----' + "baselines:" +
              emb_method_name1 + "," + emb_method_name2 +
              ': scores have not been fully computed')

    if os.path.exists(path_scores_method1) and os.path.exists(
            path_scores_method2):
        # Load and normalize the baseline scores
        scores_matrix_one_dict = (loadmat(path_scores_method1))
        scores_matrix_two_dict = (loadmat(path_scores_method2))
        scores_matrix_one = scores_matrix_one_dict['scores']
        scores_matrix_two = scores_matrix_two_dict['scores']
        if emb_method_name1 not in all_embedding_methods:
            scores_matrix_one = csr_matrix(np.triu(scores_matrix_one.A,
                                                   k=1))  # k=1 excludes the diagonal
        if emb_method_name2 not in all_embedding_methods:
            scores_matrix_two = csr_matrix(np.triu(scores_matrix_two.A, k=1))
        scores_matrix_one_norm = normalize_matrix(
            csr_matrix1=csr_matrix(scores_matrix_one))
        scores_matrix_two_norm = normalize_matrix(
            csr_matrix1=csr_matrix(scores_matrix_two))

        # Build train_binary and test_binary
        graph_train_path = get_trainset_path(base_dir=all_file_dir,
                                             graph_name=graph_name,
                                             connected_pattern='undirected',
                                             from_zeros_one='0')
        graph_test_path = get_testset_path(base_dir=all_file_dir,
                                           graph_name=graph_name)
        G = read_graph(weighted=0, input=graph_train_path, directed=0)
        train_binary = csr_matrix(nx.convert_matrix.to_scipy_sparse_matrix(G))
        train_binary = csr_matrix(np.triu(train_binary.A, k=1))
        test_binary = get_test_matrix_binary(graph_test_path=graph_test_path,
                                             N=train_binary.shape[0])

        del scores_matrix_one, scores_matrix_two
        gc.collect()

        # Get the scores of the positive samples (existing links)
        exist_binary = csr_matrix(np.triu(train_binary.A, k=1))  # k=1 excludes the diagonal
        exist_scores_one_list = (np.array(
            scores_matrix_one_norm[exist_binary > 0], dtype=float))[0]
        exist_scores_two_list = (np.array(
            scores_matrix_two_norm[exist_binary > 0], dtype=float))[0]

        # Build the training samples (positive + negative)
        X_train_1 = (np.array([exist_scores_one_list,
                               exist_scores_two_list])).T
        X_train_0 = negative_samples(
            train_binary=train_binary,
            test_binary=test_binary,
            scores_matrix_one_norm=scores_matrix_one_norm,
            scores_matrix_two_norm=scores_matrix_two_norm,
            ratio=ratio)
        Y_train_1 = np.ones(X_train_1.shape[0], dtype=int)
        Y_train_0 = np.zeros(X_train_0.shape[0], dtype=int)
        X_train = np.vstack((np.array(X_train_1), np.array(X_train_0)))
        Y_train = (np.hstack((np.array(Y_train_1), np.array(Y_train_0)))).T
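        # X_train has shape (n_pos + n_neg, 2): one column per baseline score.
        # Y_train holds the labels (1 = existing link, 0 = sampled non-existing link).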

        time_start = time.time()

        # Train the model
        model.fit(X_train, Y_train)

        # Sanity check: predictions on the training samples
        preds_0 = model.predict(X_train_0)
        preds_1 = model.predict(X_train_1)
        print(np.sum(preds_0))
        print(np.sum(preds_1))
        preds_0_proba = model.predict_proba(X_train_0)
        preds_1_proba = model.predict_proba(X_train_1)
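        # predict_proba returns one column per class; column 1 is the predicted
        # probability that a pair is a link (presumably the score used by
        # predicted_scores_DNN below).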

        # Score all candidate links with the trained model
        scores_matrix_DNN = predicted_scores_DNN(
            model=model,
            train_binary=train_binary,
            test_binary=test_binary,
            scores_matrix_one_norm=scores_matrix_one_norm,
            scores_matrix_two_norm=scores_matrix_two_norm)
        save_DNN_hybrid_scores(scores_matrix_DNN=scores_matrix_DNN,
                               method1=emb_method_name1,
                               method2=emb_method_name2,
                               graph_results_dir=graph_results_dir,
                               dataset_name=graph_name,
                               model_name=model_name)
        scores_matrix_DNN_norm = normalize_matrix(
            csr_matrix1=scores_matrix_DNN)

        # Compute the rasterization grids for the DNN scores
        DNN_raster_grids = rasterization_grids(
            binNum=DNN_binNum,
            train_binary=train_binary,
            scores_matrix_DNN=scores_matrix_DNN_norm,
            scores_matrix_one_norm=scores_matrix_one_norm,
            scores_matrix_two_norm=scores_matrix_two_norm)
        # DNN_raster_grids = np.log10(DNN_raster_grids)  # produces -inf and raises an error
        DNN_raster_grids = normalize_matrix_full(
            csr_matrix1=csr_matrix(DNN_raster_grids))
        DNN_raster_grids = better_show_grids(csr_matrix1=DNN_raster_grids)
        save_DNN_raster_scores(rastser_grids=DNN_raster_grids,
                               method1=emb_method_name1,
                               method2=emb_method_name2,
                               graph_results_dir=graph_results_dir,
                               dataset_name=graph_name,
                               model_name=model_name,
                               DNN_binNum=DNN_binNum)
        source = np.float32(DNN_raster_grids.A)
        result = cv2.GaussianBlur(source, (5, 5), 0)
        title = graph_name + '-' + model_name + '-' + emb_method_name1 + '-' + emb_method_name2
        plot_contourf(result=result, title=title, binNum=10)
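        # The 5x5 Gaussian blur only smooths the grid for the contour plot; the
        # grids saved above by save_DNN_raster_scores are left unsmoothed.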

        # Load the PNR grids
        PNR_path = results_base_dir + prex + graph_name + "//" + "PNR1_" + graph_name + "_" + emb_method_name1 + "_" + emb_method_name2 + "_50_count.mat"
        if is_excel_file_exist(PNR_path):
            PNR_dict = (loadmat(PNR_path))
            PNR_matrix = PNR_dict["count"]
            PNR_matrix = better_show_grids(csr_matrix1=PNR_matrix)
            source = np.float32(PNR_matrix.A)
            result = cv2.GaussianBlur(source, (5, 5),
                                      0)  # 5x5 Gaussian kernel; sigma 0 lets OpenCV derive it from the kernel size
            title = graph_name + '-PNR-' + emb_method_name1 + '-' + emb_method_name2
            plot_contourf(result=result, title=title, binNum=10)

        # Evaluate the DNN hybrid scores
        exist_binary = csr_matrix(np.triu(train_binary.A, k=1))  # k=1 excludes the diagonal
        nonexist_binary = csr_matrix(
            np.triu(np.ones(exist_binary.shape), k=1) - exist_binary.A)
        nonexist_scores_DNN_list = (np.array(
            scores_matrix_DNN[nonexist_binary > 0], dtype=float))[0]
        L_full = int(np.sum(test_binary))
        L_array = np.array([
            int(L_full / 20),
            int(L_full / 10),
            int(L_full / 5),
            int(L_full / 2), L_full
        ])
        AP_DNN, AUC_DNN, Precision_DNN, Recall_DNN, F1score_DNN = \
            evaluators(train_binary=train_binary,
                       test_binary=test_binary,
                       scores_list=nonexist_scores_DNN_list,
                       L_array=L_array)
        # print('AP_DNN:  ' + str(AP_DNN))
        # print('\n')
        # print('AUC_DNN:  ' + str(AUC_DNN))
        # print('\n')
        # print('Precision_DNN:  ' + str(Precision_DNN))
        # print('\n')
        # print('Recall_DNN:  ' + str(Recall_DNN))
        # print('\n')
        # print('F1score_DNN:  ' + str(F1score_DNN))
        # print('\n')

        # Write precision, recall, F1-score and AP to the Excel file
        DNN_write_to_excel(DL_name=model_name,
                           dataset_name=graph_name,
                           method1=emb_method_name1,
                           method2=emb_method_name2,
                           precision_DL=Precision_DNN,
                           recall_DL=Recall_DNN,
                           F1score_DL=F1score_DNN,
                           AP_DL=AP_DNN)

        time_end = time.time()
        print("It takes : " + str((time_end - time_start) / 60.0) + "  mins.")
Example No. 21
def stanford_glove():
    """Computes GloVe embeddings with the Stanford GloVe toolkit, retrieving the corpus from positive and negative tweet files.
    # Configs
        :dataset_version        - choose preprocessing
        :emb_dataset            - choose full or small dataset
        :embedding_dim          - size of embeddings
        :emb_context_window     - context window size
        :emb_word_min_count     - minimum word count for a word to appear in vocab
    """
    if verbose > 0:
        print_header_str('STANFORD GLOVE')

    if (reuse_computed and
            os.path.isfile(embeddings_dir + selected_embeddings_file + '.npy')
            and os.path.isfile(vocab_dir + vocab_file + '.pkl')):
        if verbose > 0:
            print('Reusing GloVe vocab:', vocab_file)
            print('Reusing GloVe embeddings:', selected_embeddings_file)
            print_header_str('DONE')
            print()
        return

    dataset = []

    stanford_root_dir = embeddings_dir + '../StanfordGloVe/'

    with open(stanford_root_dir + 'run.sh', 'w') as frun:
        frun.write(f"""\
#!/bin/bash
set -e

pushd {stanford_root_dir}
make
popd

# Makes programs, downloads sample data, trains a GloVe model, and then evaluates it.
# One optional argument can specify the language used for eval script: matlab, octave or [default] python

CORPUS="{tweet_dir+emb_train_tweets_pos} {tweet_dir+emb_train_tweets_neg} {tweet_dir+emb_test_tweets}"
VOCAB_FILE={stanford_root_dir+vocab_file}_cnt.txt
COOCCURRENCE_FILE={stanford_root_dir}cooccurrence.bin
COOCCURRENCE_SHUF_FILE={stanford_root_dir}cooccurrence.shuf.bin
BUILDDIR={stanford_root_dir}build
SAVE_FILE={stanford_root_dir+selected_embeddings_file}_tmp
VERBOSE=2
MEMORY=8.0
VOCAB_MIN_COUNT={emb_word_min_count}
VECTOR_SIZE={embedding_dim}
MAX_ITER={embedding_epochs}
WINDOW_SIZE={emb_context_window}
BINARY=2
NUM_THREADS=6
X_MAX=100

echo
echo "$ cat CORPUS | BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE > VOCAB_FILE"
cat $CORPUS | $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE > $VOCAB_FILE

echo "$ cat CORPUS | BUILDDIR/cooccur -memory $MEMORY -vocab-file VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE > COOCCURRENCE_FILE"
cat $CORPUS | $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE > $COOCCURRENCE_FILE

echo "$ BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < COOCCURRENCE_FILE > COOCCURRENCE_SHUF_FILE"
$BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE

echo "$ BUILDDIR/glove -save-file SAVE_FILE -threads $NUM_THREADS -input-file COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file VOCAB_FILE -verbose $VERBOSE"
$BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE

rm $COOCCURRENCE_FILE $COOCCURRENCE_SHUF_FILE
        """)

    stanford_glove_cmd = 'chmod +x ' + stanford_root_dir + 'run.sh && '
    stanford_glove_cmd += stanford_root_dir + 'run.sh'
    run_script(stanford_glove_cmd)
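
    # The run.sh executed above follows the standard Stanford GloVe pipeline:
    #   1. vocab_count - build the vocabulary (words with count >= VOCAB_MIN_COUNT)
    #   2. cooccur     - accumulate co-occurrence counts within WINDOW_SIZE
    #   3. shuffle     - shuffle the co-occurrence records before training
    #   4. glove       - train VECTOR_SIZE-dimensional vectors for MAX_ITER iterations
    # BINARY=2 writes both binary and text output; the *_tmp.txt text file is parsed below.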

    with open(stanford_root_dir + selected_embeddings_file + '_tmp.txt',
              'r') as f:
        vocab_size = sum(1 for _ in f)
    vocab = {}
    embeddings = np.zeros((vocab_size, embedding_dim), dtype='float32')

    with open(stanford_root_dir + selected_embeddings_file + '_tmp.txt',
              'r') as f:
        for i, l in enumerate(f):
            ll = l.strip().split(' ')
            word, emb = ll[0].strip(), [float(x.strip()) for x in ll[1:]]

            vocab[word] = i
            embeddings[i] = np.array(emb)

    if embedding_norm:
        embeddings = normalize_matrix(embeddings)

    np.save(embeddings_dir + selected_embeddings_file, embeddings)

    with open(vocab_dir + vocab_file + '.pkl', 'wb') as f:
        pickle.dump(vocab, f, pickle.HIGHEST_PROTOCOL)

    cleanup_cmd = f'rm {stanford_root_dir+vocab_file}_cnt.txt {stanford_root_dir+selected_embeddings_file}_tmp.txt ; rm -rf {stanford_root_dir}build'
    run_script(cleanup_cmd)

    if verbose > 0:
        print('Vocabulary size:', len(vocab))
        print_header_str('DONE')
    print()
Example No. 22
    def evaluation_on_BLI(self, verbose=0):
        """ Run the evaluation on the given test translation dictionary.
        Args:
            verbose: Set to 1 to print the top-3 predictions for the first few test words.

        Returns:
            None; P@1, P@5, P@10 and MRR are printed.
        """
        ranking = []
        iteration = 0
        norm_proj_src_emb = normalize_matrix(
            self.CrossLingualModel.proj_embedding_source_target)
        for test_src_word, test_trg_word in zip(self.test_translation_source,
                                                self.test_translation_target):

            source_index = self.CrossLingualModel.src_word2ind.get(
                test_src_word, -1)
            target_index = self.CrossLingualModel.trg_word2ind.get(
                test_trg_word, -1)
            if source_index == -1 or target_index == -1:
                continue

            # Calculate Cos Similarity
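            # (assuming normalize_matrix L2-normalizes rows, the dot product below is the cosine similarity)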
            norm_proj_src_word_emb = norm_proj_src_emb[[source_index]]
            similarity_cos = np.dot(
                norm_proj_src_word_emb,
                np.transpose(self.CrossLingualModel.norm_trg_embedding_matrix))
            # Find Closest Neighbors
            most_similar_trg_index = np.argsort(-similarity_cos[[0]])
            find_rank = np.where(
                most_similar_trg_index == target_index)[1][0] + 1
            ranking.append(find_rank)

            if iteration <= 5 and verbose:
                print("\nTest translation: {} -> {}".format(
                    test_src_word,
                    self.CrossLingualModel.trg_ind2word[target_index]))
                print("Predicted Top 3 Translations: {}, {}, {}".format(
                    self.CrossLingualModel.trg_ind2word[most_similar_trg_index[
                        0, 0]], self.CrossLingualModel.trg_ind2word[
                            most_similar_trg_index[0, 1]],
                    self.CrossLingualModel.trg_ind2word[most_similar_trg_index[
                        0, 2]]))
            iteration += 1

        if len(ranking) == 0:
            print("NO MATCHING FOUND!")
        else:
            print("\n\nNumber of Test Translations: {}/{}".format(
                len(ranking), len(self.test_translation_source)))
            p1 = len([p for p in ranking if p <= 1]) / len(ranking)
            p5 = len([p for p in ranking if p <= 5]) / len(ranking)
            p10 = len([p for p in ranking if p <= 10]) / len(ranking)
            print("P@1: {}".format(p1))
            print("P@5: {}".format(p5))
            print("P@10: {}".format(p10))

            mrr = sum([1.0 / p for p in ranking]) / len(ranking)
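            # e.g. ranks [1, 3, 10] give MRR = (1 + 1/3 + 1/10) / 3 ≈ 0.478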
            print("\n\nMRR: {}".format(mrr))
Example No. 23
	def normalize_matrix(self):
		utils.normalize_matrix(self.matrix)