예제 #1
0
파일: lda.py 프로젝트: ykpku/LDA-Reg-TKDE
 def __read_phi_alpha_theta_byname(self, name):
     """Load the stored alpha and phi arrays whose file names start with ``name``.

     Both files are read from ``self.doc_path`` and are named
     ``<name>alpha_<topic_num><suffix>.csv`` and
     ``<name>phi_<topic_num><suffix>.csv``. The suffix contains ``_PLSA``
     when ``self.PLSA`` is truthy, followed by ``_<corpus_percent>percent``
     when ``self.corpus_percent`` differs from 1.0.

     Despite the method name, theta is not read here.

     Args:
         name: Prefix used for both CSV file names.

     Returns:
         A tuple ``(alpha, phi)`` holding whatever
         ``CsvUtility.read_array_from_csv`` returns for each file.
     """
     suffix = "_PLSA" if self.PLSA else ""
     if self.corpus_percent != 1.0:
         suffix += "_" + str(self.corpus_percent) + "percent"
     # Everything after the 'alpha_' / 'phi_' marker is shared by both files.
     tail = str(self.topic_num) + suffix + '.csv'
     alpha = CsvUtility.read_array_from_csv(self.doc_path,
                                            name + 'alpha_' + tail)
     phi = CsvUtility.read_array_from_csv(self.doc_path,
                                          name + 'phi_' + tail)
     return alpha, phi
예제 #2
0
def get_some_instance(file_path=MIMICP.mimic_data_path, seq_num=MIMICP.seq_num, num=10):
    """Return the first ``num`` samples of a random shuffle of all three splits.

    The train, valid and test inputs are read as sparse arrays and reshaped to
    ``(rows, seq_num, -1)``; the matching targets are read from CSV. Inputs and
    targets are each concatenated along axis 0 (train, valid, test order) and
    reordered with one shared random permutation.

    Args:
        file_path: Directory passed to the ``CsvUtility`` readers.
        seq_num: Size of the second axis after reshaping the inputs.
        num: Number of shuffled samples to return.

    Returns:
        A tuple ``(x, y)`` with the first ``num`` shuffled inputs and targets.
    """
    x_files = ('sparse_formal_train_x_seq.npz',
               'sparse_formal_valid_x_seq.npz',
               'sparse_formal_test_x_seq.npz')
    y_files = ('formal_train_y_seq.csv',
               'formal_valid_y_seq.csv',
               'formal_test_y_seq.csv')

    x_parts = []
    for x_file in x_files:
        loaded = CsvUtility.read_sparse_array_from_csv(file_path, x_file)
        x_parts.append(loaded.reshape((loaded.shape[0], seq_num, -1)))
    y_parts = [CsvUtility.read_array_from_csv(file_path, y_file)
               for y_file in y_files]

    all_x = np.concatenate(x_parts, axis=0)
    all_y = np.concatenate(y_parts, axis=0)

    # One permutation for both arrays keeps inputs aligned with their targets.
    order = np.random.permutation(all_x.shape[0])
    shuffled_x = all_x[order]
    shuffled_y = all_y[order]
    return shuffled_x[:num], shuffled_y[:num]
예제 #3
0
def reload_mimic_seq(train_percent=MIMICP.train_percent, valid=False, file_path=MIMICP.mimic_data_path, seq_num=MIMICP.seq_num):
    """Load the stored MIMIC sequence splits and return a train/test pair.

    All three splits (train, valid, test) are always read. Inputs come from
    sparse ``.npz`` files and are reshaped to ``(rows, seq_num, -1)``; targets
    come from CSV files.

    Args:
        train_percent: When below 0.8, the training arrays are truncated to
            ``int((train_rows + test_rows) * train_percent)`` leading rows.
        valid: If truthy, the validation split is returned as the test data
            and the stored test split is discarded. Otherwise the validation
            split is appended to the training data.
        file_path: Directory passed to the ``CsvUtility`` readers.
        seq_num: Size of the second axis after reshaping the inputs.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y)``.
    """
    x_files = ('sparse_formal_train_x_seq.npz',
               'sparse_formal_valid_x_seq.npz',
               'sparse_formal_test_x_seq.npz')
    y_files = ('formal_train_y_seq.csv',
               'formal_valid_y_seq.csv',
               'formal_test_y_seq.csv')

    reshaped = []
    for x_file in x_files:
        loaded = CsvUtility.read_sparse_array_from_csv(file_path, x_file)
        reshaped.append(loaded.reshape((loaded.shape[0], seq_num, -1)))
    train_x, valid_x, test_x = reshaped
    train_y, valid_y, test_y = [
        CsvUtility.read_array_from_csv(file_path, y_file)
        for y_file in y_files
    ]

    if valid:
        # Evaluate on the validation split; the test split is not used.
        test_x, test_y = valid_x, valid_y
    else:
        # Fold the validation split into the training data.
        train_x = np.concatenate((train_x, valid_x), axis=0)
        train_y = np.concatenate((train_y, valid_y), axis=0)

    if train_percent < 0.8:
        # The cut-off is a share of train + test rows, not of train rows only.
        keep = int((train_x.shape[0] + test_x.shape[0]) * train_percent)
        train_x, train_y = train_x[:keep], train_y[:keep]
    return train_x, train_y, test_x, test_y
예제 #4
0
def reload_mimic_embedding(train_percent=MIMICP.train_percent, valid=False, file_path=MIMICP.mimic_data_path, seq_num=MIMICP.seq_num, embedding_type=EMBEDP.embedding_type, veclen=EMBEDP.veclen, window=EMBEDP.window):
    """Load embedded MIMIC sequence splits and return a train/test pair.

    The CSV files read are named ``formal_<split>_<x|y>_seq_<embedding>.csv``
    where ``<embedding>`` is ``<embedding_type><veclen>_window<window>``.
    For the two combined types, ``'sg_add_sgns'`` and ``'sg_cancat_sgns'``,
    the base files use the ``lda_sgns`` embedding name and a second set of
    input files using the ``embedding_skipgram`` name is read as well. The two
    inputs are then added element-wise (``'sg_add_sgns'``) or concatenated
    along axis 2 (``'sg_cancat_sgns'``).

    Args:
        train_percent: When below 0.8, the training arrays are truncated to
            ``int((train_rows + test_rows) * train_percent)`` leading rows.
        valid: If truthy, the validation split is returned as the test data
            and the stored test split is discarded. Otherwise the validation
            split is appended to the training data.
        file_path: Directory passed to ``CsvUtility.read_array_from_csv``.
        seq_num: Size of the second axis after reshaping the inputs.
        embedding_type: Embedding identifier used to build the file names.
        veclen: Embedding length, used only to build the file names.
        window: Window size, used only to build the file names.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y)``.
    """
    combined = embedding_type in ('sg_add_sgns', 'sg_cancat_sgns')
    size_tag = str(veclen) + '_window' + str(window)
    base_name = ('lda_sgns' if combined else embedding_type) + size_tag

    prefixes = ('formal_train_x_seq_', 'formal_train_y_seq_',
                'formal_valid_x_seq_', 'formal_valid_y_seq_',
                'formal_test_x_seq_', 'formal_test_y_seq_')
    train_x, train_y, valid_x, valid_y, test_x, test_y = [
        CsvUtility.read_array_from_csv(file_path, prefix + base_name + '.csv')
        for prefix in prefixes
    ]
    train_x, valid_x, test_x = [
        arr.reshape((arr.shape[0], seq_num, -1))
        for arr in (train_x, valid_x, test_x)
    ]

    if combined:
        sg_name = 'embedding_skipgram' + size_tag
        sg_arrays = [
            CsvUtility.read_array_from_csv(file_path, prefix + sg_name + '.csv')
            for prefix in ('formal_train_x_seq_',
                           'formal_valid_x_seq_',
                           'formal_test_x_seq_')
        ]
        train_x_sg, valid_x_sg, test_x_sg = [
            arr.reshape((arr.shape[0], seq_num, -1)) for arr in sg_arrays
        ]

        if embedding_type == 'sg_add_sgns':
            # Element-wise sum of the two embedded inputs.
            train_x = train_x + train_x_sg
            valid_x = valid_x + valid_x_sg
            test_x = test_x + test_x_sg
        else:
            # 'sg_cancat_sgns': stack the two embedded inputs along axis 2.
            train_x = np.concatenate((train_x, train_x_sg), axis=2)
            valid_x = np.concatenate((valid_x, valid_x_sg), axis=2)
            test_x = np.concatenate((test_x, test_x_sg), axis=2)

    if valid:
        # Evaluate on the validation split; the test split is not used.
        test_x, test_y = valid_x, valid_y
    else:
        # Fold the validation split into the training data.
        train_x = np.concatenate((train_x, valid_x), axis=0)
        train_y = np.concatenate((train_y, valid_y), axis=0)

    if train_percent < 0.8:
        # The cut-off is a share of train + test rows, not of train rows only.
        keep = int((train_x.shape[0] + test_x.shape[0]) * train_percent)
        train_x, train_y = train_x[:keep], train_y[:keep]
    return train_x, train_y, test_x, test_y
예제 #5
0
파일: lda.py 프로젝트: ykpku/LDA-Reg-TKDE
    def read_topic_distrib(self):
        """Read the stored topic distribution array for the configured corpus.

        The file is read from ``self.output_path`` and is named
        ``<corpus>_topic_distrib_<topic_num><suffix>.csv``. ``<corpus>`` is
        ``MIMIC`` when ``self.mimic_movie_wiki`` equals 0, ``MovieReview``
        when it equals 1 and ``Wiki`` for any other value. The suffix contains
        ``_PLSA`` when ``self.PLSA`` is truthy, followed by
        ``_<corpus_percent>percent`` when ``self.corpus_percent`` differs
        from 1.0.

        Returns:
            Whatever ``CsvUtility.read_array_from_csv`` returns for that file.
        """
        selector = self.mimic_movie_wiki
        name = "Wiki"
        if selector == 0:
            name = "MIMIC"
        elif selector == 1:
            name = "MovieReview"

        suffix = "_PLSA" if self.PLSA else ""
        if self.corpus_percent != 1.0:
            suffix += "_" + str(self.corpus_percent) + "percent"

        file_name = (name + '_topic_distrib_' + str(self.topic_num) + suffix +
                     '.csv')
        return CsvUtility.read_array_from_csv(self.output_path, file_name)
예제 #6
0
def cluster_neurons(
        neuron_labels,
        base_path="/home1/yk/experiments_TKDE/major_revision/",
        sita_file="#2020-11-21 03_33_04#_sita_mimic_mlp_ldareg_1layer_0.8.csv",
        cluster_num=10,
        label_num=80):
    """Cluster the rows of a stored ``sita`` matrix and count labels per cluster.

    The matrix is read from ``base_path``/``sita_file`` and clustered with
    ``SpectralClustering``. For every row ``i`` the labels in
    ``neuron_labels[i]`` are tallied against the cluster assigned to that row.

    Args:
        neuron_labels: Indexable by row position; ``neuron_labels[i]`` is an
            iterable of integer label indices, each smaller than
            ``label_num``.
        base_path: Directory containing ``sita_file``.
        sita_file: CSV file holding the matrix to cluster.
        cluster_num: Number of clusters to form.
        label_num: Number of rows in the returned count matrix. It was
            previously hard-coded to 80, which remains the default.

    Returns:
        A tuple ``(label_cluster_matrix, labels)``. ``label_cluster_matrix``
        has shape ``(label_num, cluster_num)`` and entry ``[l, c]`` counts how
        often label ``l`` occurs among rows assigned to cluster ``c``.
        ``labels`` is the fitted model's ``labels_`` attribute.
    """
    sita = CsvUtility.read_array_from_csv(base_path, sita_file)
    # The seed is drawn from the global `random` module, so the clustering
    # can differ between runs unless the caller seeds `random` first.
    sc = SpectralClustering(cluster_num,
                            assign_labels='discretize',
                            random_state=random.randint(0, 10))
    sc.fit(sita)
    label_cluster_matrix = np.zeros((label_num, cluster_num))
    for i, cluster in enumerate(sc.labels_):
        for label in neuron_labels[i]:
            label_cluster_matrix[label][cluster] += 1
    return label_cluster_matrix, sc.labels_
예제 #7
0
def cal_common_label_ratio(
        neuron_labels,
        base_path="/home1/yk/experiments_TKDE/major_revision/",
        sita_file="#2020-11-21 03_33_04#_sita_mimic_mlp_ldareg_1layer_0.8.csv"
):
    """Count rows whose most similar rows share a common label.

    The ``sita`` matrix is read from ``base_path``/``sita_file`` and the
    pairwise cosine similarity of its rows is computed. For each row,
    ``get_list_sort_index(similarities, 4)`` selects a set of row indices; the
    row counts as consistent when some label occurs at least 3 times across
    the label collections of the selected rows.

    Args:
        neuron_labels: Indexable by row position; each entry is an iterable
            of hashable labels.
        base_path: Directory containing ``sita_file``.
        sita_file: CSV file holding the matrix.

    Returns:
        A tuple ``(count, ratio)`` where ``count`` is the number of consistent
        rows and ``ratio`` is ``count`` divided by ``len(neuron_labels)``.
    """
    sita = CsvUtility.read_array_from_csv(base_path, sita_file)
    similarity = cosine_similarity(sita)
    consistent = 0
    for row_sims in similarity:
        # Only the selected indices are needed; the second result is unused.
        nearest, _ = get_list_sort_index(row_sims, 4)
        tally = {}
        for neighbour in nearest:
            for label in neuron_labels[neighbour]:
                tally[label] = tally.get(label, 0) + 1
        if any(hits >= 3 for hits in tally.values()):
            consistent += 1
    ratio = consistent * 1.0 / len(neuron_labels)
    return consistent, ratio
예제 #8
0
def generate_train_test(base_path, seq_max, vec_len, sgns_path, save_path, seq_not, train_valid_perc=0.8, shuffle=False, save=False):
    """Build flattened train/test arrays from the MIMIC sequence data.

    The tensors come from ``get_mimic_sequence_data`` (always called with
    ``save=False``, independent of this function's ``save`` flag). The
    features are aggregated with ``__get_aggregate_seq``, and features and
    targets are each flattened to two dimensions. Rows are then reordered by
    an index array and split at ``int(rows * train_valid_perc)``.

    Args:
        base_path: Prefix of the input pickle paths and directory of the
            stored index file ``random_idx_seq_SGNS.csv``.
        seq_max: Forwarded to ``get_mimic_sequence_data``.
        vec_len: Forwarded to ``get_mimic_sequence_data``.
        sgns_path: Forwarded to ``get_mimic_sequence_data``.
        save_path: Forwarded to ``get_mimic_sequence_data`` and used as the
            output directory when ``save`` is truthy.
        seq_not: Forwarded to ``__get_aggregate_seq``.
        train_valid_perc: Fraction of rows placed in the training part.
        shuffle: If truthy, draw a fresh random permutation and write it to
            the index file; otherwise read the index array from that file.
        save: If truthy, write the four resulting arrays to ``save_path``.

    Returns:
        A tuple ``(training_x, training_y, testing_x, testing_y)``.
    """
    feature_tensor, feature_count_tensor, result_tensor = get_mimic_sequence_data(
        data_pickle_path=base_path + 'after_instance_seq.pkl',
        word_dict_path=base_path + 'event_instance_dict_seq.pkl',
        predict_dict_path=base_path + 'predict_diags_dict_seq.pkl',
        seq_max=seq_max,
        vec_len=vec_len,
        sgns_path=sgns_path,
        save_path=save_path,
        save=False)
    aggregated = __get_aggregate_seq(feature_tensor, feature_count_tensor, seq_not)
    flat_x = aggregated.reshape((aggregated.shape[0], -1))
    flat_y = result_tensor.reshape((result_tensor.shape[0], -1))
    split_at = int(flat_x.shape[0] * train_valid_perc)

    # Tag appended to every file name; kept as a variable for later extension.
    name_append = 'SGNS'
    index_file = 'random_idx_seq_' + name_append + '.csv'
    if not shuffle:
        # Reuse the previously stored row order.
        order = CsvUtility.read_array_from_csv(base_path, index_file)
    else:
        # Draw a new row order and store it so later runs can reuse it.
        order = np.random.permutation(flat_x.shape[0])
        CsvUtility.write_array2csv(order, base_path, index_file)

    ordered_x = flat_x[order]
    ordered_y = flat_y[order]
    training_x, testing_x = ordered_x[:split_at], ordered_x[split_at:]
    training_y, testing_y = ordered_y[:split_at], ordered_y[split_at:]

    if save:
        outputs = (
            (training_x, 'formal_train_valid_x_seq_'),
            (training_y, 'formal_train_valid_y_seq_'),
            (testing_x, 'formal_test_x_seq_'),
            (testing_y, 'formal_test_y_seq_'),
        )
        for array, prefix in outputs:
            CsvUtility.write_array2csv(array, save_path, prefix + name_append + '.csv')
    return training_x, training_y, testing_x, testing_y