Example #1
def get_corpus_contend_thread(process_index,
                              file_list,
                              word2index,
                              write_path="/home1/yk/wikipedia_dataset/filter",
                              word_kind_limit=50,
                              remove_stopwords=False,
                              stem_words=True,
                              remove_html=True):

    corpus_contend = []
    for file_iter, file_name in enumerate(file_list):
        tem_data = get_real_word_list(file_name, word2index, word_kind_limit,
                                      remove_stopwords, stem_words,
                                      remove_html)
        # print(file_name, 'read ready~', len(tem_data))

        corpus_contend.extend(tem_data)
        if (file_iter + 1) % 10 == 0:
            print((file_iter + 1), 'file done.')
            if (file_iter + 1) % 100 == 0:
                name = process_index + "process_" + str(file_iter +
                                                        1) + "iter_text.csv"
                CsvUtility.write_norm_array2csv(corpus_contend, write_path,
                                                name)
                corpus_contend = []

    print(process_index, 'finish~')
    return corpus_contend
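
All of the examples on this page route file I/O through the project's CsvUtility helper, whose implementation is not reproduced here. A minimal, hypothetical sketch of the write/read pair used for the filtered corpus (write_norm_array2csv above, read_norm_array_csv in Example #3), assuming each document is stored as one comma-separated row of tokens, might look like this:

import csv
import os


class CsvUtility(object):

    @staticmethod
    def write_norm_array2csv(rows, write_path, file_name):
        # Hypothetical sketch: write one document (a list of tokens) per CSV row.
        with open(os.path.join(write_path, file_name), 'w', newline='') as f:
            csv.writer(f).writerows(rows)

    @staticmethod
    def read_norm_array_csv(read_path, file_name):
        # Hypothetical sketch: read the documents back as lists of tokens.
        with open(os.path.join(read_path, file_name), 'r', newline='') as f:
            return list(csv.reader(f))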
Example #2
File: lda.py Project: ykpku/LDA-Reg-TKDE
 def __read_phi_alpha_theta_byname(self, name):
     plsa = ""
     percent = ""
     if self.PLSA:
         plsa = "_PLSA"
     if self.corpus_percent != 1.0:
         percent = "_" + str(self.corpus_percent) + "percent"
     alpha = CsvUtility.read_array_from_csv(
         self.doc_path,
         name + 'alpha_' + str(self.topic_num) + plsa + percent + '.csv')
     phi = CsvUtility.read_array_from_csv(
         self.doc_path,
         name + 'phi_' + str(self.topic_num) + plsa + percent + '.csv')
     # theta = CsvUtility.read_array_from_csv(self.output_path, name+'theta_' + str(self.topic_num) + plsa + percent + '.csv')
     return alpha, phi
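
The alpha and phi files read here are written by save_phi_alpha_theta_topicdistrib (Example #13). A hypothetical sketch of CsvUtility.read_array_from_csv as these examples assume it to behave (join the directory and file name, return a NumPy array) could be:

import os
import numpy as np


def read_array_from_csv(doc_path, file_name):
    # Hypothetical sketch: load a numeric CSV file into a NumPy array.
    return np.loadtxt(os.path.join(doc_path, file_name), delimiter=',')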
Example #3
def get_filter_data(path):
    get_con = []
    for process_index in range(1):
        for file_iter in range(6):
            name = str(process_index) + "process_" + str(file_iter +
                                                         1) + "00iter_text.csv"
            content = CsvUtility.read_norm_array_csv(path, name)
            # print(len(content))
            get_con.extend(content)
    print(" content number : ", len(get_con))
    return get_con[:100000]
Example #4
File: lda.py Project: ykpku/LDA-Reg-TKDE
 def __change_Movie_word_index__(self, gamma, word2index_pickle):
     feature_word2id = CsvUtility.read_pickle(word2index_pickle, 'r')
     print('feature size: ', len(feature_word2id))
     change_index_result = np.zeros((gamma.shape[0], len(feature_word2id)))
     for j in range(gamma.shape[1]):
         new_index = feature_word2id[self.dictionary.__getitem__(j)]
         for i in range(gamma.shape[0]):
             change_index_result[i][new_index] += gamma[i][j]
         if j % 1000 == 0:
             print(j, 'line')
     print('after changing the size of result: ', change_index_result.shape)
     return change_index_result
Example #5
def reload_mimic_embedding(train_percent=MIMICP.train_percent, valid=False, file_path=MIMICP.mimic_data_path, seq_num=MIMICP.seq_num, embedding_type=EMBEDP.embedding_type, veclen=EMBEDP.veclen, window=EMBEDP.window):
    if embedding_type == 'sg_add_sgns' or embedding_type == 'sg_cancat_sgns':
        embedding_name = 'lda_sgns' + str(veclen) + '_window' + str(window)
    else:
        embedding_name = embedding_type + str(veclen) + '_window' + str(window)
    train_x = CsvUtility.read_array_from_csv(file_path, 'formal_train_x_seq_' + embedding_name + '.csv')
    train_y = CsvUtility.read_array_from_csv(file_path, 'formal_train_y_seq_' + embedding_name + '.csv')
    valid_x = CsvUtility.read_array_from_csv(file_path, 'formal_valid_x_seq_' + embedding_name + '.csv')
    valid_y = CsvUtility.read_array_from_csv(file_path, 'formal_valid_y_seq_' + embedding_name + '.csv')
    test_x = CsvUtility.read_array_from_csv(file_path, 'formal_test_x_seq_' + embedding_name + '.csv')
    test_y = CsvUtility.read_array_from_csv(file_path, 'formal_test_y_seq_' + embedding_name + '.csv')
    train_x = train_x.reshape((train_x.shape[0], seq_num, -1))
    valid_x = valid_x.reshape((valid_x.shape[0], seq_num, -1))
    test_x = test_x.reshape((test_x.shape[0], seq_num, -1))

    if embedding_type == 'sg_add_sgns' or embedding_type == 'sg_cancat_sgns':
        embedding_name = 'embedding_skipgram' + str(veclen) + '_window' + str(window)
        train_x_sg = CsvUtility.read_array_from_csv(file_path, 'formal_train_x_seq_' + embedding_name + '.csv')
        valid_x_sg = CsvUtility.read_array_from_csv(file_path, 'formal_valid_x_seq_' + embedding_name + '.csv')
        test_x_sg = CsvUtility.read_array_from_csv(file_path, 'formal_test_x_seq_' + embedding_name + '.csv')
        train_x_sg = train_x_sg.reshape((train_x_sg.shape[0], seq_num, -1))
        valid_x_sg = valid_x_sg.reshape((valid_x_sg.shape[0], seq_num, -1))
        test_x_sg = test_x_sg.reshape((test_x_sg.shape[0], seq_num, -1))

        if embedding_type == 'sg_add_sgns':
            train_x = train_x + train_x_sg
            valid_x = valid_x + valid_x_sg
            test_x = test_x + test_x_sg
        if embedding_type == 'sg_cancat_sgns':
            train_x = np.concatenate((train_x, train_x_sg), axis=2)
            valid_x = np.concatenate((valid_x, valid_x_sg), axis=2)
            test_x = np.concatenate((test_x, test_x_sg), axis=2)

    if valid:
        test_x = valid_x
        test_y = valid_y
    else:
        train_x = np.concatenate((train_x, valid_x), axis=0)
        train_y = np.concatenate((train_y, valid_y), axis=0)
    if train_percent < 0.8:
        new_training_size = int((train_x.shape[0] + test_x.shape[0]) * train_percent)
        train_x = train_x[:new_training_size]
        train_y = train_y[:new_training_size]
    return train_x, train_y, test_x, test_y
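
A hypothetical call, relying on the MIMICP and EMBEDP defaults baked into the signature above:

train_x, train_y, test_x, test_y = reload_mimic_embedding(
    train_percent=0.8,  # keep the full training split (no subsampling)
    valid=False)        # fold the validation split into the training set
print(train_x.shape, test_x.shape)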
Example #6
def save_results(net, sita, result_epoch, time_epochs, run_p):
    save_path = run_p.save_path
    save_name = run_p.save_name

    time_code = '#' + time.strftime("%Y-%m-%d %H:%M:%S",
                                    time.localtime()) + '#_'

    used_params = [run_p]
    if run_p.mimic0_movie1_wiki2 == 0:
        used_params.append(MIMICP)
    else:
        used_params.append(MOVIEP)
    if run_p.onehot0_embedding != 0:
        used_params.append(EMBEDP)
    if run_p.lm_lda_l2 == 0:
        used_params.append(LSTMP)
        used_params.append(LDAP)
        used_params.append(ldaregP)
    elif run_p.lm_lda_l2 == 1:
        used_params.append(LSTMP)
    elif run_p.lm_lda_l2 == 2:
        used_params.append(MLPP)
        used_params.append(LDAP)
        used_params.append(ldaregP)
    else:
        used_params.append(MLPP)
    for param_item in used_params:
        param_item.save_self(save_path,
                             time_code + 'params_' + save_name + '.csv')

    # save net
    torch.save(net, save_path + time_code + 'lstm_model_' + save_name + '.pkl')

    # save topic distribution
    if sita.ndim > 1:
        CsvUtility.write_array2csv(sita, save_path,
                                   time_code + 'sita_' + save_name + '.csv')

    # save results
    metric_result, aucs = __split_metrics(result_epoch)
    CsvUtility.write_list2csv(metric_result, save_path,
                              time_code + 'metrics_' + save_name + '.csv')
    if len(aucs) > 0:
        CsvUtility.write_array2csv(aucs, save_path,
                                   time_code + 'aucs_' + save_name + '.csv')

    # save time consuming
    CsvUtility.write_array2csv(time_epochs, save_path,
                               time_code + 'time_epochs_' + save_name + '.csv')
Example #7
def _load_and_process_metadata(sentence_dir,
                               movie_review_path,
                               num_processor=8):

    # Extract the filenames.
    sentence_filenames = glob.glob(os.path.join(sentence_dir, "*/*"))
    print(len(sentence_filenames))
    # print(sentence_filenames[-10:])
    # sentence_filenames = sentence_filenames[-20:]

    word2index = CsvUtility.read_pickle(
        movie_review_path + '/new_word2index.pkl', 'r')
    index2word = CsvUtility.read_pickle(
        movie_review_path + '/new_index2word.pkl', 'r')

    # Break the files into num_threads batches.
    spacing = np.linspace(0, len(sentence_filenames),
                          num_processor + 1).astype(int)
    ranges = []
    for i in range(len(spacing) - 1):
        ranges.append([spacing[i], spacing[i + 1]])

    p = Pool(num_processor)
    res = []
    for i in range(num_processor):
        start = ranges[i][0]
        end = ranges[i][1]
        res.append(
            p.apply_async(get_corpus_contend_thread,
                          args=(str(i), sentence_filenames[start:end],
                                word2index)))
        print(str(i) + ' processor started !')

    # get_corpus_contend(sentence_filenames, word2index)
    p.close()
    p.join()
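
The worker results collected in res are not consumed inside this function; the commented-out block at the top of Example #15 shows how they were gathered. Uncommented, and placed inside _load_and_process_metadata after p.join(), that collection step reads roughly:

filter_contend = {}
filter_index = 0
for r in res:
    for doc in r.get():  # one worker's list of token lists
        filter_contend[str(filter_index)] = ' '.join(doc)
        filter_index += 1
CsvUtility.write_dict2csv(filter_contend, sentence_dir,
                          'selected_movie_review_docs4LDA.csv')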
Example #8
File: lda.py Project: ykpku/LDA-Reg-TKDE
    def read_topic_distrib(self):
        if self.mimic_movie_wiki == 0:
            name = "MIMIC"
        elif self.mimic_movie_wiki == 1:
            name = "MovieReview"
        else:
            name = "Wiki"
        plsa = ""

        if self.PLSA:
            plsa = "_PLSA"
        percent = ""
        if self.corpus_percent != 1.0:
            percent = "_" + str(self.corpus_percent) + "percent"
        topic_distrib = CsvUtility.read_array_from_csv(
            self.output_path, name + '_topic_distrib_' + str(self.topic_num) +
            plsa + percent + '.csv')
        return topic_distrib
Example #9
def cluster_neurons(
        neuron_labels,
        base_path="/home1/yk/experiments_TKDE/major_revision/",
        sita_file="#2020-11-21 03_33_04#_sita_mimic_mlp_ldareg_1layer_0.8.csv",
        cluster_num=10):
    sita = CsvUtility.read_array_from_csv(base_path, sita_file)
    # print(sita[:3])
    # print(sita.shape)
    sc = SpectralClustering(cluster_num,
                            assign_labels='discretize',
                            random_state=random.randint(0, 10))
    sc.fit(sita)
    # print(sc.labels_)
    label_cluster_matrix = np.zeros((80, cluster_num))
    for i, cluster in enumerate(sc.labels_):
        neuron_i_labels = neuron_labels[i]
        for nil in neuron_i_labels:
            label_cluster_matrix[nil][cluster] += 1
    return label_cluster_matrix, sc.labels_
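
A hypothetical call, assuming neuron_labels maps each neuron index to a list of label indices in [0, 80), as produced elsewhere in the project (see get_neron_labels in Example #17):

label_cluster_matrix, cluster_assignment = cluster_neurons(neuron_labels,
                                                           cluster_num=10)
print(label_cluster_matrix.shape)  # (80, 10): label counts per cluster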
Example #10
def cal_common_label_ratio(
        neuron_labels,
        base_path="/home1/yk/experiments_TKDE/major_revision/",
        sita_file="#2020-11-21 03_33_04#_sita_mimic_mlp_ldareg_1layer_0.8.csv"
):
    sita = CsvUtility.read_array_from_csv(base_path, sita_file)
    sita_cos_sim = cosine_similarity(sita)
    consistent_neurons_num = 0
    for neuron_i, neuron_sims in enumerate(sita_cos_sim):
        top_k, top_v = get_list_sort_index(neuron_sims, 4)
        top_k_label_count = {}
        for top_k_i in top_k:
            k_i_labels = neuron_labels[top_k_i]
            for l in k_i_labels:
                top_k_label_count[l] = top_k_label_count.setdefault(l, 0) + 1
        for label_count_val in top_k_label_count.values():
            if label_count_val >= 3:
                consistent_neurons_num += 1
                break
    return consistent_neurons_num, consistent_neurons_num * 1.0 / len(
        neuron_labels)
Example #11
def get_some_instance(file_path=MIMICP.mimic_data_path, seq_num=MIMICP.seq_num, num=10):
    train_x = CsvUtility.read_sparse_array_from_csv(file_path, 'sparse_formal_train_x_seq.npz')
    train_x = train_x.reshape((train_x.shape[0], seq_num, -1))
    valid_x = CsvUtility.read_sparse_array_from_csv(file_path, 'sparse_formal_valid_x_seq.npz')
    valid_x = valid_x.reshape((valid_x.shape[0], seq_num, -1))
    test_x = CsvUtility.read_sparse_array_from_csv(file_path, 'sparse_formal_test_x_seq.npz')
    test_x = test_x.reshape((test_x.shape[0], seq_num, -1))
    train_y = CsvUtility.read_array_from_csv(file_path, 'formal_train_y_seq.csv')
    valid_y = CsvUtility.read_array_from_csv(file_path, 'formal_valid_y_seq.csv')
    test_y = CsvUtility.read_array_from_csv(file_path, 'formal_test_y_seq.csv')

    x_data = np.concatenate((train_x, valid_x, test_x), axis=0)
    y_data = np.concatenate((train_y, valid_y, test_y), axis=0)

    idx = np.random.permutation(x_data.shape[0])
    x_data = x_data[idx]
    y_data = y_data[idx]
    return x_data[:num], y_data[:num]
Example #12
def reload_mimic_seq(train_percent=MIMICP.train_percent, valid=False, file_path=MIMICP.mimic_data_path, seq_num=MIMICP.seq_num):
    train_x = CsvUtility.read_sparse_array_from_csv(file_path, 'sparse_formal_train_x_seq.npz')
    train_x = train_x.reshape((train_x.shape[0], seq_num, -1))
    valid_x = CsvUtility.read_sparse_array_from_csv(file_path, 'sparse_formal_valid_x_seq.npz')
    valid_x = valid_x.reshape((valid_x.shape[0], seq_num, -1))
    test_x = CsvUtility.read_sparse_array_from_csv(file_path, 'sparse_formal_test_x_seq.npz')
    test_x = test_x.reshape((test_x.shape[0], seq_num, -1))
    train_y = CsvUtility.read_array_from_csv(file_path, 'formal_train_y_seq.csv')
    valid_y = CsvUtility.read_array_from_csv(file_path, 'formal_valid_y_seq.csv')
    test_y = CsvUtility.read_array_from_csv(file_path, 'formal_test_y_seq.csv')
    if valid:
        test_x = valid_x
        test_y = valid_y
    else:
        train_x = np.concatenate((train_x, valid_x), axis=0)
        train_y = np.concatenate((train_y, valid_y), axis=0)
    if train_percent < 0.8:
        new_training_size = int((train_x.shape[0] + test_x.shape[0]) * train_percent)
        train_x = train_x[:new_training_size]
        train_y = train_y[:new_training_size]
    return train_x, train_y, test_x, test_y
Example #13
File: lda.py Project: ykpku/LDA-Reg-TKDE
 def save_phi_alpha_theta_topicdistrib(self):
     plsa = ""
     percent = ""
     if self.PLSA:
         plsa = "_PLSA"
     if self.corpus_percent != 1.0:
         percent = "_" + str(self.corpus_percent) + "percent"
     if self.mimic_movie_wiki == 0:
         CsvUtility.write_array2csv(
             self.get_alpha(), self.output_path,
             'MIMIC_alpha_' + str(self.topic_num) + plsa + percent + '.csv')
         CsvUtility.write_array2csv(
             self.get_mimic_phi(MIMICP.feature_index_file),
             self.output_path,
             'MIMIC_phi_' + str(self.topic_num) + plsa + percent + '.csv')
         CsvUtility.write_array2csv(
             self.get_theta(), self.output_path,
             'MIMIC_theta_' + str(self.topic_num) + plsa + percent + '.csv')
         CsvUtility.write_array2csv(
             self.get_topic_distrib_of_word(), self.output_path,
             'MIMIC_topic_distrib_' + str(self.topic_num) + plsa + percent +
             '.csv')
     elif self.mimic_movie_wiki == 1:
         CsvUtility.write_array2csv(
             self.get_alpha(), self.output_path, 'MovieReview_alpha_' +
             str(self.topic_num) + plsa + percent + '.csv')
         CsvUtility.write_array2csv(
             self.get_movie_phi(MOVIEP.feature_index_file),
             self.output_path, 'MovieReview_phi_' + str(self.topic_num) +
             plsa + percent + '.csv')
         CsvUtility.write_array2csv(
             self.get_theta(), self.output_path, 'MovieReview_theta_' +
             str(self.topic_num) + plsa + percent + '.csv')
         CsvUtility.write_array2csv(
             self.get_topic_distrib_of_word(), self.output_path,
             'MovieReview_topic_distrib_' + str(self.topic_num) + plsa +
             percent + '.csv')
     else:
         CsvUtility.write_array2csv(
             self.get_alpha(), self.output_path,
             'Wiki_alpha_' + str(self.topic_num) + plsa + percent + '.csv')
         CsvUtility.write_array2csv(
             self.get_movie_phi(MOVIEP.feature_index_file),
             self.output_path,
             'Wiki_phi_' + str(self.topic_num) + plsa + percent + '.csv')
         CsvUtility.write_array2csv(
             self.get_theta(), self.output_path,
             'Wiki_theta_' + str(self.topic_num) + plsa + percent + '.csv')
         CsvUtility.write_array2csv(
             self.get_topic_distrib_of_word(), self.output_path,
             'Wiki_topic_distrib_' + str(self.topic_num) + plsa + percent +
             '.csv')
Example #14
def get_mimic_sequence_data(data_pickle_path, word_dict_path, predict_dict_path, seq_max, vec_len, sgns_path, save_path, save=False):
    all_events = CsvUtility.read_pickle(data_pickle_path, 'r')
    word_dict = CsvUtility.read_pickle(word_dict_path, 'r')
    predict_dict = CsvUtility.read_pickle(predict_dict_path, 'r')
    # pprint(all_events[0])
    print("word_dict:", len(word_dict), "predict_dict:", len(predict_dict), "all_events:", len(all_events))
    feature_dict = WordIndexMap(list(word_dict))
    pred_dict = WordIndexMap(list(predict_dict))

    filter_event = __filter_events(all_events=all_events)
    sgns_model = get_sgns_embedding('MIMIC', sgns_path)

    feature_tensor = np.zeros((len(filter_event), seq_max, vec_len))
    feature_count_tensor = np.zeros((len(filter_event), seq_max))
    result_tensor = np.zeros((len(filter_event), len(predict_dict)))

    find_nan = {}
    for i_iter, event_line in enumerate(filter_event):
        for seq_iter, sequence_item in enumerate(event_line[0]):
            for event_code in sequence_item:
                if event_code in sgns_model:
                    feature_tensor[i_iter][seq_iter] += sgns_model[event_code]
                    feature_count_tensor[i_iter][seq_iter] += 1
                else:
                    if event_code in find_nan:
                        find_nan[event_code] += 1
                    else:
                        find_nan[event_code] = 1
        for pred_item in event_line[1]:
            result_tensor[i_iter][pred_dict.get_index_by_word(pred_item)] = 1

        if i_iter % 1000 == 0:
            print('complete {0} of {1}'.format(i_iter, len(filter_event)))
    print('words not in docs:', len(find_nan))
    if save:
        CsvUtility.write_dict2csv(feature_dict.get_word2index(), save_path, 'feature2index_seq_embedding'+str(vec_len)+'.csv')
        CsvUtility.write_dict2csv(pred_dict.get_word2index(), save_path, 'predict2index_seq_embedding'+str(vec_len)+'.csv')
        CsvUtility.write_array2csv(feature_tensor.reshape((feature_tensor.shape[0], -1)), save_path, 'feature_matrix_seq_embedding'+str(vec_len)+'.csv')
        CsvUtility.write_array2csv(result_tensor.reshape((result_tensor.shape[0], -1)), save_path, 'result_matrix_seq_embedding'+str(vec_len)+'.csv')

    return feature_tensor, feature_count_tensor, result_tensor
Example #15
    # filter_contend = {}
    # filter_index = 0
    # for i in res:
    #     for a in i.get():
    #         filter_contend[str(filter_index)] = ' '.join(a)
    #         filter_index += 1
    # CsvUtility.write_dict2csv(filter_contend, sentence_dir, 'selected_movie_review_docs4LDA.csv')


def get_filter_data(path):
    get_con = []
    for process_index in range(1):
        for file_iter in range(6):
            name = str(process_index) + "process_" + str(file_iter +
                                                         1) + "00iter_text.csv"
            content = CsvUtility.read_norm_array_csv(path, name)
            # print(len(content))
            get_con.extend(content)
    print(" content number : ", len(get_con))
    return get_con[:100000]
    # print(content[0])


if __name__ == '__main__':
    # _load_and_process_metadata("/home1/yk/wikipedia_dataset/text", "/home1/yk/Movie_Review_data", num_processor=20)
    contend = get_filter_data("/home1/yk/wikipedia_dataset/filter")
    name = "wiki_text.csv"
    CsvUtility.write_norm_array2csv(contend,
                                    "/home1/yk/wikipedia_dataset/filter", name)
    pass
Example #16
def generate_train_test(base_path, seq_max, vec_len, sgns_path, save_path, seq_not, train_valid_perc=0.8, shuffle=False, save=False):

    feature_tensor, feature_count_tensor, result_tensor = get_mimic_sequence_data(data_pickle_path=base_path+'after_instance_seq.pkl',
                                                            word_dict_path=base_path+'event_instance_dict_seq.pkl',
                                                            predict_dict_path=base_path+'predict_diags_dict_seq.pkl',
                                                            seq_max=seq_max,
                                                            vec_len=vec_len,
                                                            sgns_path=sgns_path,
                                                            save_path=save_path,
                                                            save=False)
    feature_tensor = __get_aggregate_seq(feature_tensor, feature_count_tensor, seq_not)
    x = feature_tensor.reshape((feature_tensor.shape[0], -1))
    y = result_tensor.reshape((result_tensor.shape[0], -1))
    train_size = int(x.shape[0] * train_valid_perc)
    # for further extension
    name_append = 'SGNS'
    # shuffle the train set
    if shuffle:
        idx = np.random.permutation(x.shape[0])
        CsvUtility.write_array2csv(idx, base_path, 'random_idx_seq_' + name_append + '.csv')
    else:
        idx = CsvUtility.read_array_from_csv(base_path, 'random_idx_seq_' + name_append + '.csv')
    x_train = x[idx]
    y_train = y[idx]

    training_x = x_train[:train_size]
    training_y = y_train[:train_size]
    testing_x = x_train[train_size:]
    testing_y = y_train[train_size:]
    # print training_x.shape
    # print training_y.shape
    # print testing_x.shape
    # print testing_y.shape
    # print len(idx)
    if save:
        CsvUtility.write_array2csv(training_x, save_path, 'formal_train_valid_x_seq_'+name_append+'.csv')
        CsvUtility.write_array2csv(training_y, save_path, 'formal_train_valid_y_seq_'+name_append+'.csv')
        CsvUtility.write_array2csv(testing_x, save_path, 'formal_test_x_seq_'+name_append+'.csv')
        CsvUtility.write_array2csv(testing_y, save_path, 'formal_test_y_seq_'+name_append+'.csv')
    return training_x, training_y, testing_x, testing_y
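
A hypothetical invocation; the directory arguments are placeholders for the project's own paths, and seq_not is an assumed flag that is forwarded unchanged to __get_aggregate_seq:

training_x, training_y, testing_x, testing_y = generate_train_test(
    base_path='/path/to/mimic/',
    seq_max=MIMICP.seq_num,
    vec_len=EMBEDP.veclen,
    sgns_path='/path/to/sgns/',
    save_path='/path/to/output/',
    seq_not=False,
    shuffle=True,  # generate and persist a fresh permutation
    save=True)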
Example #17
def run():
    base_path = "/home1/yk/experiments_TKDE/major_revision/"
    # model_file = "#2020-11-21 03_33_04#_lstm_model_mimic_mlp_ldareg_1layer_0.8.pkl"
    # sita_file = "#2020-11-21 03_33_04#_sita_mimic_mlp_ldareg_1layer_0.8.csv"
    # time_prefix_list = ['#2020-11-29 15_28_25#', '#2020-11-29 15_26_23#', '#2020-11-29 15_26_31#', '#2020-11-29 15_28_09#', '#2020-11-29 15_30_14#']
    # topic_list = ['20', '50', '100', '200', '500']
    time_prefix_list = [
        '#2020-11-30 22_36_35#', '#2020-11-30 22_36_56#',
        '#2020-11-30 22_03_35#', '#2020-11-30 22_04_03#',
        '#2020-11-30 22_09_12#'
    ]
    topic_list = [20, 50, 100, 200, 500]
    model_file_name = '_lstm_model_mimic_mlp_ldareg_1layer_0.8_topic'
    sita_file_name = '_sita_mimic_mlp_ldareg_1layer_0.8_topic'
    entropy_matrix = []
    F1_matrix = []
    # common_label_ratio_matrix = []

    neuron_num = 10
    mean_f1 = 0.0
    while mean_f1 <= 0.599:
        for i in [2]:
            model_file = time_prefix_list[i] + model_file_name + str(
                topic_list[i]) + '.pkl'
            sita_file = time_prefix_list[i] + sita_file_name + str(
                topic_list[i]) + '.csv'
            # model_file = "#2020-11-21 03_33_04#_lstm_model_mimic_mlp_ldareg_1layer_0.8.pkl"
            # sita_file = "#2020-11-21 03_33_04#_sita_mimic_mlp_ldareg_1layer_0.8.csv"
            r1, r2 = get_neron_labels(base_path=base_path,
                                      model_file=model_file,
                                      top_n_label=2,
                                      patient_num=100000,
                                      neuron_num=neuron_num,
                                      topic_num=topic_list[i])
            while len(r1) < 128:
                print("not cover all neurons !")
                neuron_num += 2
                r1, r2 = get_neron_labels(base_path=base_path,
                                          model_file=model_file,
                                          top_n_label=2,
                                          patient_num=100000,
                                          neuron_num=neuron_num,
                                          topic_num=topic_list[i])

            entropy_cluster = [topic_list[i]]
            f1_cluster = [topic_list[i]]
            common_label_ratio = [topic_list[i]]
            for tn in [2, 5, 10, 20, 30]:
                label_cluster_matrix, cluster_re = cluster_neurons(
                    neuron_labels=r1,
                    base_path=base_path,
                    sita_file=sita_file,
                    cluster_num=tn)
                cce = cal_class_entropy(
                    label_cluster_matrix=label_cluster_matrix, neuron_num=128)
                cf = cal_F1(label_cluster_matrix=label_cluster_matrix,
                            neuron_num=128,
                            cluster_re=cluster_re)
                entropy_cluster.append(cce)
                f1_cluster.append(cf)
                mean_f1 += cf
            entropy_matrix.append(entropy_cluster)
            mean_f1 /= 5.0
            F1_matrix.append(f1_cluster)
            # common_count, common_ratio = cal_common_label_ratio(neuron_labels=r1, base_path=base_path, sita_file=sita_file)
            # common_label_ratio.append(common_count)
            # common_label_ratio.append(common_ratio)
            # common_label_ratio_matrix.append(common_label_ratio)
    time_code = '#' + time.strftime("%Y-%m-%d %H:%M:%S",
                                    time.localtime()) + '#_'
    CsvUtility.write_array2csv(
        entropy_matrix, base_path,
        time_code + "class_entropy_pa100000_topneu10_toplab2_tuneTopic100.csv")
    CsvUtility.write_array2csv(
        F1_matrix, base_path,
        time_code + "f1_pa100000_topneu10_toplab2_tuneTopic100.csv")
Example #18
def get_train_validation_test_seq(base_path, seq_max, vec_len, sgns_path, save_path, seq_not, train_perc=0.8, shuffle=False, save=False):
    training_x, training_y, testing_x, testing_y = generate_train_test(base_path, seq_max, vec_len, sgns_path, save_path, seq_not, train_perc, shuffle, save)
    training_size = int(training_x.shape[0] * 0.8)
    formal_training_x = training_x[:training_size]
    formal_training_y = training_y[:training_size]
    validation_x = training_x[training_size:]
    validation_y = training_y[training_size:]
    print(formal_training_x.shape)
    print(formal_training_y.shape)
    print(validation_x.shape)
    print(validation_y.shape)
    print(testing_x.shape)
    print(testing_y.shape)
    # for further extension
    embedding_append = 'lda_sgns500_window50'

    CsvUtility.write_array2csv(formal_training_x, save_path, 'formal_train_x_seq_' + embedding_append + '.csv')
    CsvUtility.write_array2csv(formal_training_y, save_path, 'formal_train_y_seq_' + embedding_append + '.csv')
    CsvUtility.write_array2csv(validation_x, save_path, 'formal_valid_x_seq_' + embedding_append + '.csv')
    CsvUtility.write_array2csv(validation_y, save_path, 'formal_valid_y_seq_' + embedding_append + '.csv')
    CsvUtility.write_array2csv(testing_x, save_path, 'formal_test_x_seq_' + embedding_append + '.csv')
    CsvUtility.write_array2csv(testing_y, save_path, 'formal_test_y_seq_' + embedding_append + '.csv')
    return training_x, training_y, validation_x, validation_y, testing_x, testing_y