Example #1
import numpy as np

def get_mimic_sequence_data(data_pickle_path, word_dict_path, predict_dict_path, seq_max, vec_len, sgns_path, save_path, save=False):
    all_events = CsvUtility.read_pickle(data_pickle_path, 'r')
    word_dict = CsvUtility.read_pickle(word_dict_path, 'r')
    predict_dict = CsvUtility.read_pickle(predict_dict_path, 'r')
    # pprint(all_events[0])
    print "word_dict:", len(word_dict), "predict_dict:", len(predict_dict), "all_events:", len(all_events)
    feature_dict = WordIndexMap(list(word_dict))
    pred_dict = WordIndexMap(list(predict_dict))

    filter_event = __filter_events(all_events=all_events)
    sgns_model = get_sgns_embedding('MIMIC', sgns_path)

    feature_tensor = np.zeros((len(filter_event), seq_max, vec_len))
    feature_count_tensor = np.zeros((len(filter_event), seq_max))
    result_tensor = np.zeros((len(filter_event), len(predict_dict)))

    find_nan = {}
    for i_iter, event_line in enumerate(filter_event):
        for seq_iter, sequence_item in enumerate(event_line[0]):
            for event_code in sequence_item:
                if event_code in sgns_model:
                    feature_tensor[i_iter][seq_iter] += sgns_model[event_code]
                    feature_count_tensor[i_iter][seq_iter] += 1
                else:
                    if event_code in find_nan:
                        find_nan[event_code] += 1
                    else:
                        find_nan[event_code] = 1
        for pred_item in event_line[1]:
            result_tensor[i_iter][pred_dict.get_index_by_word(pred_item)] = 1

        if i_iter % 1000 == 0:
            print('completed {0} of {1}'.format(i_iter, len(filter_event)))
    print('event codes not found in the embedding model:', len(find_nan))
    if save:
        CsvUtility.write_dict2csv(feature_dict.get_word2index(), save_path, 'feature2index_seq_embedding'+str(vec_len)+'.csv')
        CsvUtility.write_dict2csv(pred_dict.get_word2index(), save_path, 'predict2index_seq_embedding'+str(vec_len)+'.csv')
        CsvUtility.write_array2csv(feature_tensor.reshape((feature_tensor.shape[0], -1)), save_path, 'feature_matrix_seq_embedding'+str(vec_len)+'.csv')
        CsvUtility.write_array2csv(result_tensor.reshape((result_tensor.shape[0], -1)), save_path, 'result_matrix_seq_embedding'+str(vec_len)+'.csv')

    return feature_tensor, feature_count_tensor, result_tensor
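
A minimal usage sketch for orientation; the paths and dimensions below are hypothetical placeholders (Example #2 shows the project's actual call):

# Hypothetical paths and dimensions, for illustration only.
features, counts, labels = get_mimic_sequence_data(
    data_pickle_path='data/after_instance_seq.pkl',
    word_dict_path='data/event_instance_dict_seq.pkl',
    predict_dict_path='data/predict_diags_dict_seq.pkl',
    seq_max=30, vec_len=200,
    sgns_path='embeddings/', save_path='output/', save=False)
# Shapes: (n_patients, seq_max, vec_len), (n_patients, seq_max), (n_patients, n_labels)
print(features.shape, counts.shape, labels.shape)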
Example #2
import numpy as np

def generate_train_test(base_path, seq_max, vec_len, sgns_path, save_path, seq_not, train_valid_perc=0.8, shuffle=False, save=False):

    feature_tensor, feature_count_tensor, result_tensor = get_mimic_sequence_data(data_pickle_path=base_path+'after_instance_seq.pkl',
                                                            word_dict_path=base_path+'event_instance_dict_seq.pkl',
                                                            predict_dict_path=base_path+'predict_diags_dict_seq.pkl',
                                                            seq_max=seq_max,
                                                            vec_len=vec_len,
                                                            sgns_path=sgns_path,
                                                            save_path=save_path,
                                                            save=False)
    feature_tensor = __get_aggregate_seq(feature_tensor, feature_count_tensor, seq_not)
    x = feature_tensor.reshape((feature_tensor.shape[0], -1))
    y = result_tensor.reshape((result_tensor.shape[0], -1))
    train_size = int(x.shape[0] * train_valid_perc)
    # for further extension
    name_append = 'SGNS'
    # shuffle: generate and persist a new random permutation; otherwise reuse the saved one
    if shuffle:
        idx = np.random.permutation(x.shape[0])
        CsvUtility.write_array2csv(idx, base_path, 'random_idx_seq_' + name_append + '.csv')
    else:
        idx = CsvUtility.read_array_from_csv(base_path, 'random_idx_seq_' + name_append + '.csv')
    x_train = x[idx]
    y_train = y[idx]

    training_x = x_train[:train_size]
    training_y = y_train[:train_size]
    testing_x = x_train[train_size:]
    testing_y = y_train[train_size:]
    # print training_x.shape
    # print training_y.shape
    # print testing_x.shape
    # print testing_y.shape
    # print len(idx)
    if save:
        CsvUtility.write_array2csv(training_x, save_path, 'formal_train_valid_x_seq_'+name_append+'.csv')
        CsvUtility.write_array2csv(training_y, save_path, 'formal_train_valid_y_seq_'+name_append+'.csv')
        CsvUtility.write_array2csv(testing_x, save_path, 'formal_test_x_seq_'+name_append+'.csv')
        CsvUtility.write_array2csv(testing_y, save_path, 'formal_test_y_seq_'+name_append+'.csv')
    return training_x, training_y, testing_x, testing_y
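
A hedged usage sketch (paths and the seq_not flag value are assumptions). Note that the permutation index file 'random_idx_seq_SGNS.csv' is only written when shuffle=True, so a first run must shuffle before later runs can reuse the saved ordering:

train_valid_x, train_valid_y, test_x, test_y = generate_train_test(
    base_path='data/', seq_max=30, vec_len=200,
    sgns_path='embeddings/', save_path='output/',
    seq_not=True, train_valid_perc=0.8, shuffle=True, save=False)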
Example #3
import time
import torch

def save_results(net, sita, result_epoch, time_epochs, run_p):
    save_path = run_p.save_path
    save_name = run_p.save_name

    time_code = '#' + time.strftime("%Y-%m-%d %H:%M:%S",
                                    time.localtime()) + '#_'

    # collect the parameter groups that apply to this run's dataset and model choice
    used_params = [run_p]
    if run_p.mimic0_movie1_wiki2 == 0:
        used_params.append(MIMICP)
    else:
        used_params.append(MOVIEP)
    if run_p.onehot0_embedding != 0:
        used_params.append(EMBEDP)
    if run_p.lm_lda_l2 == 0:
        used_params.append(LSTMP)
        used_params.append(LDAP)
        used_params.append(ldaregP)
    elif run_p.lm_lda_l2 == 1:
        used_params.append(LSTMP)
    elif run_p.lm_lda_l2 == 2:
        used_params.append(MLPP)
        used_params.append(LDAP)
        used_params.append(ldaregP)
    else:
        used_params.append(MLPP)
    for param_item in used_params:
        param_item.save_self(save_path,
                             time_code + 'params_' + save_name + '.csv')

    # save net
    torch.save(net, save_path + time_code + 'lstm_model_' + save_name + '.pkl')

    # save topic distribution
    if sita.ndim > 1:
        CsvUtility.write_array2csv(sita, save_path,
                                   time_code + 'sita_' + save_name + '.csv')

    # save results
    metric_result, aucs = __split_metrics(result_epoch)
    CsvUtility.write_list2csv(metric_result, save_path,
                              time_code + 'metrics_' + save_name + '.csv')
    if len(aucs) > 0:
        CsvUtility.write_array2csv(aucs, save_path,
                                   time_code + 'aucs_' + save_name + '.csv')

    # save time consuming
    CsvUtility.write_array2csv(time_epochs, save_path,
                               time_code + 'time_epochs_' + save_name + '.csv')
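
A sketch of how this might be called after training; every variable below is an assumed output of a prior training loop, and run_p is assumed to carry the flags read above (mimic0_movie1_wiki2, onehot0_embedding, lm_lda_l2) along with save_path and save_name:

# All inputs here are assumptions standing in for real training outputs.
save_results(net=trained_net, sita=sita_matrix,
             result_epoch=per_epoch_metrics, time_epochs=per_epoch_seconds,
             run_p=run_params)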
Example #4
import time

def run():
    base_path = "/home1/yk/experiments_TKDE/major_revision/"
    # model_file = "#2020-11-21 03_33_04#_lstm_model_mimic_mlp_ldareg_1layer_0.8.pkl"
    # sita_file = "#2020-11-21 03_33_04#_sita_mimic_mlp_ldareg_1layer_0.8.csv"
    # time_prefix_list = ['#2020-11-29 15_28_25#', '#2020-11-29 15_26_23#', '#2020-11-29 15_26_31#', '#2020-11-29 15_28_09#', '#2020-11-29 15_30_14#']
    # topic_list = ['20', '50', '100', '200', '500']
    time_prefix_list = [
        '#2020-11-30 22_36_35#', '#2020-11-30 22_36_56#',
        '#2020-11-30 22_03_35#', '#2020-11-30 22_04_03#',
        '#2020-11-30 22_09_12#'
    ]
    topic_list = [20, 50, 100, 200, 500]
    model_file_name = '_lstm_model_mimic_mlp_ldareg_1layer_0.8_topic'
    sita_file_name = '_sita_mimic_mlp_ldareg_1layer_0.8_topic'
    entropy_matrix = []
    F1_matrix = []
    # common_label_ratio_matrix = []

    neuron_num = 10
    mean_f1 = 0.0
    # retry until the mean F1 over the five cluster sizes exceeds 0.599
    while mean_f1 <= 0.599:
        mean_f1 = 0.0  # reset so each attempt computes a fresh mean
        for i in [2]:
            model_file = time_prefix_list[i] + model_file_name + str(
                topic_list[i]) + '.pkl'
            sita_file = time_prefix_list[i] + sita_file_name + str(
                topic_list[i]) + '.csv'
            # model_file = "#2020-11-21 03_33_04#_lstm_model_mimic_mlp_ldareg_1layer_0.8.pkl"
            # sita_file = "#2020-11-21 03_33_04#_sita_mimic_mlp_ldareg_1layer_0.8.csv"
            r1, r2 = get_neron_labels(base_path=base_path,
                                      model_file=model_file,
                                      top_n_label=2,
                                      patient_num=100000,
                                      neuron_num=neuron_num,
                                      topic_num=topic_list[i])
            while len(r1) < 128:
                print("not all neurons covered; increasing neuron_num")
                neuron_num += 2
                r1, r2 = get_neron_labels(base_path=base_path,
                                          model_file=model_file,
                                          top_n_label=2,
                                          patient_num=100000,
                                          neuron_num=neuron_num,
                                          topic_num=topic_list[i])

            entropy_cluster = [topic_list[i]]
            f1_cluster = [topic_list[i]]
            common_label_ratio = [topic_list[i]]
            for tn in [2, 5, 10, 20, 30]:
                label_cluster_matrix, cluster_re = cluster_neurons(
                    neuron_labels=r1,
                    base_path=base_path,
                    sita_file=sita_file,
                    cluster_num=tn)
                cce = cal_class_entropy(
                    label_cluster_matrix=label_cluster_matrix, neuron_num=128)
                cf = cal_F1(label_cluster_matrix=label_cluster_matrix,
                            neuron_num=128,
                            cluster_re=cluster_re)
                entropy_cluster.append(cce)
                f1_cluster.append(cf)
                mean_f1 += cf
            entropy_matrix.append(entropy_cluster)
            F1_matrix.append(f1_cluster)
            mean_f1 /= 5.0  # average over the five cluster sizes
            # common_count, common_ratio = cal_common_label_ratio(neuron_labels=r1, base_path=base_path, sita_file=sita_file)
            # common_label_ratio.append(common_count)
            # common_label_ratio.append(common_ratio)
            # common_label_ratio_matrix.append(common_label_ratio)
    time_code = '#' + time.strftime("%Y-%m-%d %H:%M:%S",
                                    time.localtime()) + '#_'
    CsvUtility.write_array2csv(
        entropy_matrix, base_path,
        time_code + "class_entropy_pa100000_topneu10_toplab2_tuneTopic100.csv")
    CsvUtility.write_array2csv(
        F1_matrix, base_path,
        time_code + "f1_pa100000_topneu10_toplab2_tuneTopic100.csv")
Example #5
File: lda.py  Project: ykpku/LDA-Reg-TKDE
def save_phi_alpha_theta_topicdistrib(self):
    plsa = "PLSA" if self.PLSA else ""
    percent = "_" + str(self.corpus_percent) + "percent" if self.corpus_percent != 1.0 else ""
    suffix = str(self.topic_num) + plsa + percent + '.csv'
    # pick the corpus prefix and the matching phi getter
    if self.mimic_movie_wiki == 0:
        prefix = 'MIMIC'
        get_phi = lambda: self.get_mimic_phi(MIMICP.feature_index_file)
    elif self.mimic_movie_wiki == 1:
        prefix = 'MovieReview'
        get_phi = lambda: self.get_movie_phi(MOVIEP.feature_index_file)
    else:
        # the Wiki branch also uses the movie phi getter
        prefix = 'Wiki'
        get_phi = lambda: self.get_movie_phi(MOVIEP.feature_index_file)
    CsvUtility.write_array2csv(self.get_alpha(), self.output_path,
                               prefix + '_alpha_' + suffix)
    CsvUtility.write_array2csv(get_phi(), self.output_path,
                               prefix + '_phi_' + suffix)
    CsvUtility.write_array2csv(self.get_theta(), self.output_path,
                               prefix + '_theta_' + suffix)
    CsvUtility.write_array2csv(self.get_topic_distrib_of_word(), self.output_path,
                               prefix + '_topic_distrib_' + suffix)
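
The output names follow the pattern '<corpus>_<matrix>_<topic_num>[PLSA][_<corpus_percent>percent].csv'. Under hypothetical settings (lda_model is a stand-in instance):

# With topic_num=100, PLSA=False, corpus_percent=0.5, mimic_movie_wiki=0,
# this call writes to self.output_path:
#   MIMIC_alpha_100_0.5percent.csv
#   MIMIC_phi_100_0.5percent.csv
#   MIMIC_theta_100_0.5percent.csv
#   MIMIC_topic_distrib_100_0.5percent.csv
lda_model.save_phi_alpha_theta_topicdistrib()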
Example #6
def get_train_validation_test_seq(base_path, seq_max, vec_len, sgns_path, save_path, seq_not, train_perc=0.8, shuffle=False, save=False):
    training_x, training_y, testing_x, testing_y = generate_train_test(base_path, seq_max, vec_len, sgns_path, save_path, seq_not, train_perc, shuffle, save)
    training_size = int(training_x.shape[0] * 0.8)
    formal_training_x = training_x[:training_size]
    formal_training_y = training_y[:training_size]
    validation_x = training_x[training_size:]
    validation_y = training_y[training_size:]
    print(formal_training_x.shape)
    print(formal_training_y.shape)
    print(validation_x.shape)
    print(validation_y.shape)
    print(testing_x.shape)
    print(testing_y.shape)
    # for further extension
    embedding_append = 'lda_sgns500_window50'

    CsvUtility.write_array2csv(formal_training_x, save_path, 'formal_train_x_seq_' + embedding_append + '.csv')
    CsvUtility.write_array2csv(formal_training_y, save_path, 'formal_train_y_seq_' + embedding_append + '.csv')
    CsvUtility.write_array2csv(validation_x, save_path, 'formal_valid_x_seq_' + embedding_append + '.csv')
    CsvUtility.write_array2csv(validation_y, save_path, 'formal_valid_y_seq_' + embedding_append + '.csv')
    CsvUtility.write_array2csv(testing_x, save_path, 'formal_test_x_seq_' + embedding_append + '.csv')
    CsvUtility.write_array2csv(testing_y, save_path, 'formal_test_y_seq_' + embedding_append + '.csv')
    return training_x, training_y, validation_x, validation_y, testing_x, testing_y
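
A hypothetical call. With train_perc=0.8 the overall split is 64% formal training, 16% validation, and 20% test; note the function returns the full train+valid portion as its first two values, while the saved 'formal_train_*' CSVs hold only the formal training subset. shuffle=False assumes a permutation file saved by an earlier shuffled run:

# Placeholder paths; embedding_append is fixed inside the function.
train_valid_x, train_valid_y, valid_x, valid_y, test_x, test_y = \
    get_train_validation_test_seq(
        base_path='data/', seq_max=30, vec_len=200,
        sgns_path='embeddings/', save_path='output/',
        seq_not=True, train_perc=0.8, shuffle=False, save=False)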