def get_mimic_sequence_data(data_pickle_path, word_dict_path, predict_dict_path, seq_max, vec_len,
                            sgns_path, save_path, save=False):
    all_events = CsvUtility.read_pickle(data_pickle_path, 'r')
    word_dict = CsvUtility.read_pickle(word_dict_path, 'r')
    predict_dict = CsvUtility.read_pickle(predict_dict_path, 'r')
    # pprint(all_events[0])
    print "word_dict:", len(word_dict), "predict_dict:", len(predict_dict), "all_events:", len(all_events)

    feature_dict = WordIndexMap(list(word_dict))
    pred_dict = WordIndexMap(list(predict_dict))

    filter_event = __filter_events(all_events=all_events)
    sgns_model = get_sgns_embedding('MIMIC', sgns_path)

    # For every patient: sum the SGNS embeddings of the events at each sequence step,
    # keep a per-step event count, and build the multi-hot prediction target.
    feature_tensor = np.zeros((len(filter_event), seq_max, vec_len))
    feature_count_tensor = np.zeros((len(filter_event), seq_max))
    result_tensor = np.zeros((len(filter_event), len(predict_dict)))
    find_nan = {}
    for i_iter, event_line in enumerate(filter_event):
        for seq_iter, sequence_item in enumerate(event_line[0]):
            for event_code in sequence_item:
                if event_code in sgns_model:
                    feature_tensor[i_iter][seq_iter] += sgns_model[event_code]
                    feature_count_tensor[i_iter][seq_iter] += 1
                else:
                    # count event codes that have no SGNS embedding
                    if event_code in find_nan:
                        find_nan[event_code] += 1
                    else:
                        find_nan[event_code] = 1
        for pred_item in event_line[1]:
            result_tensor[i_iter][pred_dict.get_index_by_word(pred_item)] = 1
        if i_iter % 1000 == 0:
            print 'complete {0} of {1}'.format(i_iter, len(filter_event))
    print 'words not in docs:', len(find_nan)

    if save:
        CsvUtility.write_dict2csv(feature_dict.get_word2index(), save_path,
                                  'feature2index_seq_embedding' + str(vec_len) + '.csv')
        CsvUtility.write_dict2csv(pred_dict.get_word2index(), save_path,
                                  'predict2index_seq_embedding' + str(vec_len) + '.csv')
        CsvUtility.write_array2csv(feature_tensor.reshape((feature_tensor.shape[0], -1)), save_path,
                                   'feature_matrix_seq_embedding' + str(vec_len) + '.csv')
        CsvUtility.write_array2csv(result_tensor.reshape((result_tensor.shape[0], -1)), save_path,
                                   'result_matrix_seq_embedding' + str(vec_len) + '.csv')
    return feature_tensor, feature_count_tensor, result_tensor
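# A minimal usage sketch (not part of the original pipeline): it assumes the same
# pickle/SGNS layout that generate_train_test() below uses; the paths, seq_max and
# vec_len values here are hypothetical placeholders.
def _example_build_sequence_tensors():
    base_path = '/path/to/mimic_preprocessed/'   # hypothetical location of the pickles
    sgns_path = '/path/to/sgns_vectors/'         # hypothetical location of the SGNS embeddings
    features, counts, targets = get_mimic_sequence_data(
        data_pickle_path=base_path + 'after_instance_seq.pkl',
        word_dict_path=base_path + 'event_instance_dict_seq.pkl',
        predict_dict_path=base_path + 'predict_diags_dict_seq.pkl',
        seq_max=30, vec_len=200,                 # assumed sequence length / embedding size
        sgns_path=sgns_path, save_path=base_path, save=False)
    # features: (patients, seq_max, vec_len), counts: (patients, seq_max),
    # targets: (patients, num_predicted_diagnoses) multi-hot matrix
    print features.shape, counts.shape, targets.shape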
def generate_train_test(base_path, seq_max, vec_len, sgns_path, save_path, seq_not,
                        train_valid_perc=0.8, shuffle=False, save=False):
    feature_tensor, feature_count_tensor, result_tensor = get_mimic_sequence_data(
        data_pickle_path=base_path + 'after_instance_seq.pkl',
        word_dict_path=base_path + 'event_instance_dict_seq.pkl',
        predict_dict_path=base_path + 'predict_diags_dict_seq.pkl',
        seq_max=seq_max, vec_len=vec_len, sgns_path=sgns_path, save_path=save_path, save=False)
    feature_tensor = __get_aggregate_seq(feature_tensor, feature_count_tensor, seq_not)

    x = feature_tensor.reshape((feature_tensor.shape[0], -1))
    y = result_tensor.reshape((result_tensor.shape[0], -1))
    train_size = int(x.shape[0] * train_valid_perc)

    # for further extension
    name_append = 'SGNS'

    # shuffle the samples, or reuse a previously saved permutation
    if shuffle:
        idx = np.random.permutation(x.shape[0])
        CsvUtility.write_array2csv(idx, base_path, 'random_idx_seq_' + name_append + '.csv')
    else:
        idx = CsvUtility.read_array_from_csv(base_path, 'random_idx_seq_' + name_append + '.csv')
    x_train = x[idx]
    y_train = y[idx]

    training_x = x_train[:train_size]
    training_y = y_train[:train_size]
    testing_x = x_train[train_size:]
    testing_y = y_train[train_size:]
    # print training_x.shape
    # print training_y.shape
    # print testing_x.shape
    # print testing_y.shape
    # print len(idx)

    if save:
        CsvUtility.write_array2csv(training_x, save_path, 'formal_train_valid_x_seq_' + name_append + '.csv')
        CsvUtility.write_array2csv(training_y, save_path, 'formal_train_valid_y_seq_' + name_append + '.csv')
        CsvUtility.write_array2csv(testing_x, save_path, 'formal_test_x_seq_' + name_append + '.csv')
        CsvUtility.write_array2csv(testing_y, save_path, 'formal_test_y_seq_' + name_append + '.csv')
    return training_x, training_y, testing_x, testing_y
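# Note on the shuffle flag (illustrative only; argument values below are hypothetical):
# the first call with shuffle=True writes random_idx_seq_SGNS.csv next to the pickles,
# and later calls with shuffle=False reload that permutation so train/test membership
# stays fixed across runs. For example:
#   generate_train_test(base, 30, 200, sgns, out, seq_not, shuffle=True,  save=True)   # create the split
#   generate_train_test(base, 30, 200, sgns, out, seq_not, shuffle=False, save=False)  # reuse the split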
def save_results(net, sita, result_epoch, time_epochs, run_p):
    save_path = run_p.save_path
    save_name = run_p.save_name
    time_code = '#' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + '#_'

    # collect the parameter objects actually used by this run configuration
    used_params = [run_p]
    if run_p.mimic0_movie1_wiki2 == 0:
        used_params.append(MIMICP)
    else:
        used_params.append(MOVIEP)
    if run_p.onehot0_embedding != 0:
        used_params.append(EMBEDP)
    if run_p.lm_lda_l2 == 0:
        used_params.append(LSTMP)
        used_params.append(LDAP)
        used_params.append(ldaregP)
    elif run_p.lm_lda_l2 == 1:
        used_params.append(LSTMP)
    elif run_p.lm_lda_l2 == 2:
        used_params.append(MLPP)
        used_params.append(LDAP)
        used_params.append(ldaregP)
    else:
        used_params.append(MLPP)
    for param_item in used_params:
        param_item.save_self(save_path, time_code + 'params_' + save_name + '.csv')

    # save the network
    torch.save(net, save_path + time_code + 'lstm_model_' + save_name + '.pkl')

    # save the topic distribution
    if sita.ndim > 1:
        CsvUtility.write_array2csv(sita, save_path, time_code + 'sita_' + save_name + '.csv')

    # save the metric results
    metric_result, aucs = __split_metrics(result_epoch)
    CsvUtility.write_list2csv(metric_result, save_path, time_code + 'metrics_' + save_name + '.csv')
    if len(aucs) > 0:
        CsvUtility.write_array2csv(aucs, save_path, time_code + 'aucs_' + save_name + '.csv')

    # save the time consumed per epoch
    CsvUtility.write_array2csv(time_epochs, save_path, time_code + 'time_epochs_' + save_name + '.csv')
def run():
    base_path = "/home1/yk/experiments_TKDE/major_revision/"
    # model_file = "#2020-11-21 03_33_04#_lstm_model_mimic_mlp_ldareg_1layer_0.8.pkl"
    # sita_file = "#2020-11-21 03_33_04#_sita_mimic_mlp_ldareg_1layer_0.8.csv"
    # time_prefix_list = ['#2020-11-29 15_28_25#', '#2020-11-29 15_26_23#', '#2020-11-29 15_26_31#', '#2020-11-29 15_28_09#', '#2020-11-29 15_30_14#']
    # topic_list = ['20', '50', '100', '200', '500']
    time_prefix_list = [
        '#2020-11-30 22_36_35#',
        '#2020-11-30 22_36_56#',
        '#2020-11-30 22_03_35#',
        '#2020-11-30 22_04_03#',
        '#2020-11-30 22_09_12#'
    ]
    topic_list = [20, 50, 100, 200, 500]
    model_file_name = '_lstm_model_mimic_mlp_ldareg_1layer_0.8_topic'
    sita_file_name = '_sita_mimic_mlp_ldareg_1layer_0.8_topic'

    entropy_matrix = []
    F1_matrix = []
    # common_label_ratio_matrix = []
    neuron_num = 10
    mean_f1 = 0.0
    # rerun the clustering evaluation until the mean F1 over the five cluster settings exceeds 0.599
    while mean_f1 <= 0.599:
        for i in [2]:
            model_file = time_prefix_list[i] + model_file_name + str(topic_list[i]) + '.pkl'
            sita_file = time_prefix_list[i] + sita_file_name + str(topic_list[i]) + '.csv'
            # model_file = "#2020-11-21 03_33_04#_lstm_model_mimic_mlp_ldareg_1layer_0.8.pkl"
            # sita_file = "#2020-11-21 03_33_04#_sita_mimic_mlp_ldareg_1layer_0.8.csv"
            r1, r2 = get_neron_labels(base_path=base_path, model_file=model_file, top_n_label=2,
                                      patient_num=100000, neuron_num=neuron_num, topic_num=topic_list[i])
            # enlarge neuron_num until all 128 neurons receive at least one label
            while len(r1) < 128:
                print("not all neurons covered!")
                neuron_num += 2
                r1, r2 = get_neron_labels(base_path=base_path, model_file=model_file, top_n_label=2,
                                          patient_num=100000, neuron_num=neuron_num, topic_num=topic_list[i])
            entropy_cluster = [topic_list[i]]
            f1_cluster = [topic_list[i]]
            common_label_ratio = [topic_list[i]]
            for tn in [2, 5, 10, 20, 30]:
                label_cluster_matrix, cluster_re = cluster_neurons(neuron_labels=r1, base_path=base_path,
                                                                   sita_file=sita_file, cluster_num=tn)
                cce = cal_class_entropy(label_cluster_matrix=label_cluster_matrix, neuron_num=128)
                cf = cal_F1(label_cluster_matrix=label_cluster_matrix, neuron_num=128, cluster_re=cluster_re)
                entropy_cluster.append(cce)
                f1_cluster.append(cf)
                mean_f1 += cf
            entropy_matrix.append(entropy_cluster)
            mean_f1 /= 5.0
            F1_matrix.append(f1_cluster)
            # common_count, common_ratio = cal_common_label_ratio(neuron_labels=r1, base_path=base_path, sita_file=sita_file)
            # common_label_ratio.append(common_count)
            # common_label_ratio.append(common_ratio)
            # common_label_ratio_matrix.append(common_label_ratio)

    time_code = '#' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + '#_'
    CsvUtility.write_array2csv(entropy_matrix, base_path,
                               time_code + "class_entropy_pa100000_topneu10_toplab2_tuneTopic100.csv")
    CsvUtility.write_array2csv(F1_matrix, base_path,
                               time_code + "f1_pa100000_topneu10_toplab2_tuneTopic100.csv")
def save_phi_alpha_theta_topicdistrib(self):
    plsa = ""
    percent = ""
    if self.PLSA:
        plsa = "PLSA"
    if self.corpus_percent != 1.0:
        percent = "_" + str(self.corpus_percent) + "percent"
    if self.mimic_movie_wiki == 0:
        # MIMIC corpus
        CsvUtility.write_array2csv(self.get_alpha(), self.output_path,
                                   'MIMIC_alpha_' + str(self.topic_num) + plsa + percent + '.csv')
        CsvUtility.write_array2csv(self.get_mimic_phi(MIMICP.feature_index_file), self.output_path,
                                   'MIMIC_phi_' + str(self.topic_num) + plsa + percent + '.csv')
        CsvUtility.write_array2csv(self.get_theta(), self.output_path,
                                   'MIMIC_theta_' + str(self.topic_num) + plsa + percent + '.csv')
        CsvUtility.write_array2csv(self.get_topic_distrib_of_word(), self.output_path,
                                   'MIMIC_topic_distrib_' + str(self.topic_num) + plsa + percent + '.csv')
    elif self.mimic_movie_wiki == 1:
        # movie-review corpus
        CsvUtility.write_array2csv(self.get_alpha(), self.output_path,
                                   'MovieReview_alpha_' + str(self.topic_num) + plsa + percent + '.csv')
        CsvUtility.write_array2csv(self.get_movie_phi(MOVIEP.feature_index_file), self.output_path,
                                   'MovieReview_phi_' + str(self.topic_num) + plsa + percent + '.csv')
        CsvUtility.write_array2csv(self.get_theta(), self.output_path,
                                   'MovieReview_theta_' + str(self.topic_num) + plsa + percent + '.csv')
        CsvUtility.write_array2csv(self.get_topic_distrib_of_word(), self.output_path,
                                   'MovieReview_topic_distrib_' + str(self.topic_num) + plsa + percent + '.csv')
    else:
        # Wiki corpus (reuses the movie phi getter and index file)
        CsvUtility.write_array2csv(self.get_alpha(), self.output_path,
                                   'Wiki_alpha_' + str(self.topic_num) + plsa + percent + '.csv')
        CsvUtility.write_array2csv(self.get_movie_phi(MOVIEP.feature_index_file), self.output_path,
                                   'Wiki_phi_' + str(self.topic_num) + plsa + percent + '.csv')
        CsvUtility.write_array2csv(self.get_theta(), self.output_path,
                                   'Wiki_theta_' + str(self.topic_num) + plsa + percent + '.csv')
        CsvUtility.write_array2csv(self.get_topic_distrib_of_word(), self.output_path,
                                   'Wiki_topic_distrib_' + str(self.topic_num) + plsa + percent + '.csv')
def get_train_validation_test_seq(base_path, seq_max, vec_len, sgns_path, save_path, seq_not,
                                  train_perc=0.8, shuffle=False, save=False):
    training_x, training_y, testing_x, testing_y = generate_train_test(base_path, seq_max, vec_len, sgns_path,
                                                                       save_path, seq_not, train_perc, shuffle, save)
    # hold out the last 20% of the training split as a validation set
    training_size = int(training_x.shape[0] * 0.8)
    formal_training_x = training_x[:training_size]
    formal_training_y = training_y[:training_size]
    validation_x = training_x[training_size:]
    validation_y = training_y[training_size:]
    print formal_training_x.shape
    print formal_training_y.shape
    print validation_x.shape
    print validation_y.shape
    print testing_x.shape
    print testing_y.shape

    # for further extension
    embedding_append = 'lda_sgns500_window50'
    CsvUtility.write_array2csv(formal_training_x, save_path, 'formal_train_x_seq_' + embedding_append + '.csv')
    CsvUtility.write_array2csv(formal_training_y, save_path, 'formal_train_y_seq_' + embedding_append + '.csv')
    CsvUtility.write_array2csv(validation_x, save_path, 'formal_valid_x_seq_' + embedding_append + '.csv')
    CsvUtility.write_array2csv(validation_y, save_path, 'formal_valid_y_seq_' + embedding_append + '.csv')
    CsvUtility.write_array2csv(testing_x, save_path, 'formal_test_x_seq_' + embedding_append + '.csv')
    CsvUtility.write_array2csv(testing_y, save_path, 'formal_test_y_seq_' + embedding_append + '.csv')
    # note: the returned training_x/training_y still contain the validation rows;
    # the formal_train CSVs written above exclude them
    return training_x, training_y, validation_x, validation_y, testing_x, testing_y
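# A minimal end-to-end usage sketch (added for illustration, not part of the original
# script): the paths, seq_max, vec_len and seq_not values are hypothetical placeholders;
# seq_not is simply forwarded to __get_aggregate_seq(), which defines its meaning.
def _example_split_sequences():
    base_path = '/path/to/mimic_preprocessed/'   # hypothetical
    sgns_path = '/path/to/sgns_vectors/'         # hypothetical
    save_path = '/path/to/output/'               # hypothetical
    train_x, train_y, valid_x, valid_y, test_x, test_y = get_train_validation_test_seq(
        base_path, seq_max=30, vec_len=200, sgns_path=sgns_path, save_path=save_path,
        seq_not=True, train_perc=0.8, shuffle=True, save=False)
    print train_x.shape, valid_x.shape, test_x.shape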