def __read_phi_alpha_theta_byname(self, name):
    """Load the alpha and phi arrays for *name* from ``self.doc_path``.

    The file names encode the topic count plus optional ``_PLSA`` and
    corpus-percent suffixes.  Returns the tuple ``(alpha, phi)``.
    """
    suffix = "_PLSA" if self.PLSA else ""
    if self.corpus_percent != 1.0:
        suffix += "_" + str(self.corpus_percent) + "percent"
    tail = str(self.topic_num) + suffix + '.csv'
    alpha = CsvUtility.read_array_from_csv(self.doc_path, name + 'alpha_' + tail)
    phi = CsvUtility.read_array_from_csv(self.doc_path, name + 'phi_' + tail)
    # theta is deliberately not loaded here; the original call is kept for reference:
    # theta = CsvUtility.read_array_from_csv(self.output_path, name + 'theta_' + tail)
    return alpha, phi
def get_some_instance(file_path=MIMICP.mimic_data_path, seq_num=MIMICP.seq_num, num=10):
    """Return *num* randomly sampled (x, y) instances drawn from the union of
    the train / valid / test MIMIC sequence splits.

    Each x split is stored flat and is reshaped to (samples, seq_num, features).
    """
    splits = ('train', 'valid', 'test')
    xs = []
    for split in splits:
        part = CsvUtility.read_sparse_array_from_csv(
            file_path, 'sparse_formal_%s_x_seq.npz' % split)
        xs.append(part.reshape((part.shape[0], seq_num, -1)))
    ys = [CsvUtility.read_array_from_csv(file_path, 'formal_%s_y_seq.csv' % split)
          for split in splits]

    x_data = np.concatenate(xs, axis=0)
    y_data = np.concatenate(ys, axis=0)

    # Shuffle x and y with the same permutation, then take the first *num*.
    shuffle_idx = np.random.permutation(x_data.shape[0])
    return x_data[shuffle_idx][:num], y_data[shuffle_idx][:num]
def reload_mimic_seq(train_percent=MIMICP.train_percent, valid=False,
                     file_path=MIMICP.mimic_data_path, seq_num=MIMICP.seq_num):
    """Reload the sparse MIMIC sequence splits from disk.

    When *valid* is True the validation split is returned as the test set;
    otherwise train and valid are merged into one training set.  A
    *train_percent* below 0.8 truncates the training set to that fraction of
    the full dataset.  Returns ``(train_x, train_y, test_x, test_y)``.
    """
    def _load_x(split):
        # Each x split is stored flat; restore (samples, seq_num, features).
        arr = CsvUtility.read_sparse_array_from_csv(
            file_path, 'sparse_formal_%s_x_seq.npz' % split)
        return arr.reshape((arr.shape[0], seq_num, -1))

    train_x, valid_x, test_x = (_load_x(s) for s in ('train', 'valid', 'test'))
    train_y = CsvUtility.read_array_from_csv(file_path, 'formal_train_y_seq.csv')
    valid_y = CsvUtility.read_array_from_csv(file_path, 'formal_valid_y_seq.csv')
    test_y = CsvUtility.read_array_from_csv(file_path, 'formal_test_y_seq.csv')

    if valid:
        test_x, test_y = valid_x, valid_y
    else:
        train_x = np.concatenate((train_x, valid_x), axis=0)
        train_y = np.concatenate((train_y, valid_y), axis=0)

    if train_percent < 0.8:
        new_size = int((train_x.shape[0] + test_x.shape[0]) * train_percent)
        train_x, train_y = train_x[:new_size], train_y[:new_size]
    return train_x, train_y, test_x, test_y
def reload_mimic_embedding(train_percent=MIMICP.train_percent, valid=False,
                           file_path=MIMICP.mimic_data_path, seq_num=MIMICP.seq_num,
                           embedding_type=EMBEDP.embedding_type,
                           veclen=EMBEDP.veclen, window=EMBEDP.window):
    """Reload MIMIC sequence data encoded with a pre-trained embedding.

    For the combined types ``'sg_add_sgns'`` / ``'sg_cancat_sgns'`` the
    ``lda_sgns`` embedding is loaded first and then fused with the plain
    skip-gram embedding (element-wise add, or concatenation along the
    feature axis, respectively).  *valid*/*train_percent* behave as in
    ``reload_mimic_seq``.  Returns ``(train_x, train_y, test_x, test_y)``.
    """
    combined = embedding_type in ('sg_add_sgns', 'sg_cancat_sgns')

    def _embed_name(prefix):
        return prefix + str(veclen) + '_window' + str(window)

    def _load_x(split, embedding_name):
        # Each x split is stored flat; restore (samples, seq_num, features).
        arr = CsvUtility.read_array_from_csv(
            file_path, 'formal_%s_x_seq_%s.csv' % (split, embedding_name))
        return arr.reshape((arr.shape[0], seq_num, -1))

    base_name = _embed_name('lda_sgns' if combined else embedding_type)
    train_x = _load_x('train', base_name)
    valid_x = _load_x('valid', base_name)
    test_x = _load_x('test', base_name)
    train_y = CsvUtility.read_array_from_csv(file_path, 'formal_train_y_seq_' + base_name + '.csv')
    valid_y = CsvUtility.read_array_from_csv(file_path, 'formal_valid_y_seq_' + base_name + '.csv')
    test_y = CsvUtility.read_array_from_csv(file_path, 'formal_test_y_seq_' + base_name + '.csv')

    if combined:
        sg_name = _embed_name('embedding_skipgram')
        train_x_sg = _load_x('train', sg_name)
        valid_x_sg = _load_x('valid', sg_name)
        test_x_sg = _load_x('test', sg_name)
        if embedding_type == 'sg_add_sgns':
            # Fuse by element-wise sum of the two embedding spaces.
            train_x = train_x + train_x_sg
            valid_x = valid_x + valid_x_sg
            test_x = test_x + test_x_sg
        if embedding_type == 'sg_cancat_sgns':
            # Fuse by concatenating along the feature axis.
            train_x = np.concatenate((train_x, train_x_sg), axis=2)
            valid_x = np.concatenate((valid_x, valid_x_sg), axis=2)
            test_x = np.concatenate((test_x, test_x_sg), axis=2)

    if valid:
        test_x, test_y = valid_x, valid_y
    else:
        train_x = np.concatenate((train_x, valid_x), axis=0)
        train_y = np.concatenate((train_y, valid_y), axis=0)

    if train_percent < 0.8:
        new_size = int((train_x.shape[0] + test_x.shape[0]) * train_percent)
        train_x, train_y = train_x[:new_size], train_y[:new_size]
    return train_x, train_y, test_x, test_y
def read_topic_distrib(self):
    """Load the topic-distribution matrix for the active corpus.

    The corpus name is chosen by ``self.mimic_movie_wiki`` (0 -> MIMIC,
    1 -> MovieReview, anything else -> Wiki); the file name also encodes the
    topic count plus optional ``_PLSA`` and corpus-percent suffixes.
    """
    corpus_by_code = {0: "MIMIC", 1: "MovieReview"}
    name = corpus_by_code.get(self.mimic_movie_wiki, "Wiki")
    plsa = "_PLSA" if self.PLSA else ""
    percent = ("_" + str(self.corpus_percent) + "percent"
               if self.corpus_percent != 1.0 else "")
    return CsvUtility.read_array_from_csv(
        self.output_path,
        name + '_topic_distrib_' + str(self.topic_num) + plsa + percent + '.csv')
def cluster_neurons(
        neuron_labels,
        base_path="/home1/yk/experiments_TKDE/major_revision/",
        sita_file="#2020-11-21 03_33_04#_sita_mimic_mlp_ldareg_1layer_0.8.csv",
        cluster_num=10,
        label_num=80):
    """Spectrally cluster neurons by their sita vectors and tally labels.

    Parameters:
        neuron_labels: per-neuron iterable of label indices (indexed by the
            neuron's row position in the sita file).
        base_path / sita_file: location of the sita CSV to cluster.
        cluster_num: number of spectral clusters.
        label_num: number of distinct labels, i.e. the row count of the
            returned tally matrix.  Defaults to 80, which was previously a
            hard-coded constant.

    Returns:
        (label_cluster_matrix, cluster_assignments) where
        label_cluster_matrix[label][cluster] counts how often *label* appears
        among the neurons assigned to *cluster*.

    NOTE(review): random_state is drawn from random.randint(0, 10), so the
    clustering is not reproducible across calls — confirm this is intended.
    """
    sita = CsvUtility.read_array_from_csv(base_path, sita_file)
    sc = SpectralClustering(cluster_num, assign_labels='discretize',
                            random_state=random.randint(0, 10))
    sc.fit(sita)

    label_cluster_matrix = np.zeros((label_num, cluster_num))
    for neuron_idx, cluster in enumerate(sc.labels_):
        for label in neuron_labels[neuron_idx]:
            label_cluster_matrix[label][cluster] += 1
    return label_cluster_matrix, sc.labels_
def cal_common_label_ratio(
        neuron_labels,
        base_path="/home1/yk/experiments_TKDE/major_revision/",
        sita_file="#2020-11-21 03_33_04#_sita_mimic_mlp_ldareg_1layer_0.8.csv"
):
    """Measure label consistency among cosine-similar neurons.

    For each neuron, take its 4 most cosine-similar neurons (by sita vector)
    and count it as "consistent" when some label occurs at least 3 times
    among them.  Returns ``(consistent_count, consistent_ratio)``.
    """
    sita = CsvUtility.read_array_from_csv(base_path, sita_file)
    pairwise_sim = cosine_similarity(sita)

    consistent_neurons_num = 0
    for neuron_sims in pairwise_sim:
        nearest, _ = get_list_sort_index(neuron_sims, 4)
        # Tally how often each label occurs among the 4 nearest neurons.
        label_counts = {}
        for neighbor in nearest:
            for label in neuron_labels[neighbor]:
                label_counts[label] = label_counts.get(label, 0) + 1
        if any(count >= 3 for count in label_counts.values()):
            consistent_neurons_num += 1

    return consistent_neurons_num, consistent_neurons_num * 1.0 / len(neuron_labels)
def generate_train_test(base_path, seq_max, vec_len, sgns_path, save_path, seq_not,
                        train_valid_perc=0.8, shuffle=False, save=False):
    """Build (and optionally persist) shuffled train/test splits from the
    MIMIC sequence pickles.

    When *shuffle* is True a fresh permutation is drawn and written to disk;
    otherwise the previously saved permutation is reused so splits stay
    reproducible.  Returns ``(training_x, training_y, testing_x, testing_y)``.
    """
    feature_tensor, feature_count_tensor, result_tensor = get_mimic_sequence_data(
        data_pickle_path=base_path + 'after_instance_seq.pkl',
        word_dict_path=base_path + 'event_instance_dict_seq.pkl',
        predict_dict_path=base_path + 'predict_diags_dict_seq.pkl',
        seq_max=seq_max,
        vec_len=vec_len,
        sgns_path=sgns_path,
        save_path=save_path,
        save=False)
    feature_tensor = __get_aggregate_seq(feature_tensor, feature_count_tensor, seq_not)

    x = feature_tensor.reshape((feature_tensor.shape[0], -1))
    y = result_tensor.reshape((result_tensor.shape[0], -1))
    train_size = int(x.shape[0] * train_valid_perc)
    name_append = 'SGNS'  # kept as a hook for further extension

    if shuffle:
        # Draw a new permutation and persist it for reproducible reloads.
        idx = np.random.permutation(x.shape[0])
        CsvUtility.write_array2csv(idx, base_path, 'random_idx_seq_' + name_append + '.csv')
    else:
        idx = CsvUtility.read_array_from_csv(base_path, 'random_idx_seq_' + name_append + '.csv')

    x_shuffled, y_shuffled = x[idx], y[idx]
    training_x, testing_x = x_shuffled[:train_size], x_shuffled[train_size:]
    training_y, testing_y = y_shuffled[:train_size], y_shuffled[train_size:]

    if save:
        CsvUtility.write_array2csv(training_x, save_path, 'formal_train_valid_x_seq_' + name_append + '.csv')
        CsvUtility.write_array2csv(training_y, save_path, 'formal_train_valid_y_seq_' + name_append + '.csv')
        CsvUtility.write_array2csv(testing_x, save_path, 'formal_test_x_seq_' + name_append + '.csv')
        CsvUtility.write_array2csv(testing_y, save_path, 'formal_test_y_seq_' + name_append + '.csv')
    return training_x, training_y, testing_x, testing_y