def get_corpus_contend_thread(process_index, file_list, word2index,
                              write_path="/home1/yk/wikipedia_dataset/filter",
                              word_kind_limit=50, remove_stopwords=False,
                              stem_words=True, remove_html=True):
    corpus_contend = []
    for file_iter, file_name in enumerate(file_list):
        tem_data = get_real_word_list(file_name, word2index, word_kind_limit,
                                      remove_stopwords, stem_words, remove_html)
        # print(file_name, 'read ready~', len(tem_data))
        corpus_contend.extend(tem_data)
        if (file_iter + 1) % 10 == 0:
            print((file_iter + 1), 'file done.')
        if (file_iter + 1) % 100 == 0:
            # flush the accumulated documents every 100 files
            name = process_index + "process_" + str(file_iter + 1) + "iter_text.csv"
            CsvUtility.write_norm_array2csv(corpus_contend, write_path, name)
            corpus_contend = []
    print(process_index, 'finish~')
    return corpus_contend
def __read_phi_alpha_theta_byname(self, name):
    plsa = ""
    percent = ""
    if self.PLSA:
        plsa = "_PLSA"
    if self.corpus_percent != 1.0:
        percent = "_" + str(self.corpus_percent) + "percent"
    alpha = CsvUtility.read_array_from_csv(
        self.doc_path, name + 'alpha_' + str(self.topic_num) + plsa + percent + '.csv')
    phi = CsvUtility.read_array_from_csv(
        self.doc_path, name + 'phi_' + str(self.topic_num) + plsa + percent + '.csv')
    # theta = CsvUtility.read_array_from_csv(self.output_path, name + 'theta_' + str(self.topic_num) + plsa + percent + '.csv')
    return alpha, phi
def __change_Movie_word_index__(self, gamma, word2index_pickle):
    feature_word2id = CsvUtility.read_pickle(word2index_pickle, 'r')
    print('feature size: ', len(feature_word2id))
    change_index_result = np.zeros((gamma.shape[0], len(feature_word2id)))
    for j in range(gamma.shape[1]):
        # map column j of gamma to the feature index used by the rest of the pipeline
        new_index = feature_word2id[self.dictionary[j]]
        for i in range(gamma.shape[0]):
            change_index_result[i][new_index] += gamma[i][j]
        if j % 1000 == 0:
            print(j, 'line')
    print('after changing the size of result: ', change_index_result.shape)
    return change_index_result
def reload_mimic_embedding(train_percent=MIMICP.train_percent, valid=False,
                           file_path=MIMICP.mimic_data_path, seq_num=MIMICP.seq_num,
                           embedding_type=EMBEDP.embedding_type, veclen=EMBEDP.veclen,
                           window=EMBEDP.window):
    if embedding_type in ('sg_add_sgns', 'sg_cancat_sgns'):
        embedding_name = 'lda_sgns' + str(veclen) + '_window' + str(window)
    else:
        embedding_name = embedding_type + str(veclen) + '_window' + str(window)
    train_x = CsvUtility.read_array_from_csv(file_path, 'formal_train_x_seq_' + embedding_name + '.csv')
    train_y = CsvUtility.read_array_from_csv(file_path, 'formal_train_y_seq_' + embedding_name + '.csv')
    valid_x = CsvUtility.read_array_from_csv(file_path, 'formal_valid_x_seq_' + embedding_name + '.csv')
    valid_y = CsvUtility.read_array_from_csv(file_path, 'formal_valid_y_seq_' + embedding_name + '.csv')
    test_x = CsvUtility.read_array_from_csv(file_path, 'formal_test_x_seq_' + embedding_name + '.csv')
    test_y = CsvUtility.read_array_from_csv(file_path, 'formal_test_y_seq_' + embedding_name + '.csv')
    train_x = train_x.reshape((train_x.shape[0], seq_num, -1))
    valid_x = valid_x.reshape((valid_x.shape[0], seq_num, -1))
    test_x = test_x.reshape((test_x.shape[0], seq_num, -1))
    if embedding_type in ('sg_add_sgns', 'sg_cancat_sgns'):
        # additionally load the skip-gram embeddings and combine them with the LDA-SGNS ones
        embedding_name = 'embedding_skipgram' + str(veclen) + '_window' + str(window)
        train_x_sg = CsvUtility.read_array_from_csv(file_path, 'formal_train_x_seq_' + embedding_name + '.csv')
        valid_x_sg = CsvUtility.read_array_from_csv(file_path, 'formal_valid_x_seq_' + embedding_name + '.csv')
        test_x_sg = CsvUtility.read_array_from_csv(file_path, 'formal_test_x_seq_' + embedding_name + '.csv')
        train_x_sg = train_x_sg.reshape((train_x_sg.shape[0], seq_num, -1))
        valid_x_sg = valid_x_sg.reshape((valid_x_sg.shape[0], seq_num, -1))
        test_x_sg = test_x_sg.reshape((test_x_sg.shape[0], seq_num, -1))
        if embedding_type == 'sg_add_sgns':
            train_x = train_x + train_x_sg
            valid_x = valid_x + valid_x_sg
            test_x = test_x + test_x_sg
        if embedding_type == 'sg_cancat_sgns':
            train_x = np.concatenate((train_x, train_x_sg), axis=2)
            valid_x = np.concatenate((valid_x, valid_x_sg), axis=2)
            test_x = np.concatenate((test_x, test_x_sg), axis=2)
    if valid:
        test_x = valid_x
        test_y = valid_y
    else:
        train_x = np.concatenate((train_x, valid_x), axis=0)
        train_y = np.concatenate((train_y, valid_y), axis=0)
    if train_percent < 0.8:
        new_training_size = int((train_x.shape[0] + test_x.shape[0]) * train_percent)
        train_x = train_x[:new_training_size]
        train_y = train_y[:new_training_size]
    return train_x, train_y, test_x, test_y
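# Minimal usage sketch (illustrative only): it assumes the formal_*_seq_*.csv files for the
# configured embedding_type already exist under MIMICP.mimic_data_path, e.g. as written by
# get_train_validation_test_seq() below. After the reshape, x arrays are
# (num_patients, seq_num, embedding_dim) -- twice the embedding width for 'sg_cancat_sgns' --
# and y arrays are (num_patients, num_labels).
def _demo_reload_mimic_embedding():
    train_x, train_y, test_x, test_y = reload_mimic_embedding(train_percent=0.8, valid=False)
    print(train_x.shape, train_y.shape, test_x.shape, test_y.shape)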
def save_results(net, sita, result_epoch, time_epochs, run_p):
    save_path = run_p.save_path
    save_name = run_p.save_name
    time_code = '#' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + '#_'
    used_params = [run_p]
    if run_p.mimic0_movie1_wiki2 == 0:
        used_params.append(MIMICP)
    else:
        used_params.append(MOVIEP)
    if run_p.onehot0_embedding != 0:
        used_params.append(EMBEDP)
    if run_p.lm_lda_l2 == 0:
        used_params.append(LSTMP)
        used_params.append(LDAP)
        used_params.append(ldaregP)
    elif run_p.lm_lda_l2 == 1:
        used_params.append(LSTMP)
    elif run_p.lm_lda_l2 == 2:
        used_params.append(MLPP)
        used_params.append(LDAP)
        used_params.append(ldaregP)
    else:
        used_params.append(MLPP)
    for param_item in used_params:
        param_item.save_self(save_path, time_code + 'params_' + save_name + '.csv')
    # save net
    torch.save(net, save_path + time_code + 'lstm_model_' + save_name + '.pkl')
    # save topic distribution
    if sita.ndim > 1:
        CsvUtility.write_array2csv(sita, save_path, time_code + 'sita_' + save_name + '.csv')
    # save results
    metric_result, aucs = __split_metrics(result_epoch)
    CsvUtility.write_list2csv(metric_result, save_path, time_code + 'metrics_' + save_name + '.csv')
    if len(aucs) > 0:
        CsvUtility.write_array2csv(aucs, save_path, time_code + 'aucs_' + save_name + '.csv')
    # save time consuming
    CsvUtility.write_array2csv(time_epochs, save_path, time_code + 'time_epochs_' + save_name + '.csv')
def _load_and_process_metadata(sentence_dir, movie_review_path, num_processor=8):
    # Extract the filenames.
    sentence_filenames = glob.glob(os.path.join(sentence_dir, "*/*"))
    print(len(sentence_filenames))
    # print(sentence_filenames[-10:])
    # sentence_filenames = sentence_filenames[-20:]
    word2index = CsvUtility.read_pickle(movie_review_path + '/new_word2index.pkl', 'r')
    index2word = CsvUtility.read_pickle(movie_review_path + '/new_index2word.pkl', 'r')
    # Break the files into num_processor batches.
    spacing = np.linspace(0, len(sentence_filenames), num_processor + 1).astype(int)
    ranges = []
    for i in range(len(spacing) - 1):
        ranges.append([spacing[i], spacing[i + 1]])
    p = Pool(num_processor)
    res = []
    for i in range(num_processor):
        start = ranges[i][0]
        end = ranges[i][1]
        res.append(p.apply_async(get_corpus_contend_thread,
                                 args=(str(i), sentence_filenames[start:end], word2index)))
        print(str(i) + ' processor started !')
    # get_corpus_contend(sentence_filenames, word2index)
    p.close()
    p.join()
def read_topic_distrib(self):
    if self.mimic_movie_wiki == 0:
        name = "MIMIC"
    elif self.mimic_movie_wiki == 1:
        name = "MovieReview"
    else:
        name = "Wiki"
    plsa = ""
    if self.PLSA:
        plsa = "_PLSA"
    percent = ""
    if self.corpus_percent != 1.0:
        percent = "_" + str(self.corpus_percent) + "percent"
    topic_distrib = CsvUtility.read_array_from_csv(
        self.output_path,
        name + '_topic_distrib_' + str(self.topic_num) + plsa + percent + '.csv')
    return topic_distrib
def cluster_neurons(neuron_labels,
                    base_path="/home1/yk/experiments_TKDE/major_revision/",
                    sita_file="#2020-11-21 03_33_04#_sita_mimic_mlp_ldareg_1layer_0.8.csv",
                    cluster_num=10):
    sita = CsvUtility.read_array_from_csv(base_path, sita_file)
    # print(sita[:3])
    # print(sita.shape)
    sc = SpectralClustering(cluster_num, assign_labels='discretize',
                            random_state=random.randint(0, 10))
    sc.fit(sita)
    # print(sc.labels_)
    # rows indexed by label id, columns by cluster id
    label_cluster_matrix = np.zeros((80, cluster_num))
    for i, cluster in enumerate(sc.labels_):
        neuron_i_labels = neuron_labels[i]
        for nil in neuron_i_labels:
            label_cluster_matrix[nil][cluster] += 1
    return label_cluster_matrix, sc.labels_
def cal_common_label_ratio(neuron_labels,
                           base_path="/home1/yk/experiments_TKDE/major_revision/",
                           sita_file="#2020-11-21 03_33_04#_sita_mimic_mlp_ldareg_1layer_0.8.csv"):
    sita = CsvUtility.read_array_from_csv(base_path, sita_file)
    sita_cos_sim = cosine_similarity(sita)
    consistent_neurons_num = 0
    for neuron_i, neuron_sims in enumerate(sita_cos_sim):
        # indices of the 4 neurons most similar to neuron_i
        top_k, top_v = get_list_sort_index(neuron_sims, 4)
        top_k_label_count = {}
        for top_k_i in top_k:
            k_i_labels = neuron_labels[top_k_i]
            for l in k_i_labels:
                top_k_label_count[l] = top_k_label_count.setdefault(l, 0) + 1
        # a neuron counts as consistent if some label is shared by at least 3 of its top-4 neighbours
        for label_count_val in top_k_label_count.values():
            if label_count_val >= 3:
                consistent_neurons_num += 1
                break
    return consistent_neurons_num, consistent_neurons_num * 1.0 / len(neuron_labels)
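# Minimal usage sketch (illustrative only): `neuron_labels` is assumed to be the first return
# value of get_neron_labels() (see run() below), i.e. a mapping from each hidden neuron to its
# top label ids, and `sita_file` is a placeholder for one of the saved sita CSVs under the
# default experiment directory.
def _demo_neuron_consistency(neuron_labels, sita_file):
    label_cluster_matrix, cluster_labels = cluster_neurons(neuron_labels=neuron_labels,
                                                           sita_file=sita_file, cluster_num=10)
    common_count, common_ratio = cal_common_label_ratio(neuron_labels=neuron_labels,
                                                        sita_file=sita_file)
    print(label_cluster_matrix.shape, common_count, common_ratio)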
def get_some_instance(file_path=MIMICP.mimic_data_path, seq_num=MIMICP.seq_num, num=10):
    train_x = CsvUtility.read_sparse_array_from_csv(file_path, 'sparse_formal_train_x_seq.npz')
    train_x = train_x.reshape((train_x.shape[0], seq_num, -1))
    valid_x = CsvUtility.read_sparse_array_from_csv(file_path, 'sparse_formal_valid_x_seq.npz')
    valid_x = valid_x.reshape((valid_x.shape[0], seq_num, -1))
    test_x = CsvUtility.read_sparse_array_from_csv(file_path, 'sparse_formal_test_x_seq.npz')
    test_x = test_x.reshape((test_x.shape[0], seq_num, -1))
    train_y = CsvUtility.read_array_from_csv(file_path, 'formal_train_y_seq.csv')
    valid_y = CsvUtility.read_array_from_csv(file_path, 'formal_valid_y_seq.csv')
    test_y = CsvUtility.read_array_from_csv(file_path, 'formal_test_y_seq.csv')
    x_data = np.concatenate((train_x, valid_x, test_x), axis=0)
    y_data = np.concatenate((train_y, valid_y, test_y), axis=0)
    # shuffle and return the first `num` instances
    idx = np.random.permutation(x_data.shape[0])
    x_data = x_data[idx]
    y_data = y_data[idx]
    return x_data[:num], y_data[:num]
def reload_mimic_seq(train_percent=MIMICP.train_percent, valid=False,
                     file_path=MIMICP.mimic_data_path, seq_num=MIMICP.seq_num):
    train_x = CsvUtility.read_sparse_array_from_csv(file_path, 'sparse_formal_train_x_seq.npz')
    train_x = train_x.reshape((train_x.shape[0], seq_num, -1))
    valid_x = CsvUtility.read_sparse_array_from_csv(file_path, 'sparse_formal_valid_x_seq.npz')
    valid_x = valid_x.reshape((valid_x.shape[0], seq_num, -1))
    test_x = CsvUtility.read_sparse_array_from_csv(file_path, 'sparse_formal_test_x_seq.npz')
    test_x = test_x.reshape((test_x.shape[0], seq_num, -1))
    train_y = CsvUtility.read_array_from_csv(file_path, 'formal_train_y_seq.csv')
    valid_y = CsvUtility.read_array_from_csv(file_path, 'formal_valid_y_seq.csv')
    test_y = CsvUtility.read_array_from_csv(file_path, 'formal_test_y_seq.csv')
    if valid:
        test_x = valid_x
        test_y = valid_y
    else:
        train_x = np.concatenate((train_x, valid_x), axis=0)
        train_y = np.concatenate((train_y, valid_y), axis=0)
    if train_percent < 0.8:
        new_training_size = int((train_x.shape[0] + test_x.shape[0]) * train_percent)
        train_x = train_x[:new_training_size]
        train_y = train_y[:new_training_size]
    return train_x, train_y, test_x, test_y
def save_phi_alpha_theta_topicdistrib(self):
    plsa = ""
    percent = ""
    if self.PLSA:
        # use the same "_PLSA" suffix as __read_phi_alpha_theta_byname / read_topic_distrib
        plsa = "_PLSA"
    if self.corpus_percent != 1.0:
        percent = "_" + str(self.corpus_percent) + "percent"
    if self.mimic_movie_wiki == 0:
        name = 'MIMIC'
        phi = self.get_mimic_phi(MIMICP.feature_index_file)
    elif self.mimic_movie_wiki == 1:
        name = 'MovieReview'
        phi = self.get_movie_phi(MOVIEP.feature_index_file)
    else:
        name = 'Wiki'
        phi = self.get_movie_phi(MOVIEP.feature_index_file)
    suffix = str(self.topic_num) + plsa + percent + '.csv'
    CsvUtility.write_array2csv(self.get_alpha(), self.output_path, name + '_alpha_' + suffix)
    CsvUtility.write_array2csv(phi, self.output_path, name + '_phi_' + suffix)
    CsvUtility.write_array2csv(self.get_theta(), self.output_path, name + '_theta_' + suffix)
    CsvUtility.write_array2csv(self.get_topic_distrib_of_word(), self.output_path,
                               name + '_topic_distrib_' + suffix)
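# Illustrative sketch only: `lda_model` stands for an instance of the class that defines
# save_phi_alpha_theta_topicdistrib() and read_topic_distrib(). With the shared
# "_PLSA"/percent suffix, the topic-distribution file written by the first call
# (e.g. 'MIMIC_topic_distrib_100_PLSA.csv') is exactly the file the second call reads back.
def _demo_save_and_reload_topic_distrib(lda_model):
    lda_model.save_phi_alpha_theta_topicdistrib()
    return lda_model.read_topic_distrib()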
def get_mimic_sequence_data(data_pickle_path, word_dict_path, predict_dict_path, seq_max,
                            vec_len, sgns_path, save_path, save=False):
    all_events = CsvUtility.read_pickle(data_pickle_path, 'r')
    word_dict = CsvUtility.read_pickle(word_dict_path, 'r')
    predict_dict = CsvUtility.read_pickle(predict_dict_path, 'r')
    # pprint(all_events[0])
    print("word_dict:", len(word_dict), "predict_dict:", len(predict_dict), "all_events:", len(all_events))
    feature_dict = WordIndexMap(list(word_dict))
    pred_dict = WordIndexMap(list(predict_dict))
    filter_event = __filter_events(all_events=all_events)
    sgns_model = get_sgns_embedding('MIMIC', sgns_path)
    feature_tensor = np.zeros((len(filter_event), seq_max, vec_len))
    feature_count_tensor = np.zeros((len(filter_event), seq_max))
    result_tensor = np.zeros((len(filter_event), len(predict_dict)))
    find_nan = {}
    for i_iter, event_line in enumerate(filter_event):
        # event_line[0]: the input event sequences; event_line[1]: the target items to predict
        for seq_iter, sequence_item in enumerate(event_line[0]):
            for event_code in sequence_item:
                if event_code in sgns_model:
                    feature_tensor[i_iter][seq_iter] += sgns_model[event_code]
                    feature_count_tensor[i_iter][seq_iter] += 1
                else:
                    if event_code in find_nan:
                        find_nan[event_code] += 1
                    else:
                        find_nan[event_code] = 1
        for pred_item in event_line[1]:
            result_tensor[i_iter][pred_dict.get_index_by_word(pred_item)] = 1
        if i_iter % 1000 == 0:
            print('complete {0} of {1}'.format(i_iter, len(filter_event)))
    print('words not in docs:', len(find_nan))
    if save:
        CsvUtility.write_dict2csv(feature_dict.get_word2index(), save_path,
                                  'feature2index_seq_embedding' + str(vec_len) + '.csv')
        CsvUtility.write_dict2csv(pred_dict.get_word2index(), save_path,
                                  'predict2index_seq_embedding' + str(vec_len) + '.csv')
        CsvUtility.write_array2csv(feature_tensor.reshape((feature_tensor.shape[0], -1)), save_path,
                                   'feature_matrix_seq_embedding' + str(vec_len) + '.csv')
        CsvUtility.write_array2csv(result_tensor.reshape((result_tensor.shape[0], -1)), save_path,
                                   'result_matrix_seq_embedding' + str(vec_len) + '.csv')
    return feature_tensor, feature_count_tensor, result_tensor
# filter_contend = {}
# filter_index = 0
# for i in res:
#     for a in i.get():
#         filter_contend[str(filter_index)] = ' '.join(a)
#         filter_index += 1
# CsvUtility.write_dict2csv(filter_contend, sentence_dir, 'selected_movie_review_docs4LDA.csv')


def get_filter_data(path):
    get_con = []
    for process_index in range(1):
        for file_iter in range(6):
            name = str(process_index) + "process_" + str(file_iter + 1) + "00iter_text.csv"
            content = CsvUtility.read_norm_array_csv(path, name)
            # print(len(content))
            get_con.extend(content)
    print(" content number : ", len(get_con))
    return get_con[:100000]
    # print(content[0])


if __name__ == '__main__':
    # _load_and_process_metadata("/home1/yk/wikipedia_dataset/text", "/home1/yk/Movie_Review_data", num_processor=20)
    contend = get_filter_data("/home1/yk/wikipedia_dataset/filter")
    name = "wiki_text.csv"
    CsvUtility.write_norm_array2csv(contend, "/home1/yk/wikipedia_dataset/filter", name)
    pass
def generate_train_test(base_path, seq_max, vec_len, sgns_path, save_path, seq_not,
                        train_valid_perc=0.8, shuffle=False, save=False):
    feature_tensor, feature_count_tensor, result_tensor = get_mimic_sequence_data(
        data_pickle_path=base_path + 'after_instance_seq.pkl',
        word_dict_path=base_path + 'event_instance_dict_seq.pkl',
        predict_dict_path=base_path + 'predict_diags_dict_seq.pkl',
        seq_max=seq_max,
        vec_len=vec_len,
        sgns_path=sgns_path,
        save_path=save_path,
        save=False)
    feature_tensor = __get_aggregate_seq(feature_tensor, feature_count_tensor, seq_not)
    x = feature_tensor.reshape((feature_tensor.shape[0], -1))
    y = result_tensor.reshape((result_tensor.shape[0], -1))
    train_size = int(x.shape[0] * train_valid_perc)
    # for further extension
    name_append = 'SGNS'
    # shuffle the train set
    if shuffle:
        idx = np.random.permutation(x.shape[0])
        CsvUtility.write_array2csv(idx, base_path, 'random_idx_seq_' + name_append + '.csv')
    else:
        idx = CsvUtility.read_array_from_csv(base_path, 'random_idx_seq_' + name_append + '.csv')
    x_train = x[idx]
    y_train = y[idx]
    training_x = x_train[:train_size]
    training_y = y_train[:train_size]
    testing_x = x_train[train_size:]
    testing_y = y_train[train_size:]
    # print training_x.shape
    # print training_y.shape
    # print testing_x.shape
    # print testing_y.shape
    # print len(idx)
    if save:
        CsvUtility.write_array2csv(training_x, save_path, 'formal_train_valid_x_seq_' + name_append + '.csv')
        CsvUtility.write_array2csv(training_y, save_path, 'formal_train_valid_y_seq_' + name_append + '.csv')
        CsvUtility.write_array2csv(testing_x, save_path, 'formal_test_x_seq_' + name_append + '.csv')
        CsvUtility.write_array2csv(testing_y, save_path, 'formal_test_y_seq_' + name_append + '.csv')
    return training_x, training_y, testing_x, testing_y
def run():
    base_path = "/home1/yk/experiments_TKDE/major_revision/"
    # model_file = "#2020-11-21 03_33_04#_lstm_model_mimic_mlp_ldareg_1layer_0.8.pkl"
    # sita_file = "#2020-11-21 03_33_04#_sita_mimic_mlp_ldareg_1layer_0.8.csv"
    # time_prefix_list = ['#2020-11-29 15_28_25#', '#2020-11-29 15_26_23#', '#2020-11-29 15_26_31#', '#2020-11-29 15_28_09#', '#2020-11-29 15_30_14#']
    # topic_list = ['20', '50', '100', '200', '500']
    time_prefix_list = [
        '#2020-11-30 22_36_35#', '#2020-11-30 22_36_56#', '#2020-11-30 22_03_35#',
        '#2020-11-30 22_04_03#', '#2020-11-30 22_09_12#'
    ]
    topic_list = [20, 50, 100, 200, 500]
    model_file_name = '_lstm_model_mimic_mlp_ldareg_1layer_0.8_topic'
    sita_file_name = '_sita_mimic_mlp_ldareg_1layer_0.8_topic'
    entropy_matrix = []
    F1_matrix = []
    # common_label_ratio_matrix = []
    neuron_num = 10
    mean_f1 = 0.0
    # repeat until the mean F1 over the five cluster settings exceeds 0.599
    while mean_f1 <= 0.599:
        for i in [2]:
            model_file = time_prefix_list[i] + model_file_name + str(topic_list[i]) + '.pkl'
            sita_file = time_prefix_list[i] + sita_file_name + str(topic_list[i]) + '.csv'
            # model_file = "#2020-11-21 03_33_04#_lstm_model_mimic_mlp_ldareg_1layer_0.8.pkl"
            # sita_file = "#2020-11-21 03_33_04#_sita_mimic_mlp_ldareg_1layer_0.8.csv"
            r1, r2 = get_neron_labels(base_path=base_path, model_file=model_file,
                                      top_n_label=2, patient_num=100000,
                                      neuron_num=neuron_num, topic_num=topic_list[i])
            # enlarge neuron_num until every one of the 128 hidden neurons gets a label
            while len(r1) < 128:
                print("not cover all neurons !")
                neuron_num += 2
                r1, r2 = get_neron_labels(base_path=base_path, model_file=model_file,
                                          top_n_label=2, patient_num=100000,
                                          neuron_num=neuron_num, topic_num=topic_list[i])
            entropy_cluster = [topic_list[i]]
            f1_cluster = [topic_list[i]]
            common_label_ratio = [topic_list[i]]
            for tn in [2, 5, 10, 20, 30]:
                label_cluster_matrix, cluster_re = cluster_neurons(
                    neuron_labels=r1, base_path=base_path, sita_file=sita_file, cluster_num=tn)
                cce = cal_class_entropy(label_cluster_matrix=label_cluster_matrix, neuron_num=128)
                cf = cal_F1(label_cluster_matrix=label_cluster_matrix, neuron_num=128,
                            cluster_re=cluster_re)
                entropy_cluster.append(cce)
                f1_cluster.append(cf)
                mean_f1 += cf
            entropy_matrix.append(entropy_cluster)
            mean_f1 /= 5.0
            F1_matrix.append(f1_cluster)
            # common_count, common_ratio = cal_common_label_ratio(neuron_labels=r1, base_path=base_path, sita_file=sita_file)
            # common_label_ratio.append(common_count)
            # common_label_ratio.append(common_ratio)
            # common_label_ratio_matrix.append(common_label_ratio)
    time_code = '#' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + '#_'
    CsvUtility.write_array2csv(entropy_matrix, base_path,
                               time_code + "class_entropy_pa100000_topneu10_toplab2_tuneTopic100.csv")
    CsvUtility.write_array2csv(F1_matrix, base_path,
                               time_code + "f1_pa100000_topneu10_toplab2_tuneTopic100.csv")
def get_train_validation_test_seq(base_path, seq_max, vec_len, sgns_path, save_path, seq_not,
                                  train_perc=0.8, shuffle=False, save=False):
    training_x, training_y, testing_x, testing_y = generate_train_test(
        base_path, seq_max, vec_len, sgns_path, save_path, seq_not, train_perc, shuffle, save)
    training_size = int(training_x.shape[0] * 0.8)
    formal_training_x = training_x[:training_size]
    formal_training_y = training_y[:training_size]
    validation_x = training_x[training_size:]
    validation_y = training_y[training_size:]
    print(formal_training_x.shape)
    print(formal_training_y.shape)
    print(validation_x.shape)
    print(validation_y.shape)
    print(testing_x.shape)
    print(testing_y.shape)
    # for further extension
    embedding_append = 'lda_sgns500_window50'
    CsvUtility.write_array2csv(formal_training_x, save_path, 'formal_train_x_seq_' + embedding_append + '.csv')
    CsvUtility.write_array2csv(formal_training_y, save_path, 'formal_train_y_seq_' + embedding_append + '.csv')
    CsvUtility.write_array2csv(validation_x, save_path, 'formal_valid_x_seq_' + embedding_append + '.csv')
    CsvUtility.write_array2csv(validation_y, save_path, 'formal_valid_y_seq_' + embedding_append + '.csv')
    CsvUtility.write_array2csv(testing_x, save_path, 'formal_test_x_seq_' + embedding_append + '.csv')
    CsvUtility.write_array2csv(testing_y, save_path, 'formal_test_y_seq_' + embedding_append + '.csv')
    return training_x, training_y, validation_x, validation_y, testing_x, testing_y
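# Minimal end-to-end sketch (illustrative only; paths and argument values are placeholders):
# build the SGNS-embedded MIMIC sequence data, split it into train/valid/test CSVs, then reload
# it for training. Two assumptions worth noting: generate_train_test() only creates
# random_idx_seq_SGNS.csv when shuffle=True, so the first run over a new dataset should pass
# shuffle=True; and reload_mimic_embedding() only finds the files written above when EMBEDP
# produces the same name suffix as embedding_append here ('lda_sgns500_window50').
def _demo_build_and_reload_mimic_seq(base_path, sgns_path, save_path):
    get_train_validation_test_seq(base_path=base_path, seq_max=MIMICP.seq_num, vec_len=500,
                                  sgns_path=sgns_path, save_path=save_path, seq_not=True,
                                  train_perc=0.8, shuffle=True, save=True)
    return reload_mimic_embedding(train_percent=0.8, valid=False, file_path=save_path)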