def train_net():
    output, X, YY, keep_prob = model.model()

    def _onehot(labels):  # one-hot encoding
        return tf.one_hot(labels, depth=26, on_value=1.0, axis=2)

    Y = _onehot(YY)
    print(Y)
    # loss definition
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.reshape(Y, [-1, 26 * 4]), logits=output))
    # optimizer choice
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
    # optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(loss)

    predict = tf.reshape(output, [-1, 4, 26])
    max_idx_p = tf.argmax(predict, 2)
    max_idx_l = tf.argmax(tf.reshape(Y, [-1, 4, 26]), 2)
    correct_pred = tf.equal(max_idx_p, max_idx_l)
    # a sample counts as correct only if all 4 characters match
    accuracy = tf.reduce_mean(tf.reduce_min(tf.cast(correct_pred, tf.float32), axis=1))

    # read data
    images, labels = preprocess.read_data(['deal/0.txt', 'deal/1.txt', 'deal/2.txt'])
    total, width = labels.shape
    image_test, label_test = preprocess.read_data(['deal/test.txt'])

    tf.summary.scalar("loss", loss)
    tf.summary.scalar("accuracy", accuracy)
    merged = tf.summary.merge_all()
    saver = tf.train.Saver()

    # training loop
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # write training info to TensorBoard
        filewriter = tf.summary.FileWriter("logs/", graph=sess.graph)
        step = 0
        if os.listdir('./check_point'):
            ckpt = tf.train.latest_checkpoint('./check_point')
            # print(ckpt)
            # ckpt = './check_point/weight-3980'
            saver.restore(sess, ckpt)
            print('restore from the checkpoint: {0}'.format(ckpt))
        # images, labels = preprocess.read_data(['deal/0.txt', 'deal/1.txt', 'deal/2.txt'])
        while True:
            for i in range(int(total / batch_size)):
                start_index = (i * batch_size) % total
                image_batch = images[start_index: start_index + batch_size]
                label_batch = labels[start_index: start_index + batch_size]
                summary, _, loss_ = sess.run([merged, optimizer, loss],
                                             feed_dict={X: image_batch, YY: label_batch, keep_prob: 0.75})
                print(step, 'loss: %f' % loss_)
                filewriter.add_summary(summary, step)
                step += 1
                if step % 10 == 0:
                    acc = sess.run(accuracy, feed_dict={X: image_test, YY: label_test, keep_prob: 1.0})
                    print('Step %d, accuracy on the test set: %.2f' % (step, acc))
                    if acc > 0.4:
                        saver.save(sess, './check_point/weight', global_step=step)
def dt():
    start_time = time.time()
    data_frame, data_discrete_info, data_continuous_info = preprocess.read_data(
        train_filename, discrete_keys, continuous_keys)
    test_frame, _, __ = preprocess.read_data(test_filename, discrete_keys, continuous_keys)
    # attributes = discrete_keys + continuous_keys
    tree = decision_tree.DecisionTree(data_frame, discrete_keys + continuous_keys,
                                      data_discrete_info, data_continuous_info, 'y')
    tree.build()
    # tree.show_tree()
    error_rate = tree.inference(test_frame)
    end_time = time.time()
    print("Time cost:", end_time - start_time)
    return error_rate
def train_gmm():
    feature, label = read_data(config.DATA_DIR)
    feature, label = feature.reshape(feature.shape[0], -1).numpy(), label.numpy()
    train_feature, test_feature, train_label, test_label = train_test_split(feature, label)

    model_0 = GaussianMixture(n_components=3, max_iter=100,
                              weights_init=[1 / 3, 1 / 3, 1 / 3], random_state=42)
    model_1 = GaussianMixture(n_components=3, max_iter=100,
                              weights_init=[1 / 3, 1 / 3, 1 / 3], random_state=42)
    # model.means_init = numpy.array([train_feture[train_label == i].mean(axis=0)
    #                                 for i in range(2)])
    model_0.fit(train_feature[train_label == 0], train_label[train_label == 0])
    model_1.fit(train_feature[train_label == 1], train_label[train_label == 1])

    # pred = model.predict(test_feature)
    # for feat in test_feature:
    y_pred = []
    score_0 = model_0.score_samples(test_feature)
    score_1 = model_1.score_samples(test_feature)
    for i in range(len(score_1)):
        if score_0[i] > score_1[i]:
            y_pred.append(0)
        else:
            y_pred.append(1)
    # print(model_0.score_samples(test_feature), model_1.score_samples(test_feature))
    print(accuracy_score(test_label, y_pred))

    # recall and precision
    matrix = classification_report(test_label, y_pred)
    print("Classification report: \n", matrix)

    # Plot non-normalized confusion matrix
    np.set_printoptions(precision=2)
    con_matrix = confusion_matrix(test_label, y_pred)
    class_names = ["Non_cough", "Cough"]
    plt.figure()
    plot_confusion_matrix(con_matrix, classes=class_names,
                          title='Confusion matrix, without normalization')

    # Plot normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(con_matrix, classes=class_names, normalize=True,
                          title='Normalized confusion matrix')
    plt.show()

    pickle.dump(model_0, open("resource/non_cough.pkl", 'wb'))
    pickle.dump(model_1, open("resource/cough.pkl", 'wb'))
    print(model_1.weights_)
    print(model_0.weights_)
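# --- Hedged usage sketch (not part of the original source): how the two GMMs
# pickled by train_gmm() above might be loaded for inference, mirroring the
# score_samples comparison used during training. The pickle paths are the ones
# written by train_gmm(); the function name and `new_features` argument are
# hypothetical.
import pickle
import numpy as np

def classify_with_gmms(new_features,
                       non_cough_path="resource/non_cough.pkl",
                       cough_path="resource/cough.pkl"):
    model_0 = pickle.load(open(non_cough_path, "rb"))   # non-cough GMM
    model_1 = pickle.load(open(cough_path, "rb"))       # cough GMM
    score_0 = model_0.score_samples(new_features)       # log-likelihood under class 0
    score_1 = model_1.score_samples(new_features)       # log-likelihood under class 1
    return np.where(score_0 > score_1, 0, 1)            # pick the more likely class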
def get_train_data():
    tmp1, tmp2 = pp.read_data()
    S, A = pp.preprocess_data(tmp1, tmp2)
    S_train, A_train, _, _ = split_data(S, A)
    print("train size: ", len(S_train))
    # save_trainsplit_data(S_train, A_train)
    return S_train, A_train
def predict(model, message):
    """Return prediction: float

    Returns closing stock price for given row
    """
    train_df = read_data(TRAIN_DATA)
    dataset_test = pd.DataFrame(message, index=[0])
    dataset_test.columns = train_df.columns
    scaler = MinMaxScaler(feature_range=(0, 1))
    df = train_df.tail(300)
    # print('-----------------', df.tail(1))
    df_close = df['close']
    # print(df_close.head())
    # print(df_close.shape)
    df_close = scaler.fit_transform(np.array(df_close).reshape(-1, 1))
    X_test = np.array(df_close[len(df_close) - 300:, 0])
    temp_list = np.array(X_test).reshape(1, 300)
    # print(temp_list.shape)
    # print(X_test.shape)
    X_test = X_test.reshape(temp_list.shape[0], temp_list.shape[1], 1)
    predicted_stock_price = model.predict(X_test)
    predicted_stock_price = scaler.inverse_transform(predicted_stock_price)
    train_df = train_df.append(dataset_test, ignore_index=True)
    train_df.to_csv(TRAIN_DATA, index=False)
    return predicted_stock_price
def generator(wav_name_list, batch_size, sample_rate, peak_norm, voc_mode, bits, mu_law,
              wave_path, voc_pad, hop_length, voc_seq_len, preemphasis, n_fft, n_mels,
              win_length, max_db, ref_db, top_db):
    # A generator can only be consumed once, so wrap it in `while True`
    # to keep producing data across multiple epochs.
    while True:
        # Shuffle all data once per epoch.
        # order = np.random.choice(len(wav_name_list), len(wav_name_list), replace=False)
        # audio_data_path_list = [wav_name_list[i] for i in order]
        audio_data_path_list = wav_name_list
        batchs = len(wav_name_list) // batch_size
        for idx in range(batchs):
            # take the audio file names for this batch
            wav_name_list2 = audio_data_path_list[idx * batch_size:(idx + 1) * batch_size]
            # load the audio data
            input_mel, input_sig = read_data(wave_path, sample_rate, peak_norm, voc_mode,
                                             bits, mu_law, wav_name_list2, preemphasis,
                                             n_fft, n_mels, hop_length, win_length,
                                             max_db, ref_db, top_db)
            dataset = collate_vocoder(input_mel, input_sig, voc_seq_len, hop_length,
                                      voc_pad, voc_mode, bits)
            # input_mel = tf.convert_to_tensor(input_mel[0])
            # input_sig = tf.convert_to_tensor(input_sig[0])
            yield dataset
def main():
    print('Restoring map...')
    enc_map = cPickle.load(open(encode_map, 'rb'))
    dec_map = cPickle.load(open(decode_map, 'rb'))
    vocab_size = len(dec_map)

    print('Build Dataset...')
    lines = read_data(predict_file)
    question_list = parse_input_data_list(lines, enc_map, 50, False)

    print('Build Model...')
    model = MemNet(vocab_size=vocab_size, embed_size=512, n_hop=6,
                   memory_size=20, sentence_size=50, option_size=10)

    print('Build Solver...')
    solver = Solver(model, enc_map, dec_map, eval_batch_size=1,
                    test_record_path='./record/test/', test_examples=10000,
                    restore_path='./checkpoint/', print_step=5)

    answer = solver.predict(question_list)
    idx = [x for x in range(1, len(question_list) + 1)]

    import pandas as pd
    df = pd.DataFrame(data={'answer': answer})
    df.index += 1
    df.to_csv('predict.csv', index=True, index_label='id')
def train_svm():
    feature, label = read_data(config.DATA_DIR)
    train_feature, test_feature, train_label, test_label = train_test_split(feature, label)
    train_feature = train_feature.reshape(train_feature.shape[0], -1)
    test_feature = test_feature.reshape(test_feature.shape[0], -1)

    svm = SVC()
    svm.fit(train_feature.numpy(), train_label.numpy())
    y_pred = svm.predict(test_feature.numpy())
    # pickle.dump(svm, open("resource/gmm.pkl", 'wb'))
    print(accuracy_score(test_label.numpy(), y_pred))
    # print(confusion_matrix(test_label.numpy(), y_pred))
    print(y_pred)

    np.set_printoptions(precision=2)
    matrix = classification_report(test_label.numpy(), y_pred)
    print("Classification report: \n", matrix)

    # Plot non-normalized confusion matrix
    con_matrix = confusion_matrix(test_label.numpy(), y_pred)
    class_names = ["Non_cough", "Cough"]
    plt.figure()
    plot_confusion_matrix(con_matrix, classes=class_names,
                          title='Confusion matrix, without normalization')

    # Plot normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(con_matrix, classes=class_names, normalize=True,
                          title='Normalized confusion matrix')
    plt.show()
def test_read_data(self):
    df = read_data(self.data_dir, self.bodies_file, self.stances_file)
    self.assert_valid_df(df)
    self.assertIsInstance(df.iloc[0]['text_a'], str)
    self.assertIsInstance(df.iloc[0]['text_b'], str)
    self.assertIn(df.iloc[0]['labels'], CLASSES.keys())
def get_test_data():
    tmp1, tmp2 = pp.read_data()
    S, A = pp.preprocess_data(tmp1, tmp2)
    _, _, S_test, A_test = split_data(S, A)
    print("test size: ", len(S_test))
    # save_testsplit_data(S_test, A_test)
    return S_test, A_test
def test_split_data(self):
    df = read_data(self.data_dir, self.bodies_file, self.stances_file)
    train_data, dev_data, test_data = split_data(df)
    for data in train_data, dev_data, test_data:
        self.assert_valid_df(data)
    self.assertLess(len(train_data), 0.9 * len(df))
    self.assertLess(len(dev_data), 0.1 * len(df))
    self.assertAlmostEqual(len(test_data), 0.1 * len(df), delta=100)
def train(args):
    opt = json.load(open('models/config.json', 'r'))['rnet']
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    print('Reading data')
    dp = preprocess.read_data('train', opt)
    sess = tf.Session(config=config)
    it, enqueue_op = dp.provide(sess)

    rnet_model = model.RNet(opt)
    loss, pt, accu = rnet_model.build_model(it)
    avg_loss = tf.reduce_mean(loss)
    train_op = tf.train.AdadeltaOptimizer(1.0, rho=0.95, epsilon=1e-06).minimize(loss)

    # saving model
    saver = tf.train.Saver()
    startTime = time.time()
    with sess.as_default():
        sess.run(tf.global_variables_initializer())

        # start feeding threads
        coord = tf.train.Coordinator()
        threads = []
        for i in range(opt['num_threads']):
            t = Thread(target=feeder, args=(dp, sess, enqueue_op, coord, i, args.debug))
            t.start()
            threads.append(t)

        # start training
        for i in range(args.epochs):
            print('Training...{}th epoch'.format(i))
            training_time = int(dp.num_sample / dp.batch_size)
            for j in tqdm(range(training_time)):
                _, avg_loss_val, pt_val = sess.run([train_op, avg_loss, pt])
                if j % 100 == 0:
                    print('iter:{} - average loss:{}'.format(j, avg_loss_val))
            print('saving rnet_model{}.ckpt'.format(i))
            save_path = saver.save(sess, os.path.join(args.save_dir, 'rnet_model{}.ckpt'.format(i)))

        cancel_op = dp.q.close(cancel_pending_enqueues=True)
        sess.run(cancel_op)
        print('stopping feeders')
        coord.request_stop()
        coord.join(threads, ignore_live_threads=True)
        save_path = saver.save(sess, os.path.join(args.save_dir, 'rnet_model_final.ckpt'))
    sess.close()
    print('Training finished, took {} seconds'.format(time.time() - startTime))
def evaluate(args):
    opt = json.load(open('models/config.json', 'r'))['rnet']
    config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    saved_model = args.model_path
    EM = 0.0
    F1 = 0.0
    with sess.as_default():
        print('Reading data')
        dp = preprocess.read_data('dev', opt)
        it, enqueue_op = dp.provide(sess)
        rnet_model = model.RNet(opt)
        loss, pt, accu = rnet_model.build_model(it)
        dequeued_p, asi, aei = it['p'], it['asi'], it['aei']

        # restore model
        print('restoring model...')
        saver = tf.train.Saver()
        saver.restore(sess, saved_model)

        # start feeding threads
        coord = tf.train.Coordinator()
        threads = []
        for i in range(opt['num_threads']):
            t = Thread(target=feeder, args=(dp, sess, enqueue_op, coord, i, args.debug))
            t.start()
            threads.append(t)

        # start prediction
        print('Prediction starts')
        num_batch = int(dp.num_sample / dp.batch_size)
        for j in tqdm(range(num_batch)):
            pt_val, p_batch, asi_batch, aei_batch = sess.run([pt, dequeued_p, asi, aei])
            f1, em = 0.0, 0.0
            for k in range(len(p_batch)):
                paragraph = p_batch[k][0].decode('utf8').split(' ')
                true_start, true_end = asi_batch[k][0], aei_batch[k][0]
                pred_start, pred_end = pt_val[k][0], pt_val[k][1]
                pred_tokens = paragraph[pred_start:(pred_end + 1)]
                true_tokens = paragraph[true_start:(true_end + 1)]
                f1 += f1_score(' '.join(pred_tokens), ' '.join(true_tokens))
                em += exact_match_score(' '.join(pred_tokens), ' '.join(true_tokens))
            print('{}th batch | f1: {} | em: {}'.format(j, f1 / len(p_batch), em / len(p_batch)))
            F1 += f1
            EM += em
    print('Evaluation complete, F1 score: {}, EM score: {}'.format(F1 / dp.num_sample, EM / dp.num_sample))
def mlp_data():
    # Preprocess the data into a form the MLP can consume.
    # Shuffle / split the data.
    preprocess.cross_validation("train_set.csv", "train.csv", "test.csv", 0.1)
    # Read the data.
    train_data, discrete_values, continous_values = preprocess.read_data(
        "train_set.csv", discrete_keys, continuous_keys)
    # Map discrete values to consecutive integers.
    for key in discrete_values.keys():
        idx = 0
        for val in discrete_values[key]:
            train_data = train_data.replace(to_replace=val, value=idx)
            idx += 1  # fixed: the index was never incremented, so every value mapped to 0
    train_features = train_data[list(discrete_keys + continuous_keys)]
    train_ground_truth = train_data['y']
    return train_features, train_ground_truth
def bayes_data(test_percentage=0.1):
    # Preprocess the data into a form naive Bayes can consume.
    data, discrete_infos, continuous_infos = preprocess.read_data(
        'train_set.csv', discrete_keys, continuous_keys)
    # Shuffle the data.
    data_len = len(data)
    test_data_count = test_percentage * data_len
    data = shuffle(data)
    ground_truth = data['y']
    features = data[list(discrete_keys + continuous_keys)]
    # Convert the features to numeric values, then min-max normalize so they are non-negative.
    for discrete_key in discrete_keys:
        idx = 0
        for val in discrete_infos[discrete_key]:
            features = features.replace(to_replace=val, value=idx)
            idx += 1
    features = (features - features.min()) / (features.max() - features.min())
    return features, ground_truth
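# --- Hedged usage sketch (not part of the original source): one way the output
# of bayes_data() could be fed to a naive Bayes classifier. MultinomialNB is an
# assumption (chosen because the features are normalized to be non-negative);
# the original project may use its own naive Bayes implementation.
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def run_naive_bayes(test_percentage=0.1):
    features, ground_truth = bayes_data(test_percentage)
    X_train, X_test, y_train, y_test = train_test_split(
        features, ground_truth, test_size=test_percentage)
    clf = MultinomialNB()
    clf.fit(X_train, y_train)
    print("accuracy:", accuracy_score(y_test, clf.predict(X_test)))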
def train_lstm():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    feature, label = read_data(config.DATA_DIR)
    train_feature, test_feature, train_label, test_label = train_test_split(feature, label)

    train_data = MyDataset(train_feature, train_label)
    test_data = MyDataset(test_feature, test_label)  # fixed: was built from the training split
    train_data = DataLoader(train_data, batch_size=16, shuffle=True)
    test_data = DataLoader(test_data, batch_size=16, shuffle=False)

    model = LSTMClassifier().to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = Adam(params=model.parameters(), lr=0.001)

    for e in range(20):
        train_loss, train_acc = train(train_data, model, criterion, optimizer, device)
        test_loss, test_acc = test(test_data, model, criterion, device)
        print(train_loss, train_acc)
        print(test_loss, test_acc)
        print("----------------------------------------")
        torch.save(model, os.path.join(config.MODEL_DIR, "model_{}.pt".format(e + 1)))
def accuracy(true_labels, predictions):
    true_pred = 0
    for i in range(len(predictions)):
        if np.argmax(predictions[i]) == np.argmax(true_labels[i]):  # if 1 is in same index with ground truth
            true_pred += 1
    return true_pred / len(predictions)


if __name__ == "__main__":
    # PROCESS THE DATA
    words, labels = read_data(path)
    sentences = create_samples(words, labels)
    train_x, train_y, test_x, test_y = split_data(sentences)

    # creating one-hot vector notation of labels. (Labels are given numeric)
    # [0 1] is PERSON
    # [1 0] is not PERSON
    new_train_y = np.zeros(shape=(len(train_y), output_size))
    new_test_y = np.zeros(shape=(len(test_y), output_size))

    for i in range(len(train_y)):
        new_train_y[i][int(train_y[i])] = 1

    for i in range(len(test_y)):
        new_test_y[i][int(test_y[i])] = 1
""" Created on Wed Jun 24 22:46:47 2020 @author: ISIL """ from preprocess import read_data, read_TFIDF, create_vectors, get_vectors, split_data, convert_list_to_nd_array from TFIDF import compute_TFIDF from logistic_regression import logistic_reg from SVM import support_vector_machine path = "./data" if __name__ == "__main__": #PROCESS THE DATA sentences, labels = read_data(path) TFIDF = compute_TFIDF(sentences, path) #calculate TFIDF values word_list, TFIDF = read_TFIDF( path) #read unique words list and TFIDF values create_vectors(word_list, TFIDF, path) #vectorize data NROWS = len(TFIDF) NCOLS = len(word_list) #vectorized corpus (premise+hypothesis) data = get_vectors(path, NROWS, NCOLS) #get vectorized data y = convert_list_to_nd_array("y", labels) train_x, train_y, test_x, test_y = split_data( data, y, path) #split %80 for training %20 for test
                                         dec_x_lens)
        elif model.mode == "predict":
            _, loss = model.predict_step(enc_x, dec_x, dec_y, enc_x_lens, dec_x_lens)
        losses.append(loss)
        if (batch + 1) % 100 == 0:
            print("[{}] batch={:04d}, loss={:.4f}".format(
                datetime.datetime.now(), batch + 1, loss))
    avg_loss = np.mean(losses)
    return avg_loss


if __name__ == "__main__":
    time_start = time.time()
    vocabulary, vocabulary_reverse = load_vocab(args.data_path)
    train_examples, _ = read_data(args.train_file, args.max_utterance_len, args.max_example_len + 1)
    eval_examples, _ = read_data(args.eval_file, args.max_utterance_len, args.max_example_len + 1)

    if not os.path.exists(args.root_path):
        os.makedirs(args.root_path)

    tf.reset_default_graph()
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    session = tf.Session(config=tf_config)

    with tf.name_scope("Train"):
        with tf.variable_scope("Model", reuse=None):
            train_model = SEQ2SEQ(session, options, "train")
        with tf.variable_scope("Model", reuse=True):
            eval_model = SEQ2SEQ(session, options, "predict")
c_range = [round(0.1 * a, 1) for a in range(1, 10)] + [1] + list(range(10, 151, 10))
kernels = ['linear', 'rbf', 'sigmoid']  # poly takes too long, skip it for now

############################################################################################
########################^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#################################
########################     take line-trans as feature    #################################
########################^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#################################
############################################################################################
print('take LINE-TRANS as feature')
wiki_src = '/home/liruihan/Desktop/data/wiki_entropy/wiki_line_trans/'
lyric_src = '/home/liruihan/Desktop/data/uni_entropy_trans/'
wa_src = '/home/liruihan/Desktop/data/wa_entropy/wa_line_trans/'
cm_src = '/home/liruihan/Desktop/data/cm_entropy/cm_line_trans/'
aozora_src = '/home/liruihan/Desktop/data/aozora_entropy/aozora_line_trans/'

wiki_line_trans = [e.split('\n')[:-1] for e in pre.read_data(wiki_src)]
lyric_line_trans = [e.split('\n')[:-1] for e in pre.read_data(lyric_src, 'num_') if e != '0\n']
# wa_line_trans = [e.split('\n')[:-1] for e in pre.read_data(wa_src)]
# cm_line_trans = [e.split('\n')[:-1] for e in pre.read_data(cm_src)]
aozora_line_trans = [e.split('\n')[:-1] for e in pre.read_data(aozora_src)]

wiki_line_percentile_entropy = []
for trans in wiki_line_trans:
    wiki_line_percentile_entropy.append(percentile_entropy(trans))

'''
wa_line_percentile_entropy = []
for trans in wa_line_trans:
    wa_line_percentile_entropy.append(percentile_entropy(trans))
'''
def lsa(X, number_of_topics):
    reduced_u, reduced_sigma, reduced_v = reduce_dimension(X, number_of_topics)
    word_topic_matrix = np.dot(reduced_u, reduced_sigma)
    topic_doc_matrix = np.dot(reduced_sigma, reduced_v)
    app_X = np.dot(np.dot(reduced_u, reduced_sigma), reduced_v)
    return word_topic_matrix, topic_doc_matrix, app_X


if __name__ == '__main__':
    files = glob.glob('./text/*.txt')
    documents = []
    for f in files:
        documents.append(read_data(f))
    documents, words = pre_process(documents)
    X = word_doc_matrix(words, documents)
    word_topic_matrix, topic_doc_matrix, app_X = lsa(X, 5)
    print(word_topic_matrix)
    print(topic_doc_matrix)
    print(app_X)
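# --- Hedged sketch (not part of the original source): reduce_dimension() is
# called by lsa() above but not shown here. A minimal truncated-SVD version
# consistent with how its return values are used (u @ sigma, sigma @ v) might
# look like this; the project's real helper may differ.
import numpy as np

def reduce_dimension(X, number_of_topics):
    u, s, vt = np.linalg.svd(X, full_matrices=False)
    k = number_of_topics
    reduced_u = u[:, :k]              # words x topics
    reduced_sigma = np.diag(s[:k])    # topics x topics
    reduced_v = vt[:k, :]             # topics x documents
    return reduced_u, reduced_sigma, reduced_v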
model["result"] = np.mean(correct) print(f"Accuracy {n+1}/{len(models)}:\t{model['result']}") return models def save_accuracies(models): for model in models: results_savename = f"{config.gs_results_name}{model['ID']:04d}" with open(config.gs_directory + results_savename, "w+") as outfile: outfile.write(str(model["result"])) if __name__ == "__main__": t0 = time() data = preprocess.read_data() data = preprocess.one_hot(data) data = preprocess.reshape_4D(data) params = {"kernel_size" : config.gs_kernel_size, "activation_hid" : config.gs_activation_hid, "activation_out" : config.gs_activation_out, "layers" : config.gs_layers, "layers_out" : [data["layers_out"]], "learning_rate" : config.gs_learning_rate, "epochs" : config.gs_epochs, "batch_size" : config.gs_batch_size} msg = "Requires cmdline arg 'load' or 'save'" if len(sys.argv) == 2: if sys.argv[1].lower() == "load":
def launch_model():
    full_text = request.form['full_text']
    id_ = request.form['id']
    model_type = request.form['model_type']

    global BERT, JOINT, GRANU, MGN, NUM_TASK, MASKING, HIER
    BERT = model_type == BERT_PATH
    JOINT = model_type == JOINT_BERT_PATH
    GRANU = model_type == GRANU_BERT_PATH
    MGN = model_type == MGN_SIGM_BERT_PATH

    # either of the four variants:
    # BERT = False
    # JOINT = False
    # GRANU = False
    # MGN = True
    assert BERT or JOINT or GRANU or MGN
    assert not (BERT and JOINT) and not (BERT and GRANU) and not (BERT and MGN) \
        and not (JOINT and GRANU) and not (JOINT and MGN) and not (GRANU and MGN)

    # either of the two variants
    SIGMOID_ACTIVATION = True
    RELU_ACTIVATION = False
    assert not (SIGMOID_ACTIVATION and RELU_ACTIVATION) and (SIGMOID_ACTIVATION or RELU_ACTIVATION)

    if BERT:
        NUM_TASK = 1
        MASKING = 0
        HIER = 0
    elif JOINT:
        NUM_TASK = 2
        MASKING = 0
        HIER = 0
    elif GRANU:
        NUM_TASK = 2
        MASKING = 0
        HIER = 1
    elif MGN:
        NUM_TASK = 2
        MASKING = 1
        HIER = 0
    else:
        raise ValueError("You should choose one of bert, joint, granu and mgn in options")

    dct = {'NUM_TASK': NUM_TASK, 'MASKING': MASKING,
           'SIGMOID_ACTIVATION': SIGMOID_ACTIVATION, 'HIER': HIER}
    model = load_model(model_type, **dct)

    if not id_:
        ids = get_existent_ids()
        id_ = random_module.randint(0, N)
        while id_ in ids:
            id_ = random_module.randint(0, N)

    with open(DIRECTORY_PREDICT.joinpath(f'article{id_}.txt'), 'w', encoding='utf-8') as f:
        f.write(full_text)

    text = overwrite_one_article(id_, directory=DIRECTORY_PREDICT)

    my_predict_dataset = PropDataset(DIRECTORY_PREDICT, is_test=True)
    my_predict_iter = data.DataLoader(dataset=my_predict_dataset,
                                      batch_size=BATCH_SIZE,
                                      shuffle=False,
                                      num_workers=1,
                                      collate_fn=pad)

    tmp_file = 'tmp.txt'
    eval(model, my_predict_iter, tmp_file, criterion, binary_criterion, NUM_TASK=NUM_TASK)

    ids, texts = read_data(DIRECTORY_PREDICT, is_test=True)
    t_texts = clean_text(texts, ids)
    flat_texts = [sentence for article in t_texts for sentence in article]
    fi, prop_sents = convert(NUM_TASK - 1, flat_texts, tmp_file)
    prop_sents = prop_sents[id_]
    prop_sents = ['1' if elem else '' for elem in prop_sents]
    results = remove_duplicates(fi)

    DIRECTORY_PREDICT.joinpath(f'article{id_}.txt').rename(
        DIRECTORY_MARKUP.joinpath(f'article{id_}.txt'))

    lst = [set() for _ in range(len(full_text))]
    source_lst = [set() for _ in range(len(full_text))]
    for inner_lst in results:
        for i in range(inner_lst[-2], inner_lst[-1]):
            lst[i].add(HUMAN_READABLE_TECHNIQUES[TECHNIQUES.index(inner_lst[-3])])
            source_lst[i].add(inner_lst[-3])

    extracts_s_e = []
    extracts = []
    categories = []
    for elem in fi:
        if elem[0] != str(id_):
            continue
        _, category, start, end = elem
        extracts_s_e.append((start, end))
        extracts.append(text[start:end])
        categories.append(category)

    extracts = [' '.join(normalize(extract.strip())) for extract in extracts if extract]
    print(f'extracts: {extracts}')
    # CHECK
    # extracts = [word for sent in extracts for word in sent.split()]

    test_x, test_maxlen = get_data(extracts, vocab_size=args.vocab_size, maxlen=args.maxlen)
    test_x = sequence.pad_sequences(test_x, maxlen=max(train_maxlen, test_maxlen))
    test_length = test_x.shape[0]
    splits = []
    for i in range(1, test_length // args.batch_size):
        splits.append(args.batch_size * i)
    if test_length % args.batch_size:
        splits += [(test_length // args.batch_size) * args.batch_size]
    test_x = np.split(test_x, splits)

    with graph.as_default():
        aspect_model = keras_load_model(
            os.path.join('flask_app', 'output', 'reviews', 'model_param'),
            custom_objects={
                "Attention": Attention,
                "Average": Average,
                "WeightedSum": WeightedSum,
                "MaxMargin": MaxMargin,
                "WeightedAspectEmb": WeightedAspectEmb,
                "max_margin_loss": U.max_margin_loss
            },
            compile=True)

        test_fn = K.function(
            [aspect_model.get_layer('sentence_input').input, K.learning_phase()],
            [aspect_model.get_layer('att_weights').output, aspect_model.get_layer('p_t').output])

        aspect_probs = []
        for batch in tqdm(test_x):
            _, cur_aspect_probs = test_fn([batch, 0])
            aspect_probs.append(cur_aspect_probs)
        aspect_probs = np.concatenate(aspect_probs)

        label_ids = np.argsort(aspect_probs, axis=1)[:, -5:]
        for i, labels in enumerate(label_ids):
            print(f'{extracts[i]}: {[aspects[label] for label in labels][::-1]}')

    correct_lst = ['; '.join(list(elem)) for elem in lst]
    commands = {extract: ([aspects[label] for label in label_ids[i]][::-1], [])
                for i, extract in enumerate(extracts)}

    write_existent_dict(id_, source_lst, directory=DIRECTORY_MARKUP)

    for f in glob.glob(f'{DIRECTORY_PREDICT}/*'):
        os.remove(f)

    return jsonify(result={'id': id_,
                           'list': correct_lst,
                           'text': text,
                           'prop_sents': prop_sents,
                           'commands': commands})
def create_target_mapping(json_file_names, word_file_name, labels_file_name, word2id, vocabulary):
    """ Create the target sequence mappings.

    In order to map the target sequence to the original sequence, the tree ids
    from the json files are needed to know which words were kept and which were
    removed.

    Args:
        json_file_names: A list of json files that contain the data.
        word_file_name: The name of the text file containing the words.
        labels_file_name: The text file with the correct sentence compressions.
        word2id: The mapping from a word to an integer.
        vocabulary: The word vocabulary.

    Returns:
        target_seq_id: A list of sentences, where each of the words is mapped to
            an integer. It has the same dimensionality as word_seq_id. A zero
            indicates that the word has been removed in the compression.
    """
    target_seq_id = []
    target_seq = read_file(labels_file_name, to_lower=True, replace_int=True)
    word_seq = read_file(word_file_name, to_lower=True, replace_int=True)
    sent_index = 0

    # Iterate files
    for file_name in json_file_names:
        json_object = preprocess.read_data(file_name)
        # Iterate sentences in a file
        for sentence in json_object['sentences']:
            word_dict = preprocess.create_word_dict(sentence)
            # Skip ('ROOT ') entry at [0]
            sent_word_ids = list(word_dict.keys())[1:]
            compression_word_ids = get_compression_word_ids(sentence['compression_untransformed'])
            # Vocabulary only has preprocessed words, not the original words
            preprocessed_sent = word_seq[sent_index]
            target_sent_id = []
            # Append <bos> id
            target_sent_id.append(word2id[preprocessed_sent[0]])
            word_index = 1
            # Iterate words in a sentence.
            # The preprocessed sentence may be shorter than the original.
            # The preprocessed sentence contains <bos> and <eos>.
            # Iterate until one word before <eos>, which should be ('.'),
            # so reduce the length by 3.
            # Skip the last entry ('.') and add until the end.
            for word_id in sent_word_ids[:len(preprocessed_sent) - 3]:
                word = preprocessed_sent[word_index]
                if word_id in compression_word_ids:
                    if word in vocabulary:
                        target_sent_id.append(word2id[word])
                    else:
                        target_sent_id.append(len(vocabulary) + 1)
                else:
                    target_sent_id.append(0)
                word_index += 1
            # Append the ('.') id; it is in the word ids but not in the compressed ids.
            # It may not be a ('.'), so first check to make sure.
            if preprocessed_sent[-2] in vocabulary:
                target_sent_id.append(word2id[preprocessed_sent[-2]])
            else:
                target_sent_id.append(len(vocabulary) + 1)
            # Append <eos> id
            target_sent_id.append(word2id[preprocessed_sent[-1]])
            target_seq_id.append(target_sent_id)
            sent_index += 1
    return target_seq_id
    scores = np.zeros([1, 2])
    for x in list(enumerate(grid)):
        for y in list(enumerate(x[1])):
            d = y[1]
            if len(d) == 0:
                continue
            print('(%d, %d)' % (x[0] * 2 - 90, y[0] * 2 - 180))
            curmod = train_and_test(d[:, 2], d[:, 3])
            if curmod:
                models[x[0]][y[0]] = curmod
                scores = np.append(scores, [[curmod[1], curmod[2]]], axis=0)
    scores = np.delete(scores, 0, axis=0)
    print(' Done')
    print('')
    print('used last %d days of data for training' % days)
    print('models trained: %d' % len(scores))
    print('avg train set size per model: %f, min %d, max %d' %
          (np.mean(scores[:, 1]), np.min(scores[:, 1]), np.max(scores[:, 1])))
    print('total R-squared median: %f' % np.median(scores[:, 0]))


data = read_data()
train(data, days)

# data = data[data[:,0].argsort()]  # sort by latitude
# data = np.array_split(data, n_models, axis=0)
# metadata = np.asarray([(a.min(axis=0)[0],a.max(axis=0)[0]) for a in data])
# data = data[data[:,2].argsort()]  # sort by time
c_range = [round(0.1 * a, 1) for a in range(1, 10)] + [1] + list(range(10, 151, 10))
kernels = ['linear', 'rbf', 'sigmoid']  # poly takes too long, skip it for now

############################################################################################
########################^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#################################
########################     take line-trans as feature    #################################
########################^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#################################
############################################################################################
print('take LINE-TRANS as feature')
wiki_src = '/home/liruihan/Desktop/data/wiki_entropy/wiki_line_trans/'
lyric_src = '/home/liruihan/Desktop/data/uni_entropy_trans/'
wa_src = '/home/liruihan/Desktop/data/wa_entropy/wa_line_trans/'
cm_src = '/home/liruihan/Desktop/data/cm_entropy/cm_line_trans/'
aozora_src = '/home/liruihan/Desktop/data/aozora_entropy/aozora_line_trans/'

wiki_line_trans = [e.split('\n')[:-1] for e in pre.read_data(wiki_src)]
lyric_line_trans = [e.split('\n')[:-1] for e in pre.read_data(lyric_src, 'num_')]
wa_line_trans = [e.split('\n')[:-1] for e in pre.read_data(wa_src)]
cm_line_trans = [e.split('\n')[:-1] for e in pre.read_data(cm_src)]
aozora_line_trans = [e.split('\n')[:-1] for e in pre.read_data(aozora_src)]

wiki_line_percentile_entropy = []
for trans in wiki_line_trans:
    wiki_line_percentile_entropy.append(percentile_entropy(trans))

wa_line_percentile_entropy = []
for trans in wa_line_trans:
    wa_line_percentile_entropy.append(percentile_entropy(trans))

lyric_line_percentile_entropy = []
for trans in lyric_line_trans:
import pandas as pd
import pickle
from preprocess import read_data

# To add:
# - average length of tweets
# - # of tweets containing emotions in language
# - # of tweets containing emojis

data = pickle.load(open('data/data_df.pickle', 'rb'))
months = {2: 'february', 3: 'march', 4: 'april', 5: 'may', 6: 'june', 7: 'july'}

f = open('data/short_overview.txt', 'w')
orig_data = read_data('data/40wita')

total_after_clean = 0
total_before_clean = 0
for month in months.keys():
    tweets = len(data[data['month'] == month]['cleaned_text'])
    total_after_clean += tweets
    orig_tweets = len(orig_data[orig_data['month'] == month]['text'])
    total_before_clean += orig_tweets
    max_tweets = 0
    min_tweets = 2000000
    orig_days = len(sorted(orig_data[orig_data['month'] == month].day.unique()))
    days = sorted(data[data['month'] == month].day.unique())
    for day in days:
        twts = len(data[(data['month'] == month) & (data['day'] == day)]['cleaned_text'])
        if twts > max_tweets:
    return list(map(int, processed.keys()))


def update_processed(k, v, tested_file):
    if not os.path.isfile(tested_file):
        current = {}
    else:
        with open(tested_file) as f:
            current = json.load(f)
    current[k] = v
    with open(tested_file, "w") as f:
        json.dump(current, f)


if __name__ == "__main__":
    dataset, labels, filenames = read_data("data")

    # define some files that will be used as a support
    experiment = "experiment_name"  # experiment name (unique Id)
    tested_file = f"tested_{experiment}.json"  # this file will contain the results of the experiments
    confs_file = f"allconfs_{experiment}.pkl"  # this file contains a list of all parameter configurations to be tested

    if not os.path.isfile(confs_file):
        # if there is no configuration file containing a list of all
        # parameters to be tried, a new one is generated
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
        # performing both k-fold cross-validation and hold out
        folds = [("kfold", v) for v in list(skf.split(dataset, labels))] + \
                [("tt", train_test_split(list(range(len(labels))), train_size=300, shuffle=False))]
c_range = [round(0.1 * a, 1) for a in range(1, 10)] + [1] + list(range(10, 151, 10))
kernels = ['linear', 'rbf', 'sigmoid']  # poly takes too long, skip it for now

############################################################################################
########################^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#################################
########################  take acc-line-trans as feature   #################################
########################^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#################################
############################################################################################
wiki_acc_src = '/home/liruihan/Desktop/data/wiki_entropy/wiki_accumulated_line_trans/'
lyric_acc_src = '/home/liruihan/Desktop/data/uni_accumulated_entropy_trans/'
wa_acc_src = '/home/liruihan/Desktop/data/wa_entropy/wa_accumulated_line_trans/'
cm_acc_src = '/home/liruihan/Desktop/data/cm_entropy_20190115/cm_accumulated_line_trans/'
aozora_acc_src = '/home/liruihan/Desktop/data/aozora_entropy/aozora_accumulated_line_trans/'

wiki_acc_line_trans = [e.split('\n')[:-1] for e in pre.read_data(wiki_acc_src)]
lyric_acc_line_trans = [e.split('\n')[:-1] for e in pre.read_data(lyric_acc_src, 'num_') if e != '0\n']
wa_acc_line_trans = [e.split('\n')[:-1] for e in pre.read_data(wa_acc_src)]
cm_acc_line_trans = [e.split('\n')[:-1] for e in pre.read_data(cm_acc_src)]
aozora_acc_line_trans = [e.split('\n')[:-1] for e in pre.read_data(aozora_acc_src)]

wiki_percentile_entropy = []
for trans in wiki_acc_line_trans:
    wiki_percentile_entropy.append(percentile_entropy(trans))

wa_percentile_entropy = []
import numpy as np
from keras.models import Sequential
from keras.layers import BatchNormalization, Convolution1D, Dropout, Flatten, Dense, Convolution2D
from preprocess import read_data

# Loading the data
filename = "driving_log.csv"
# Y_train is the angle of the camera
X_train, y_train = read_data(filename, pre_process=True, flip=True, dropSmallValuesWithRate=50)

# My model
model = Sequential()
print("The shape of the model is: " + str(X_train[0].shape))


def train_model(X_train, y_train):
    if len(X_train[0].shape) == 2:
        print("Using two dimensional network")
        model.add(BatchNormalization(input_shape=(X_train[0].shape[0], X_train[0].shape[1])))
        model.add(Convolution1D(5, 5))
        model.add(Convolution1D(5, 5))
        model.add(Convolution1D(3, 3))
        model.add(Convolution1D(3, 3))
    else: