def test_model():
    vocab, embeddings = data_helper.load_embeddings(config.get('data', 'embedding_file'))
    model = RNNModel(embeddings, num_classes=5)
    model.load(config.get('data', 'model_dir'))
    test_data = data_helper.load_data(os.path.join(config.get('data', 'treebank_dir'), 'test.txt'))
    numeric_test_samples = data_helper.convert_to_numeric_samples(test_data, vocab, num_classes=5)
    model.eval(numeric_test_samples)


def train_model():
    vocab, embeddings = data_helper.load_embeddings(config.get('data', 'embedding_file'))
    train_data = data_helper.load_data(os.path.join(config.get('data', 'treebank_dir'), 'train.txt'))
    numeric_train_samples = data_helper.convert_to_numeric_samples(train_data, vocab, num_classes=5)
    model = RNNModel(embeddings, num_classes=5, model_config=config['model'])
    dev_data = data_helper.load_data(os.path.join(config.get('data', 'treebank_dir'), 'dev.txt'))
    numeric_dev_samples = data_helper.convert_to_numeric_samples(dev_data, vocab, num_classes=5)
    eval_func = lambda: model.eval(numeric_dev_samples)
    model.train(numeric_train_samples, eval_func)
    model.save(config.get('data', 'model_dir'))
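# The two functions above read every path from a ConfigParser-style `config` object
# with a [data] section (embedding_file, treebank_dir, model_dir) and a [model]
# section. A minimal sketch of how such a config could be built; the individual
# values and keys in [model] are assumptions for illustration, not part of the
# original project.
import configparser

config = configparser.ConfigParser()
config.read_dict({
    'data': {
        'embedding_file': 'data/glove.6B.300d.txt',   # hypothetical path
        'treebank_dir': 'data/trees',                 # expected to hold train.txt/dev.txt/test.txt
        'model_dir': 'runs/rnn_sentiment',            # where model.save()/model.load() point
    },
    'model': {
        'hidden_size': '168',                         # hypothetical hyperparameters
        'learning_rate': '0.01',
    },
})
# config.get('data', 'embedding_file') and config['model'] then behave exactly as used above.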
def load_and_train_abstract():
    label2idx = json.load(open('../data/journal2idx.json', 'r'))
    label_num = len(label2idx) + 1
    embedding_path = '../data/glove.42B.300d.50K.w2v.txt'
    embeddings, vocab, embedding_size = load_embeddings(embedding_path, 100000)
    X_train, y_train, y_train_r, X_val, y_val, y_val_r, X_test, y_test, y_test_r = load_abstract_to_label(
        '../data/dataset_abstract_stat_50.npy', embeddings, vocab)
    model = models.baseline_abstract_cnn_model(embeddings, len(vocab), embedding_size, label_num)
    train_and_val('../models/abstract_cnn_baseline', model, X_train, y_train, X_val, y_val)


def load_and_train_ref_abs(has_rank=False):
    label2idx = json.load(open('../data/journal2idx.json', 'r'))
    label_num = len(label2idx) + 1
    embedding_path = '../data/glove.42B.300d.50K.w2v.txt'
    embeddings, vocab, embedding_size = load_embeddings(embedding_path, 100000)
    X_train, y_train, y_train_r, X_val, y_val, y_val_r, X_test, y_test, y_test_r = load_abstract_to_label(
        '../data/dataset_abstract_stat_50.npy', embeddings, vocab)
    journal2idx_all = json.load(open('../data/journal2idx_all.json', 'r'))
    X_train_ref, _, _, X_val_ref, _, y_val_r_ref, X_test_ref, _, _ = load_ref_chain_to_label(
        '../data/dataset_ref_chain_stat_50.npy')
    model = models.reference_abstract_model(
        embeddings,
        word_vocab_size=len(vocab),
        word_embedding_dim=embedding_size,
        ref_vocab_size=len(journal2idx_all),
        ref_embedding_dim=embedding_size,
        label_num=label_num,
        has_rank=has_rank)
    if has_rank:
        y_train = [np.array(y_train), np.array(y_train_r)]
        y_val = [np.array(y_val), np.array(y_val_r)]
        train_and_val(
            '../models/ref_abs_cnn_with_rank', model,
            [np.array(X_train), np.array(X_train_ref)], y_train,
            [np.array(X_val), np.array(X_val_ref)], y_val)
    else:
        train_and_val(
            '../models/ref_abs_cnn_baseline', model,
            [np.array(X_train), np.array(X_train_ref)], y_train,
            [np.array(X_val), np.array(X_val_ref)], y_val)
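# When has_rank=True above, y_train/y_val become two-element lists because the model
# has two output heads (journal label + rank). A generic Keras sketch of why the
# targets are passed as a list; the layer choices, names and loss weights here are
# illustrative only and are not the project's reference_abstract_model:
from tensorflow.keras import layers, models as keras_models

def two_head_sketch(vocab_size, embedding_dim, label_num, seq_len=200):
    inp = layers.Input(shape=(seq_len,), dtype='int32')
    h = layers.Embedding(vocab_size, embedding_dim)(inp)
    h = layers.Conv1D(128, 5, activation='relu')(h)
    h = layers.GlobalMaxPooling1D()(h)
    label_out = layers.Dense(label_num, activation='softmax', name='label')(h)
    rank_out = layers.Dense(1, name='rank')(h)   # regression head for the rank signal
    model = keras_models.Model(inp, [label_out, rank_out])
    # One loss per output; fit() then expects y as [y_label, y_rank], which is
    # exactly the shape train_and_val() receives above when has_rank is set.
    model.compile(optimizer='adam',
                  loss=['categorical_crossentropy', 'mse'],
                  loss_weights=[1.0, 0.2])
    return model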
def train_cnn_rnn(input_file, training_config):
    epochs = 10
    x_, y_, vocabulary, vocabulary_inv, df, labels = data_helper.load_data(input_file)
    params = json.loads(open(training_config).read())
    # Assign a 300-dimensional vector to each word
    word_embeddings = data_helper.load_embeddings(vocabulary)
    embedding_mat = [word_embeddings[word] for index, word in enumerate(vocabulary_inv)]
    embedding_mat = np.array(embedding_mat, dtype=np.float32)
    # Split the original dataset into train set and test set
    x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1, random_state=16)
    # Split the train set into train set and dev set
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1, random_state=16)
    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(len(y_train), len(y_dev), len(y_test)))
    # Create a directory; everything related to the training will be saved in it
    timestamp = str(int(time.time()))
    trained_dir = './trained_results_' + timestamp + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(
                embedding_mat=embedding_mat,
                sequence_length=x_train.shape[1],
                num_classes=y_train.shape[1],
                non_static=params['non_static'],
                hidden_unit=params['hidden_unit'],
                max_pool_size=params['max_pool_size'],
                filter_sizes=list(map(int, params['filter_sizes'].split(","))),
                num_filters=params['num_filters'],
                embedding_size=params['embedding_dim'],
                l2_reg_lambda=params['l2_reg_lambda'])

            global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # Checkpoint files will be saved in this directory during training
            checkpoint_dir = './checkpoints_' + timestamp + '/'
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
                # index of the first pad id (0) = true length; ceil-divide by the pool size
                return [np.ceil(np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                        for batch in batches]

            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: params['dropout_keep_prob'],
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy], feed_dict)

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                step, loss, accuracy, num_correct, predictions = sess.run(
                    [global_step, cnn_rnn.loss, cnn_rnn.accuracy, cnn_rnn.num_correct, cnn_rnn.predictions],
                    feed_dict)
                return accuracy, loss, num_correct, predictions

            saver = tf.train.Saver(tf.global_variables())
            sess.run(tf.global_variables_initializer())

            # Training starts here. Train the model with x_train and y_train.
            # batch_iter returns a one-shot generator, so a fresh iterator is built for
            # every outer epoch; otherwise only the first pass would see any batches.
            best_accuracy, best_at_step = 0, 0
            for epoch in range(epochs):
                train_batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                                       params['batch_size'], params['num_epochs'])
                for train_batch in train_batches:
                    x_train_batch, y_train_batch = zip(*train_batch)
                    train_step(x_train_batch, y_train_batch)
                    current_step = tf.train.global_step(sess, global_step)
                    # Evaluate the model with x_dev and y_dev
                    if current_step % params['evaluate_every'] == 0:
                        dev_batches = data_helper.batch_iter(list(zip(x_dev, y_dev)), params['batch_size'], 1)
                        total_dev_correct = 0
                        for dev_batch in dev_batches:
                            x_dev_batch, y_dev_batch = zip(*dev_batch)
                            acc, loss, num_dev_correct, predictions = dev_step(x_dev_batch, y_dev_batch)
                            total_dev_correct += num_dev_correct
                        accuracy = float(total_dev_correct) / len(y_dev)
                        logging.info('Accuracy on dev set: {}'.format(accuracy))
                        if accuracy >= best_accuracy:
                            best_accuracy, best_at_step = accuracy, current_step
                            path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                            logging.critical('Saved model {} at step {}'.format(path, best_at_step))
                            logging.critical('Best accuracy {} at step {}'.format(best_accuracy, best_at_step))

            logging.critical('Training is complete, testing the best model on x_test and y_test')
            # Save the model files to trained_dir. predict.py needs trained model files.
            saver.save(sess, trained_dir + "best_model.ckpt")

            # Evaluate x_test and y_test with the best checkpoint
            saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
            acc, loss, num_test_correct, predictions = dev_step(x_test, y_test)

            from sklearn.metrics import recall_score, f1_score, accuracy_score
            y_test = [np.argmax(y_t) for y_t in y_test]
            print(sorted(list(set(y_test))))
            recall_l = recall_score(y_test, predictions, average=None)
            f1_l = f1_score(y_test, predictions, average=None)
            acc_score = accuracy_score(y_test, predictions)
            total_test_correct = int(num_test_correct)
            logging.critical('Recall on test set: ' + str(recall_l))
            logging.critical('Acc on test set: ' + str(acc_score))
            logging.critical('F1 on test set: ' + str(f1_l))
            logging.critical('Accuracy on test set: {}'.format(float(total_test_correct) / len(y_test)))
            print(len(labels))
            print(len(recall_l))
            print(len(f1_l))
            labels_ = [labels[n] for n in sorted(list(set(y_test)))]
            df_ = pd.DataFrame()
            df_["labels"] = labels_
            df_["recall"] = recall_l
            df_["f1"] = f1_l
            df_.to_csv("metrics.csv", index=False)

            # Save trained parameters and files since predict.py needs them
            with open(trained_dir + 'words_index.json', 'w') as outfile:
                json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
            with open(trained_dir + 'embeddings.pickle', 'wb') as outfile:
                pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
            with open(trained_dir + 'labels.json', 'w') as outfile:
                json.dump(labels, outfile, indent=4, ensure_ascii=False)
            params['sequence_length'] = x_train.shape[1]
            with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
                json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)
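# real_len() in the scripts above recovers, for every padded row of word ids, how many
# time steps survive the CNN max-pooling: the first occurrence of the pad id (assumed
# to be 0) gives the true sentence length, which is then ceil-divided by max_pool_size.
# A small standalone illustration with toy values (max_pool_size=4 mirrors the
# training_config used in these scripts; list() only makes the toy lists concatenate):
import numpy as np

def real_len_demo(batches, max_pool_size=4):
    # index of the first 0 == number of non-pad tokens; ceil-divide by the pool size
    return [np.ceil(np.argmin(list(batch) + [0]) * 1.0 / max_pool_size) for batch in batches]

toy_batch = [
    [4, 8, 15, 16, 23, 42, 0, 0, 0, 0],   # 6 real tokens -> ceil(6/4) = 2
    [7, 7, 7, 0, 0, 0, 0, 0, 0, 0],       # 3 real tokens -> ceil(3/4) = 1
]
print(real_len_demo(toy_batch))  # [2.0, 1.0]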
def main(args): argc = len(args) data_dir = args[1] #train/reviews train_text = args[2] # 'paper' # paper, review, all model_name = args[3] #rnn cnn dan label = int(args[4]) (x_train, y_train, x_dev, y_dev, x_test, y_test),\ vocab, vocab_inv, label_scale, aspects = \ prepare_data( data_dir, max_vocab_size = 35000, max_len_paper = 1000, max_len_review = 200) # choose only given aspect as label among different aspects if label >= 0: aspects = [aspects[label]] print('Labels:', aspects) # extract only data of interest x_train,y_train,evaluate_mean_train,evaluate_major_train,mean_aspects_train,major_aspects_train = \ choose_label(x_train, y_train, size = label_scale, label=label) x_dev,y_dev,evaluate_mean_dev,evaluate_major_dev,_,_ = \ choose_label(x_dev, y_dev, size = label_scale, label=label) x_test,y_test,evaluate_mean_test,evaluate_major_test,_,_ = \ choose_label(x_test, y_test, size = label_scale, label=label) # get mean/major from train on test evaluate_mean = [] evaluate_major = [] for aid, y_aspect in enumerate(y_test.T): mean_aspect = mean_aspects_train[aid] major_aspect = major_aspects_train[aid] evaluate_mean_aspect = evaluate(y_aspect, [mean_aspect] * len(y_aspect)) evaluate_major_aspect = evaluate(y_aspect, [major_aspect] * len(y_aspect)) evaluate_mean.append(evaluate_mean_aspect) evaluate_major.append(evaluate_major_aspect) print('Majority (Test)') for mean, major, a in zip(evaluate_mean, evaluate_major, aspects): print('\t%15s\t%.4f\t%.4f' % (a, mean, major)) print('\t%15s\t%.4f\t%.4f' % ('TOTAL', np.average(evaluate_mean), np.average(evaluate_major))) # choose train text if train_text == 'paper': x_train = x_train[0] x_dev = x_dev[0] x_test = x_test[0] elif train_text == 'review': x_train = x_train[1] x_dev = x_dev[1] x_test = x_test[1] elif train_text == 'all': x_train = np.concatenate(x_train, axis=1) x_dev = np.concatenate(x_dev, axis=1) x_test = np.concatenate(x_test, axis=1) else: print('Wrong') sys.exit(1) max_len = x_train.shape[1] print('x_train: {}, x_dev: {}, x_test: {}'.format(len(x_train), len(x_dev), len(x_test))) print('y_train: {}, y_dev: {}, y_test: {}'.format(len(y_train), len(y_dev), len(y_test))) timestamp = str(int(time.time())) trained_dir = './trained_results/' + timestamp + '/' if os.path.exists(trained_dir): shutil.rmtree(trained_dir) os.makedirs(trained_dir) model, config = models(model_name) config.seq_length = max_len config.vocab_size = len(vocab) config.num_classes = len(aspects) #load embedding or None embedding_mat = load_embeddings( vocab, load="/data/word2vec/glove.840B.300d.w2v.bin") #None # loading a model model = model(config, embedding=embedding_mat) def feed_data(x_batch, y_batch, keep_prob): feed_dict = { model.input_x: x_batch, model.input_y: y_batch, model.keep_prob: keep_prob } return feed_dict session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) sess.run(tf.global_variables_initializer()) saver = tf.train.Saver(tf.global_variables()) if embedding_mat is not None: sess.run([model.embedding_init], feed_dict={model.embedding_placeholder: embedding_mat}) # Checkpoint files will be saved in this directory during training checkpoint_dir = './ckpts/' + timestamp + '/' if os.path.exists(checkpoint_dir): shutil.rmtree(checkpoint_dir) os.makedirs(checkpoint_dir) checkpoint_prefix = os.path.join(checkpoint_dir, 'model') ############################## # Training starts here ############################## best_loss = np.inf best_at_step = 0 for epoch in range(config.num_epochs): 
train_batches = batch_iter(list(zip(x_train, y_train)), config.batch_size) train_losses = [] for train_batch in train_batches: x_train_batch, y_train_batch = list(zip(*train_batch)) feed_dict = feed_data(x_train_batch, y_train_batch, config.dropout_keep_prob) current_step, train_loss, _ = sess.run( [model.global_step, model.loss, model._train_op], feed_dict) train_losses.append(train_loss) if current_step % config.print_per_batch == 0: print('[%d/%d] %.4f' % (epoch, current_step, np.average(train_losses))) # evaluateuate the model with x_dev and y_dev if current_step % config.save_per_batch == 0: dev_batches = batch_iter(list(zip(x_dev, y_dev)), config.batch_size) dev_losses = [] aspect_all_ys = {i: [] for i in range(len(aspects))} aspect_all_ys_ = {i: [] for i in range(len(aspects))} for dev_batch in dev_batches: x_dev_batch, y_dev_batch = list(zip(*dev_batch)) feed_dict = feed_data(x_dev_batch, y_dev_batch, 1.0) dev_loss, dev_logit = sess.run([model.loss, model.logits], feed_dict) dev_losses.append(dev_loss) #import pdb; pdb.set_trace() dev_y = np.array([d for d in dev_batch[:, 1]]) for aid, (y, y_) in enumerate(zip(dev_y.T, dev_logit.T)): aspect_all_ys[aid].extend(list(y)) aspect_all_ys_[aid].extend(list(y_)) dev_aspect = [] for aid in range(len(aspects)): ys = aspect_all_ys[aid] ys_ = aspect_all_ys_[aid] dev_aspect.append(evaluate(ys, ys_)) #for a,r in zip(aspects, dev_aspect): # print '\t%20s\t%.4f'%(a,r) #print '\t%20s\t%.4f'%('TOTAL',np.average(dev_aspect)) print('[%d] dev loss: %.6f, acc: %.6f' % (current_step, np.average(dev_losses), np.average(dev_aspect))) # test test_batches = batch_iter(list(zip(x_test, y_test)), config.batch_size, shuffle=False) aspect_all_ys = {} #[[]] * len(aspects) aspect_all_ys_ = {} #[[]] * len(aspects) for i in range(len(aspects)): aspect_all_ys[i] = [] aspect_all_ys_[i] = [] for test_batch in test_batches: x_test_batch, y_test_batch = list(zip(*test_batch)) feed_dict = feed_data(x_test_batch, y_test_batch, 1.0) test_loss, test_logit = sess.run( [model.loss, model.logits], feed_dict) test_y = np.array([d for d in test_batch[:, 1]]) for aid, (y, y_) in enumerate(zip(test_y.T, test_logit.T)): aspect_all_ys[aid].extend(list(y)) aspect_all_ys_[aid].extend(list(y_)) test_aspect = [] for aid in range(len(aspects)): ys = aspect_all_ys[aid] ys_ = aspect_all_ys_[aid] test_aspect.append(evaluate(ys, ys_)) print('[%d] test loss: %.4f' % (current_step, np.average(test_aspect))) if np.average(dev_losses) <= best_loss: best_loss, best_at_step = np.average( dev_losses), current_step path = saver.save(sess, checkpoint_prefix, global_step=current_step) print('Best loss %.2f at step %d' % (best_loss, best_at_step)) #print 'Epoch done' print('Training is complete, testing the best model on x_test and y_test') print('Best epoch', best_at_step) saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step)) test_batches = batch_iter(list(zip(x_test, y_test)), config.batch_size, shuffle=False) aspect_all_ys = {} #[[]] * len(aspects) aspect_all_ys_ = {} #[[]] * len(aspects) for i in range(len(aspects)): aspect_all_ys[i] = [] aspect_all_ys_[i] = [] for test_batch in test_batches: x_test_batch, y_test_batch = list(zip(*test_batch)) feed_dict = feed_data(x_test_batch, y_test_batch, 1.0) test_loss, test_logit = sess.run([model.loss, model.logits], feed_dict) test_y = np.array([d for d in test_batch[:, 1]]) for aid, (y, y_) in enumerate(zip(test_y.T, test_logit.T)): aspect_all_ys[aid].extend(list(y)) aspect_all_ys_[aid].extend(list(y_)) evaluate_aspect = [] for aid in 
range(len(aspects)): ys = aspect_all_ys[aid] ys_ = aspect_all_ys_[aid] evaluate_aspect.append(evaluate(ys, ys_)) for a, r in zip(aspects, evaluate_aspect): print('\t%20s\t%.4f' % (a, r)) print('\t%20s\t%.4f' % ('TOTAL', np.average(evaluate_aspect)))
def train_cnn_rnn(): input_file = sys.argv[1] x_, y_, vocabulary, vocabulary_inv, df, labels = data_helper.load_data( input_file) training_config = sys.argv[2] params = json.loads(open(training_config).read()) # Assign a 300 dimension vector to each word word_embeddings = data_helper.load_embeddings(vocabulary) embedding_mat = [ word_embeddings[word] for index, word in enumerate(vocabulary_inv) ] embedding_mat = np.array(embedding_mat, dtype=np.float32) # Split the original dataset into train set and test set x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1) # Split the train set into train set and dev set x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1) logging.info("x_train: {}, x_dev: {}, x_test: {}".format( len(x_train), len(x_dev), len(x_test))) logging.info("y_train: {}, y_dev: {}, y_test: {}".format( len(y_train), len(y_dev), len(y_test))) # Create a directory, everything related to the training will be saved in this directory timestamp = str(int(time.time())) trained_dir = "./trained_results_" + timestamp + "/" if os.path.exists(trained_dir): shutil.rmtree(trained_dir) os.makedirs(trained_dir) graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): cnn_rnn = TextCNNRNN( embedding_mat=embedding_mat, sequence_length=x_train.shape[1], num_classes=y_train.shape[1], non_static=params["non_static"], hidden_unit=params["hidden_unit"], max_pool_size=params["max_pool_size"], filter_sizes=map(int, params["filter_sizes"].split(",")), num_filters=params["num_filters"], embedding_size=params["embedding_dim"], l2_reg_lambda=params["l2_reg_lambda"], ) global_step = tf.Variable(0, name="global_step", trainable=False) optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9) grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) # Checkpoint files will be saved in this directory during training checkpoint_dir = "./checkpoints_" + timestamp + "/" if os.path.exists(checkpoint_dir): shutil.rmtree(checkpoint_dir) os.makedirs(checkpoint_dir) checkpoint_prefix = os.path.join(checkpoint_dir, "model") def real_len(batches): return [ np.ceil( np.argmin(batch + [0]) * 1.0 / params["max_pool_size"]) for batch in batches ] def train_step(x_batch, y_batch): feed_dict = { cnn_rnn.input_x: x_batch, cnn_rnn.input_y: y_batch, cnn_rnn.dropout_keep_prob: params["dropout_keep_prob"], cnn_rnn.batch_size: len(x_batch), cnn_rnn.pad: np.zeros([len(x_batch), 1, params["embedding_dim"], 1]), cnn_rnn.real_len: real_len(x_batch), } _, step, loss, accuracy = sess.run( [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy], feed_dict) def dev_step(x_batch, y_batch): feed_dict = { cnn_rnn.input_x: x_batch, cnn_rnn.input_y: y_batch, cnn_rnn.dropout_keep_prob: 1.0, cnn_rnn.batch_size: len(x_batch), cnn_rnn.pad: np.zeros([len(x_batch), 1, params["embedding_dim"], 1]), cnn_rnn.real_len: real_len(x_batch), } step, loss, accuracy, num_correct, predictions = sess.run( [ global_step, cnn_rnn.loss, cnn_rnn.accuracy, cnn_rnn.num_correct, cnn_rnn.predictions, ], feed_dict, ) return accuracy, loss, num_correct, predictions saver = tf.train.Saver() sess.run(tf.global_variables_initializer()) # Training starts here train_batches = data_helper.batch_iter(list(zip(x_train, y_train)), params["batch_size"], params["num_epochs"]) best_accuracy, best_at_step = 0, 0 # Train the model with x_train and 
y_train for train_batch in train_batches: x_train_batch, y_train_batch = zip(*train_batch) train_step(x_train_batch, y_train_batch) current_step = tf.train.global_step(sess, global_step) # Evaluate the model with x_dev and y_dev if current_step % params["evaluate_every"] == 0: dev_batches = data_helper.batch_iter( list(zip(x_dev, y_dev)), params["batch_size"], 1) total_dev_correct = 0 for dev_batch in dev_batches: x_dev_batch, y_dev_batch = zip(*dev_batch) acc, loss, num_dev_correct, predictions = dev_step( x_dev_batch, y_dev_batch) total_dev_correct += num_dev_correct accuracy = float(total_dev_correct) / len(y_dev) logging.info("Accuracy on dev set: {}".format(accuracy)) if accuracy >= best_accuracy: best_accuracy, best_at_step = accuracy, current_step path = saver.save(sess, checkpoint_prefix, global_step=current_step) logging.critical("Saved model {} at step {}".format( path, best_at_step)) logging.critical("Best accuracy {} at step {}".format( best_accuracy, best_at_step)) logging.critical( "Training is complete, testing the best model on x_test and y_test" ) # Save the model files to trained_dir. predict.py needs trained model files. saver.save(sess, trained_dir + "best_model.ckpt") # Evaluate x_test and y_test saver.restore(sess, checkpoint_prefix + "-" + str(best_at_step)) test_batches = data_helper.batch_iter(list(zip(x_test, y_test)), params["batch_size"], 1, shuffle=False) total_test_correct = 0 for test_batch in test_batches: x_test_batch, y_test_batch = zip(*test_batch) acc, loss, num_test_correct, predictions = dev_step( x_test_batch, y_test_batch) total_test_correct += int(num_test_correct) logging.critical("Accuracy on test set: {}".format( float(total_test_correct) / len(y_test))) # Save trained parameters and files since predict.py needs them with open(trained_dir + "words_index.json", "w") as outfile: json.dump(vocabulary, outfile, indent=4, ensure_ascii=False) with open(trained_dir + "embeddings.pickle", "wb") as outfile: pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL) with open(trained_dir + "labels.json", "w") as outfile: json.dump(labels, outfile, indent=4, ensure_ascii=False) params["sequence_length"] = x_train.shape[1] with open(trained_dir + "trained_parameters.json", "w") as outfile: json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)
def train_cnn_rnn(): x_, y_, x_test, y_test, vocabulary, vocabulary_inv, labels = data_helper.load_data( ) #x_, y_, vocabulary, vocabulary_inv, labels = data_helper.load_data_book() training_config = 'training_config.json' params = json.loads(open(training_config).read()) # Assign a 300 dimension vector to each word word_embeddings = data_helper.load_embeddings(vocabulary) embedding_mat = [] for i in range(len(vocabulary_inv)): embedding_mat.append(word_embeddings[vocabulary_inv[i]]) embedding_mat = np.array(embedding_mat, dtype=np.float32) # Split the original dataset into train set and test set # Split the train set into train set and dev set # IMDB style # x_train, x_dev, y_train, y_dev = train_test_split(x_, y_, test_size=0.1) # Book data style #x_, x_test, y_, y_test = train_test_split(x_, y_, test_size=0.1) x_train, x_dev, y_train, y_dev = train_test_split(x_, y_, test_size=0.1) # Create a directory, everything related to the training will be saved in this directory timestamp = str(int(time.time())) trained_dir = './trained_results_' + timestamp + '/' if os.path.exists(trained_dir): shutil.rmtree(trained_dir) os.makedirs(trained_dir) graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat, sequence_length=x_train.shape[1], num_classes=y_train.shape[1], non_static=params['non_static'], hidden_unit=params['hidden_unit'], max_pool_size=params['max_pool_size'], filter_sizes=[ int(x) for x in params['filter_sizes'].split(",") ], num_filters=params['num_filters'], embedding_size=params['embedding_dim'], l2_reg_lambda=params['l2_reg_lambda']) global_step = tf.Variable(0, name='global_step', trainable=False) #optimizer = tf.train.MomentumOptimizer(0.1, 0.9) optimizer = tf.train.AdamOptimizer() grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) # Keep track of gradient values and sparsity (optional) grad_summaries = [] for g, v in grads_and_vars: if g is not None: grad_hist_summary = tf.summary.histogram( "{}/grad/hist".format(v.name), g) sparsity_summary = tf.summary.scalar( "{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) grad_summaries.append(grad_hist_summary) grad_summaries.append(sparsity_summary) grad_summaries_merged = tf.summary.merge(grad_summaries) # Output directory for models and summaries timestamp = str(int(time.time())) out_dir = os.path.abspath( os.path.join(os.path.curdir, "runs", timestamp)) print("Writing to {}\n".format(out_dir)) # Summaries for loss and accuracy loss_summary = tf.summary.scalar("loss", cnn_rnn.loss) acc_summary = tf.summary.scalar("accuracy", cnn_rnn.accuracy) # Train Summaries train_summary_op = tf.summary.merge( [loss_summary, acc_summary, grad_summaries_merged]) train_summary_dir = os.path.join(out_dir, "summaries", "train") train_summary_writer = tf.summary.FileWriter( train_summary_dir, sess.graph) # Dev summaries dev_summary_op = tf.summary.merge([loss_summary, acc_summary]) dev_summary_dir = os.path.join(out_dir, "summaries", "dev") dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph) # Checkpoint files will be saved in this directory during training checkpoint_dir = './checkpoints_' + timestamp + '/' if os.path.exists(checkpoint_dir): shutil.rmtree(checkpoint_dir) os.makedirs(checkpoint_dir) checkpoint_prefix = os.path.join(checkpoint_dir, 'model') 
saver = tf.train.Saver(tf.global_variables(), max_to_keep=5) def real_len(batches): return [ np.ceil( np.argmin(batch + [0]) * 1.0 / params['max_pool_size']) for batch in batches ] def train_step(x_batch, y_batch): feed_dict = { cnn_rnn.input_x: x_batch, cnn_rnn.input_y: y_batch, cnn_rnn.dropout_keep_prob: params['dropout_keep_prob'], cnn_rnn.batch_size: len(x_batch), cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]), cnn_rnn.real_len: real_len(x_batch), } summaries, _, step, loss, accuracy = sess.run([ train_summary_op, train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy ], feed_dict) time_str = datetime.datetime.now().isoformat() # print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) train_summary_writer.add_summary(summaries, step) # print(accuracy) return accuracy def dev_step(x_batch, y_batch): feed_dict = { cnn_rnn.input_x: x_batch, cnn_rnn.input_y: y_batch, cnn_rnn.dropout_keep_prob: 1.0, cnn_rnn.batch_size: len(x_batch), cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]), cnn_rnn.real_len: real_len(x_batch), } summaries, step, loss, accuracy, num_correct, predictions = sess.run( [ dev_summary_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy, cnn_rnn.num_correct, cnn_rnn.predictions ], feed_dict) dev_summary_writer.add_summary(summaries, step) print("step {}, loss {:g}, acc {:g}".format( step, loss, accuracy)) return accuracy, predictions sess.run(tf.global_variables_initializer()) # Training starts here train_batches = data_helper.batch_iter(list(zip(x_train, y_train)), params['batch_size'], params['num_epochs']) best_dev_accuracy, best_at_step = 0, 0 best_test_accuracy = 0 # Train the model with x_train and y_train for train_batch in train_batches: x_train_batch, y_train_batch = zip(*train_batch) train_acc = train_step(x_train_batch, y_train_batch) current_step = tf.train.global_step(sess, global_step) # Evaluate the model with x_dev and y_dev if current_step % params['evaluate_every'] == 0: print("Training Accuracy:", train_acc, end=' ') print("Evaluation:", end=' ') dev_acc, _ = dev_step(x_dev, y_dev) print("Test:", end=' ') test_acc_tmp, pred__ = dev_step(x_test, y_test) # with open('results/prediction' + str(current_step), 'bw') as f: # pickle.dump(pred__, f) if dev_acc > best_dev_accuracy: best_dev_accuracy = dev_acc best_test_accuracy = test_acc_tmp print('best dev accuracy is', best_dev_accuracy, 'the test is', best_test_accuracy) print( 'Training is complete, testing the best model on x_test and y_test' ) # Evaluate x_test and y_test saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step)) test_batches = data_helper.batch_iter(list(zip(x_test, y_test)), params['batch_size'], 1, shuffle=False) total_test_correct = 0 for test_batch in test_batches: x_test_batch, y_test_batch = zip(*test_batch) acc, loss, num_test_correct, predictions = dev_step( x_test_batch, y_test_batch) total_test_correct += int(num_test_correct) logging.critical('Accuracy on test set: {}'.format( float(total_test_correct) / len(y_test))) # Save trained parameters and files since predict.py needs them with open(trained_dir + 'words_index.json', 'w') as outfile: json.dump(vocabulary, outfile, indent=4, ensure_ascii=False) with open(trained_dir + 'embeddings.pickle', 'wb') as outfile: pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL) with open(trained_dir + 'labels.json', 'w') as outfile: json.dump(labels, outfile, indent=4, ensure_ascii=False) # os.rename(path, trained_dir + 'best_model.ckpt') # os.rename(path + '.meta', 
trained_dir + 'best_model.meta') shutil.rmtree(checkpoint_dir) logging.critical('{} has been removed'.format(checkpoint_dir)) params['sequence_length'] = x_train.shape[1] with open(trained_dir + 'trained_parameters.json', 'w') as outfile: json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)
def train_cnn(): input_file = sys.argv[1] if os.path.exists('./data/x.p') and \ os.path.exists('./data/y.p') and \ os.path.exists('./data/vocabulary.p') and \ os.path.exists('./data/vocabulary_inv.p') and \ os.path.exists('./data/labels.p'): x_ = pickle.load(open("./data/x.p", "rb")) y_ = pickle.load(open("./data/y.p", "rb")) vocabulary = pickle.load(open("./data/vocabulary.p", "rb")) vocabulary_inv = pickle.load(open("./data/vocabulary_inv.p", "rb")) labels = pickle.load(open("./data/labels.p", "rb")) else: x_, y_, vocabulary, vocabulary_inv, _, labels = data_helper.load_data( input_file) training_config = sys.argv[2] params = json.loads(open(training_config).read()) # Assign a n dimension vector to each word word_embeddings = data_helper.load_embeddings(vocabulary, dim=params['embedding_dim']) embedding_mat = [ word_embeddings[word] for index, word in enumerate(vocabulary_inv) ] embedding_mat = np.array(embedding_mat, dtype=np.float32) # Split the original dataset into train set and test set x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1) # Split the train set into train set and dev set x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1) logging.info('x_train: {}, x_dev: {}, x_test: {}'.format( len(x_train), len(x_dev), len(x_test))) logging.info('y_train: {}, y_dev: {}, y_test: {}'.format( len(y_train), len(y_dev), len(y_test))) # Create a directory, everything related to the training will be saved in this directory timestamp = str(int(time.time())) trained_dir = './trained_results_' + timestamp + '/' if os.path.exists(trained_dir): shutil.rmtree(trained_dir) os.makedirs(trained_dir) graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): cnn = TextCNN(embedding_mat=embedding_mat, non_static=params['non_static'], sequence_length=x_train.shape[1], num_classes=y_train.shape[1], vocab_size=len(vocabulary), embedding_size=params['embedding_dim'], filter_sizes=list( map(int, params['filter_sizes'].split(","))), num_filters=params['num_filters'], l2_reg_lambda=params['l2_reg_lambda']) global_step = tf.Variable(0, name='global_step', trainable=False) # optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9) optimizer = tf.train.AdamOptimizer(learning_rate=1e-3, beta1=0.9, beta2=0.999, epsilon=1e-08, use_locking=False, name='Adam') grads_and_vars = optimizer.compute_gradients(cnn.loss) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) # Keep track of gradient values and sparsity (optional) grad_summaries = [] for g, v in grads_and_vars: if g is not None: grad_hist_summary = tf.summary.histogram( "{}/grad/hist".format(v.name), g) sparsity_summary = tf.summary.scalar( "{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) grad_summaries.append(grad_hist_summary) grad_summaries.append(sparsity_summary) grad_summaries_merged = tf.summary.merge(grad_summaries) # Output directory for models and summaries timestamp = str(int(time.time())) out_dir = os.path.abspath( os.path.join(os.path.curdir, "runs", timestamp)) print("Writing to {}\n".format(out_dir)) # Summaries for loss and accuracy loss_summary = tf.summary.scalar("loss", cnn.loss) acc_summary = tf.summary.scalar("accuracy", cnn.accuracy) # Train Summaries train_summary_op = tf.summary.merge( [loss_summary, acc_summary, grad_summaries_merged]) train_summary_dir = os.path.join(out_dir, "summaries", "train") train_summary_writer = 
tf.summary.FileWriter( train_summary_dir, sess.graph) # Dev summaries dev_summary_op = tf.summary.merge([loss_summary, acc_summary]) dev_summary_dir = os.path.join(out_dir, "summaries", "dev") dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph) # Checkpoint files will be saved in this directory during training checkpoint_dir = './checkpoints_' + timestamp + '/' if os.path.exists(checkpoint_dir): shutil.rmtree(checkpoint_dir) os.makedirs(checkpoint_dir) checkpoint_prefix = os.path.join(checkpoint_dir, 'model') def train_step(x_batch, y_batch): feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: params['dropout_keep_prob'], } _, step, summaries, loss_, accuracy_ = sess.run([ train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy ], feed_dict) time_str = datetime.datetime.now().isoformat() print("{}: step {}, loss {:g}, acc {:g}".format( time_str, step, loss_, accuracy_)) train_summary_writer.add_summary(summaries, step) def dev_step(x_batch, y_batch, writer=None): feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: 1.0, } step, summaries, loss_, accuracy_, predictions_ = sess.run([ global_step, dev_summary_op, cnn.loss, cnn.accuracy, cnn.predictions ], feed_dict) time_str = datetime.datetime.now().isoformat() print("evaluation on test set:") print("{}: step {}, loss {:g}, acc {:g}".format( time_str, step, loss_, accuracy_)) if writer: writer.add_summary(summaries, step) return accuracy_, loss_, predictions_ saver = tf.train.Saver(tf.global_variables()) sess.run(tf.global_variables_initializer()) # Training starts here train_batches = data_helper.batch_iter(list(zip(x_train, y_train)), params['batch_size'], params['num_epochs']) # Train the model with x_train and y_train i = 0 for train_batch in train_batches: logging.info('Training on batch: {}'.format(i)) x_train_batch, y_train_batch = zip(*train_batch) train_step(x_train_batch, y_train_batch) current_step = tf.train.global_step(sess, global_step) # Evaluate the model with x_dev and y_dev if current_step % params['evaluate_every'] == 0: dev_batches = data_helper.batch_iter( list(zip(x_dev, y_dev)), params['batch_size'], 1) for dev_batch in dev_batches: x_dev_batch, y_dev_batch = zip(*dev_batch) acc, loss, predictions = dev_step( x_dev_batch, y_dev_batch, writer=dev_summary_writer) path = saver.save(sess, checkpoint_prefix, global_step=current_step) logging.critical('Saved model {} at step {}'.format( path, i)) i += 1 logging.critical( 'Training is complete, testing the best model on x_test and y_test' ) # Evaluate x_test and y_test saver.restore(sess, checkpoint_prefix + '-' + str(i)) test_batches = data_helper.batch_iter(list(zip(x_test, y_test)), params['batch_size'], 1, shuffle=False) total_test_correct = 0 for test_batch in test_batches: x_test_batch, y_test_batch = zip(*test_batch) acc, loss, num_test_correct, predictions = dev_step( x_test_batch, y_test_batch) total_test_correct += int(num_test_correct) logging.critical('Accuracy on test set: {}'.format( float(total_test_correct) / len(y_test))) # Save trained parameters and files since predict.py needs them with open(trained_dir + 'words_index.json', 'w') as outfile: json.dump(vocabulary, outfile, indent=4, ensure_ascii=False) with open(trained_dir + 'embeddings.pickle', 'wb') as outfile: pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL) with open(trained_dir + 'labels.json', 'w') as outfile: json.dump(labels, outfile, indent=4, ensure_ascii=False) os.rename(path, trained_dir + 
'best_model.ckpt') os.rename(path + '.meta', trained_dir + 'best_model.meta') shutil.rmtree(checkpoint_dir) logging.critical('{} has been removed'.format(checkpoint_dir)) params['sequence_length'] = x_train.shape[1] with open(trained_dir + 'trained_parameters.json', 'w') as outfile: json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)
if __name__ == "__main__":
    args = set_args()
    vocab_file = '../data/vocab.txt'
    train_file = '../data/LCQMC/LCQMC.train.data'
    valid_file = '../data/LCQMC/LCQMC.valid.data'
    embeddings_file = '../data/token_vec_300.bin'
    print('Loading training set...')
    train_data = LCQMC_Dataset(train_file, vocab_file, args.max_char_len)
    train_loader = DataLoader(train_data, shuffle=True, batch_size=args.train_batch_size)
    print('Loading validation set...')
    dev_data = LCQMC_Dataset(valid_file, vocab_file, args.max_char_len)
    dev_loader = DataLoader(dev_data, shuffle=True, batch_size=args.dev_batch_size)
    print('Loading word embeddings...')
    embeddings = load_embeddings(embeddings_file)
    model = Model(embeddings)
    if torch.cuda.is_available():
        model.cuda()
    train()
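# train() is called above but its body is not part of this excerpt. A minimal,
# hypothetical sketch of such a loop for this kind of sentence-pair classifier;
# the batch layout (p, p_len, h, h_len, label), the forward signature and the
# hyperparameters are assumptions, not taken from the original project.
import torch
import torch.nn as nn

def train_sketch(model, train_loader, dev_loader, epochs=10, lr=5e-4):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    for epoch in range(epochs):
        model.train()
        for batch in train_loader:
            p, p_len, h, h_len, label = [t.to(device) for t in batch]   # assumed batch layout
            optimizer.zero_grad()
            logits = model(p, p_len, h, h_len)                          # assumed forward signature
            loss = criterion(logits, label)
            loss.backward()
            optimizer.step()
        # simple dev accuracy after every epoch
        model.eval()
        correct = total = 0
        with torch.no_grad():
            for batch in dev_loader:
                p, p_len, h, h_len, label = [t.to(device) for t in batch]
                pred = model(p, p_len, h, h_len).argmax(dim=-1)
                correct += (pred == label).sum().item()
                total += label.size(0)
        print('epoch %d dev acc %.4f' % (epoch, correct / total))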
def train_cnn_rnn(): input_file = "logstashTemp.dat" output_file = "wcData85_1.csv" # with open(input_file,"r",encoding="utf8") as datFile: # jsonDict=json.loads(datFile.readline()) # with open(input_file,"r",encoding="utf8") as datFile: # jsonDf=pd.DataFrame([],columns=list(jsonDict.keys())) # rowNO=0 # for row in datFile.readlines(): # try: # jsonDf.loc[rowNO]=list(json.loads(row).values()) # except json.decoder.JSONDecodeError as ex: # print(ex.tostring) # rowNO+=1 # jsonDf.to_csv(output_file) print("loading data...") x_, y_, vocabulary, vocabulary_inv, df, labels = data_helper.load_data3( output_file, ["crit", "err"], 10000) # print("y_:",y_) training_config = "training_config.json" params = json.loads(open(training_config).read()) # Assign a 300 dimension vector to each word word_embeddings = data_helper.load_embeddings(vocabulary) embedding_mat = [ word_embeddings[word] for index, word in enumerate(vocabulary_inv) ] embedding_mat = np.array(embedding_mat, dtype=np.float32) # Split the original dataset into train set and test set x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1) # Split the train set into train set and dev set x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1) logging.info('x_train: {}, x_dev: {}, x_test: {}'.format( len(x_train), len(x_dev), len(x_test))) logging.info('y_train: {}, y_dev: {}, y_test: {}'.format( len(y_train), len(y_dev), len(y_test))) # Create a directory, everything related to the training will be saved in this directory timestamp = str(int(time.time())) trained_dir = './trained_results_' + timestamp + '/' print(trained_dir) if os.path.exists(trained_dir): shutil.rmtree(trained_dir) os.makedirs(trained_dir) graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat, sequence_length=x_train.shape[1], num_classes=y_train.shape[1], non_static=params['non_static'], hidden_unit=params['hidden_unit'], max_pool_size=params['max_pool_size'], filter_sizes=map( int, params['filter_sizes'].split(",")), num_filters=params['num_filters'], embedding_size=params['embedding_dim'], l2_reg_lambda=params['l2_reg_lambda']) global_step = tf.Variable(0, name='global_step', trainable=False) #global_step will control the changes of grads_and_vars with # the change of itself which caused by optimizer.apply_gradients() optimizer = tf.train.RMSPropOptimizer(learning_rate=1e-3, decay=0.9) #initiate the optimizer whose learning_rate is firstly 1e-3 # but it will be decreased along with the change of decay in the folume below: # decayed_learning_rate = learning_rate*decay_rate^(global_step/decay_steps) grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss) #compute gradients of loss train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) #apply the gradients to variables and change them # Checkpoint files will be saved in this directory during training checkpoint_dir = './checkpoints_' + timestamp + '/' if os.path.exists(checkpoint_dir): shutil.rmtree(checkpoint_dir) os.makedirs(checkpoint_dir) checkpoint_prefix = os.path.join(checkpoint_dir, 'model') def real_len(batches): return [ np.ceil( np.argmin(batch + [0]) * 1.0 / params['max_pool_size']) for batch in batches ] def train_step(x_batch, y_batch): feed_dict = { cnn_rnn.input_x: x_batch, cnn_rnn.input_y: y_batch, cnn_rnn.dropout_keep_prob: params['dropout_keep_prob'], cnn_rnn.batch_size: 
len(x_batch), cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]), cnn_rnn.real_len: real_len(x_batch), } _, step, loss, accuracy = sess.run( [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy], feed_dict) print(step, "trainAccuracy", accuracy) with open("trainLogCsv.txt", "a+", encoding="utf8") as trainLogFile: trainLogFile.write("=========" + str(step) + "=========\n") trainLogFile.write("acc:" + str(accuracy) + "\n") trainLogFile.write("loss:" + str(loss) + "\n") def dev_step(x_batch, y_batch): feed_dict = { cnn_rnn.input_x: x_batch, cnn_rnn.input_y: y_batch, cnn_rnn.dropout_keep_prob: 1.0, cnn_rnn.batch_size: len(x_batch), cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]), cnn_rnn.real_len: real_len(x_batch), } step, loss, accuracy, num_correct, predictions = sess.run([ global_step, cnn_rnn.loss, cnn_rnn.accuracy, cnn_rnn.num_correct, cnn_rnn.predictions ], feed_dict) return accuracy, loss, num_correct, predictions saver = tf.train.Saver() sess.run(tf.global_variables_initializer()) filter_writer = tf.summary.FileWriter('/path/to/logs', sess.graph) # Training starts here train_batches = data_helper.batch_iter(list(zip(x_train, y_train)), \ params['batch_size'], \ params['num_epochs']) best_accuracy, best_at_step = 0, 0 # Train the model with x_train and y_train for train_batch in train_batches: if len(train_batch) > 0: x_train_batch, y_train_batch = zip(*train_batch) train_step(x_train_batch, y_train_batch) current_step = tf.train.global_step(sess, global_step) # Evaluate the model with x_dev and y_dev if current_step % params['evaluate_every'] == 0: dev_batches = data_helper.batch_iter( list(zip(x_dev, y_dev)), params['batch_size'], 1) total_dev_correct = 0 y_dev = [] y_pre = [] for dev_batch in dev_batches: if len(dev_batch) > 0: x_dev_batch, y_dev_batch = zip(*dev_batch) acc, loss, num_dev_correct, predictions = dev_step( x_dev_batch, y_dev_batch) y_pre += predictions.tolist() y_dev += list(y_dev_batch) total_dev_correct += num_dev_correct y_devs = [ y_devItem.tolist().index(max(y_devItem.tolist())) for y_devItem in y_dev ] # print("y_pre:",y_pre) # print("y_devs:",y_devs) devRecall, devPrecision = getRP(y_pre, y_devs) logging.info( 'Recall and precision of dev set: {},{}'.format( devRecall, devPrecision)) accuracy = float(total_dev_correct) / len(y_dev) logging.info( 'Accuracy on dev set: {}'.format(accuracy)) lossItem = loss accuracyItem = accuracy with open("devCsv.csv", "a+", encoding="utf8") as csvFile: myWriter = csv.writer(csvFile) myWriter.writerow([ lossItem, accuracyItem, devRecall, devPrecision ]) if accuracy >= best_accuracy: best_accuracy, best_at_step = accuracy, current_step path = saver.save(sess, checkpoint_prefix, global_step=current_step) logging.critical( 'Saved model {} at step {}'.format( path, best_at_step)) logging.critical( 'Best accuracy {} at step {}'.format( best_accuracy, best_at_step)) logging.critical( 'Training is complete, testing the best model on x_test and y_test' ) # Save the model files to trained_dir. predict.py needs trained model files. 
saver.save(sess, trained_dir + "best_model.ckpt") # Evaluate x_test and y_test saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step)) test_batches = data_helper.batch_iter(list(zip(x_test, y_test)), params['batch_size'], 1, shuffle=False) total_test_correct = 0 for test_batch in test_batches: if len(test_batch) > 0: x_test_batch, y_test_batch = zip(*test_batch) acc, loss, num_test_correct, predictions = dev_step( x_test_batch, y_test_batch) total_test_correct += int(num_test_correct) logging.critical('Accuracy on test set: {}'.format( float(total_test_correct) / len(y_test))) # Save trained parameters and files since predict.py needs them with open(trained_dir + 'words_index.json', 'w') as outfile: json.dump(vocabulary, outfile, indent=4, ensure_ascii=False) with open(trained_dir + 'embeddings.pickle', 'wb') as outfile: pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL) with open(trained_dir + 'labels.json', 'w') as outfile: json.dump(labels, outfile, indent=4, ensure_ascii=False) params['sequence_length'] = x_train.shape[1] with open(trained_dir + 'trained_parameters.json', 'w') as outfile: json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)
def train_cnn_rnn(): input_file = sys.argv[1] if os.path.exists('./data/x.p') and \ os.path.exists('./data/y.p') and \ os.path.exists('./data/vocabulary.p') and \ os.path.exists('./data/vocabulary_inv.p') and \ os.path.exists('./data/labels.p'): x_ = pickle.load(open("./data/x.p", "rb")) y_ = pickle.load(open("./data/y.p", "rb")) vocabulary = pickle.load(open("./data/vocabulary.p", "rb")) vocabulary_inv = pickle.load(open("./data/vocabulary_inv.p", "rb")) labels = pickle.load(open("./data/labels.p", "rb")) else: x_, y_, vocabulary, vocabulary_inv, _, labels = data_helper.load_data( input_file) training_config = sys.argv[2] params = json.loads(open(training_config).read()) # Assign a n dimension vector to each word word_embeddings = data_helper.load_embeddings(vocabulary, dim=params['embedding_dim']) embedding_mat = [ word_embeddings[word] for index, word in enumerate(vocabulary_inv) ] embedding_mat = np.array(embedding_mat, dtype=np.float32) # Split the original dataset into train set and test set x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1) # Split the train set into train set and dev set x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1) logging.info('x_train: {}, x_dev: {}, x_test: {}'.format( len(x_train), len(x_dev), len(x_test))) logging.info('y_train: {}, y_dev: {}, y_test: {}'.format( len(y_train), len(y_dev), len(y_test))) # Create a directory, everything related to the training will be saved in this directory timestamp = str(int(time.time())) trained_dir = './trained_results_' + timestamp + '/' if os.path.exists(trained_dir): shutil.rmtree(trained_dir) os.makedirs(trained_dir) graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat, sequence_length=x_train.shape[1], num_classes=y_train.shape[1], non_static=params['non_static'], hidden_unit=params['hidden_unit'], max_pool_size=params['max_pool_size'], filter_sizes=map( int, params['filter_sizes'].split(",")), num_filters=params['num_filters'], embedding_size=params['embedding_dim'], l2_reg_lambda=params['l2_reg_lambda']) global_step = tf.Variable(0, name='global_step', trainable=False) # optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9) optimizer = tf.train.AdamOptimizer(learning_rate=0.0005, beta1=0.9, beta2=0.999, epsilon=1e-08, use_locking=False, name='Adam') grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) # Checkpoint files will be saved in this directory during training checkpoint_dir = './checkpoints_' + timestamp + '/' if os.path.exists(checkpoint_dir): shutil.rmtree(checkpoint_dir) os.makedirs(checkpoint_dir) checkpoint_prefix = os.path.join(checkpoint_dir, 'model') def real_len(batches): return [ np.ceil( np.argmin(batch + [0]) * 1.0 / params['max_pool_size']) for batch in batches ] def train_step(x_batch, y_batch): feed_dict = { cnn_rnn.input_x: x_batch, cnn_rnn.input_y: y_batch, cnn_rnn.dropout_keep_prob: params['dropout_keep_prob'], cnn_rnn.batch_size: len(x_batch), cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]), cnn_rnn.real_len: real_len(x_batch), } _, step, loss, accuracy = sess.run( [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy], feed_dict) def dev_step(x_batch, y_batch): feed_dict = { cnn_rnn.input_x: x_batch, cnn_rnn.input_y: y_batch, cnn_rnn.dropout_keep_prob: 1.0, 
cnn_rnn.batch_size: len(x_batch), cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]), cnn_rnn.real_len: real_len(x_batch), } step, loss, accuracy, num_correct, predictions = sess.run([ global_step, cnn_rnn.loss, cnn_rnn.accuracy, cnn_rnn.num_correct, cnn_rnn.predictions ], feed_dict) return accuracy, loss, num_correct, predictions saver = tf.train.Saver(tf.global_variables()) sess.run(tf.global_variables_initializer()) # Training starts here train_batches = data_helper.batch_iter(list(zip(x_train, y_train)), params['batch_size'], params['num_epochs']) best_accuracy, best_at_step = 0, 0 # Train the model with x_train and y_train i = 0 for train_batch in train_batches: logging.info('Training on batch: {}'.format(i)) x_train_batch, y_train_batch = zip(*train_batch) train_step(x_train_batch, y_train_batch) current_step = tf.train.global_step(sess, global_step) # Evaluate the model with x_dev and y_dev if current_step % params['evaluate_every'] == 0: dev_batches = data_helper.batch_iter( list(zip(x_dev, y_dev)), params['batch_size'], 1) total_dev_correct = 0 for dev_batch in dev_batches: x_dev_batch, y_dev_batch = zip(*dev_batch) acc, loss, num_dev_correct, predictions = dev_step( x_dev_batch, y_dev_batch) total_dev_correct += num_dev_correct accuracy = float(total_dev_correct) / len(y_dev) logging.info('Accuracy on dev set: {}'.format(accuracy)) if accuracy >= best_accuracy: best_accuracy, best_at_step = accuracy, current_step path = saver.save(sess, checkpoint_prefix, global_step=current_step) logging.critical('Saved model {} at step {}'.format( path, best_at_step)) logging.critical('Best accuracy {} at step {}'.format( best_accuracy, best_at_step)) i += 1 logging.critical( 'Training is complete, testing the best model on x_test and y_test' ) # Evaluate x_test and y_test saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step)) test_batches = data_helper.batch_iter(list(zip(x_test, y_test)), params['batch_size'], 1, shuffle=False) total_test_correct = 0 for test_batch in test_batches: x_test_batch, y_test_batch = zip(*test_batch) acc, loss, num_test_correct, predictions = dev_step( x_test_batch, y_test_batch) total_test_correct += int(num_test_correct) logging.critical('Accuracy on test set: {}'.format( float(total_test_correct) / len(y_test))) # Save trained parameters and files since predict.py needs them with open(trained_dir + 'words_index.json', 'w') as outfile: json.dump(vocabulary, outfile, indent=4, ensure_ascii=False) with open(trained_dir + 'embeddings.pickle', 'wb') as outfile: pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL) with open(trained_dir + 'labels.json', 'w') as outfile: json.dump(labels, outfile, indent=4, ensure_ascii=False) os.rename(path, trained_dir + 'best_model.ckpt') os.rename(path + '.meta', trained_dir + 'best_model.meta') shutil.rmtree(checkpoint_dir) logging.critical('{} has been removed'.format(checkpoint_dir)) params['sequence_length'] = x_train.shape[1] with open(trained_dir + 'trained_parameters.json', 'w') as outfile: json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)
def train_cnn_rnn(): # input_file=sys.argv[1] input_file = './data/simple3.csv' x_, y_, vocabulary, vocabulary_inv, df, labels = data_helper.load_data( input_file) #print(x_.shape)#(27404,489) #print(y_.shape)#(27404,10) #training_config=sys.argv[2] training_config = './training_config.json' params = json.loads(open(training_config).read()) #print(params) """ {'num_epochs': 1, 'num_filters': 32, 'max_pool_size': 4, 'l2_reg_lambda': 0.0, 'filter_sizes': '3,4,5', 'dropout_keep_prob': 0.5, 'non_static': False, 'evaluate_every': 200, 'hidden_unit': 300, 'batch_size': 128, 'embedding_dim': 300} """ word_embeddings = data_helper.load_embeddings(vocabulary) embedding_mat = [ word_embeddings[word] for index, word in enumerate(vocabulary_inv) ] embedding_mat = np.array(embedding_mat, dtype=np.float32) # Split the original dataset into train set and test set x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1) # Split the train set into train set and dev set x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1) logging.info('x_train: {}, x_dev: {}, x_test: {}'.format( len(x_train), len(x_dev), len(x_test))) logging.info('y_train: {}, y_dev: {}, y_test: {}'.format( len(y_train), len(y_dev), len(y_test))) #timestamp = str(int(time.time())) #创建问夹准备把参数词典等中间必要东西村建 timestamp = str(int(time.time())) trained_dir = './trained_results_' + 'test' + '/' if os.path.exists(trained_dir): shutil.rmtree(trained_dir) os.makedirs(trained_dir) graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat, sequence_length=x_train.shape[1], num_classes=y_train.shape[1], non_static=params['non_static'], hidden_unit=params['hidden_unit'], max_pool_size=params['max_pool_size'], filter_sizes=map( int, params['filter_sizes'].split(",")), num_filters=params['num_filters'], embedding_size=params['embedding_dim'], l2_reg_lambda=params['l2_reg_lambda']) #设置优化器OP和训练OP global_step = tf.Variable(0, name='global_step', trainable=False) optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9) grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) # 训练的时候保存模型 # checkpoint_dir = 'checkpoints_' + timestamp + '/' # if os.path.exists(checkpoint_dir): # shutil.rmtree(checkpoint_dir) # os.makedirs(checkpoint_dir) # checkpoint_prefix = os.path.join(checkpoint_dir, 'model') def real_len(batches): #batches ? return [ np.ceil( np.argmin(batch + [0]) * 1.0 / params['max_pool_size']) for batch in batches ] #训练 def train_step(x_batch, y_batch): #x_batch ? #y_batch ? 
# print(x_batch[1]) feed_dict = { cnn_rnn.input_x: x_batch, cnn_rnn.input_y: y_batch, cnn_rnn.dropout_keep_prob: params['dropout_keep_prob'], cnn_rnn.batch_size: len(x_batch), cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]), cnn_rnn.real_len: real_len(x_batch), } #print("real_len:", len(real_len(x_batch))) _, step, loss, accuracy = sess.run( [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy], feed_dict) #测试 def dev_step(x_batch, y_batch): feed_dict = { cnn_rnn.input_x: x_batch, cnn_rnn.input_y: y_batch, cnn_rnn.dropout_keep_prob: 1.0, cnn_rnn.batch_size: len(x_batch), cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]), cnn_rnn.real_len: real_len(x_batch), } step, loss, accuracy, num_correct, predictions = sess.run([ global_step, cnn_rnn.loss, cnn_rnn.accuracy, cnn_rnn.num_correct, cnn_rnn.predictions ], feed_dict) return accuracy, loss, num_correct, predictions #saver = tf.train.Saver() sess.run(tf.global_variables_initializer()) #训练准备 #根据batch_size计算每个train_batch的大小 train_batches = data_helper.batch_iter(list(zip(x_train, y_train)), params['batch_size'], params['num_epochs']) best_accuracy, best_at_step = 0, 0 # Train the model with x_train and y_train for train_batch in train_batches: x_train_batch, y_train_batch = zip(*train_batch) #print("y_train_batch:", y_train_batch[0]) train_step(x_train_batch, y_train_batch) #print("train_step", ) current_step = tf.train.global_step(sess, global_step) # Evaluate the model with x_dev and y_dev if current_step % params['evaluate_every'] == 0: dev_batches = data_helper.batch_iter( list(zip(x_dev, y_dev)), params['batch_size'], 1) total_dev_correct = 0 for dev_batch in dev_batches: x_dev_batch, y_dev_batch = zip(*dev_batch) acc, loss, num_dev_correct, predictions = dev_step( x_dev_batch, y_dev_batch) total_dev_correct += num_dev_correct accuracy = float(total_dev_correct) / len(y_dev) logging.info('Accuracy on dev set: {}'.format(accuracy)) if accuracy >= best_accuracy: best_accuracy, best_at_step = accuracy, current_step # path = saver.save(sess, checkpoint_prefix, global_step=current_step) # logging.critical('Saved model {} at step {}'.format(path, best_at_step)) # logging.critical('Best accuracy {} at step {}'.format(best_accuracy, best_at_step)) logging.critical( 'Training is complete, testing the best model on x_test and y_test' ) # Save the model files to trained_dir. predict.py needs trained model files. 
# saver.save(sess, trained_dir + "best_model.ckpt") # Evaluate x_test and y_test #saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step)) test_batches = data_helper.batch_iter(list(zip(x_test, y_test)), params['batch_size'], 1, shuffle=False) total_test_correct = 0 for test_batch in test_batches: x_test_batch, y_test_batch = zip(*test_batch) acc, loss, num_test_correct, predictions = dev_step( x_test_batch, y_test_batch) total_test_correct += int(num_test_correct) logging.critical('Accuracy on test set: {}'.format( float(total_test_correct) / len(y_test))) print('Accuracy on test set: {}'.format( float(total_test_correct) / len(y_test))) # Save trained parameters and files since predict.py needs them with open(trained_dir + 'words_index.json', 'w') as outfile: json.dump(vocabulary, outfile, indent=4, ensure_ascii=False) with open(trained_dir + 'embeddings.pickle', 'wb') as outfile: pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL) with open(trained_dir + 'labels.json', 'w') as outfile: json.dump(labels, outfile, indent=4, ensure_ascii=False) params['sequence_length'] = x_train.shape[1] with open(trained_dir + 'trained_parameters.json', 'w') as outfile: json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)
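# The first variant above hard-codes training_config = './training_config.json' and echoes the file's
# contents in an inline comment. A minimal sketch for regenerating such a file with those same
# hyperparameters (an assumed reconstruction, not the original config file):
import json

training_config = {
    'num_epochs': 1, 'num_filters': 32, 'max_pool_size': 4, 'l2_reg_lambda': 0.0,
    'filter_sizes': '3,4,5', 'dropout_keep_prob': 0.5, 'non_static': False,
    'evaluate_every': 200, 'hidden_unit': 300, 'batch_size': 128, 'embedding_dim': 300,
}
with open('./training_config.json', 'w') as f:
    json.dump(training_config, f, indent=4, sort_keys=True)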
max_length = max(max([len(x.split(" ")) for x in q1]), max([len(x.split(" ")) for x in q2])) vocab_processor = learn.preprocessing.VocabularyProcessor(max_length) print("max question length:", max_length) #converting to embedding matrix x_text = q1 + q2 vocab_ids = np.array(list(vocab_processor.fit_transform(x_text))) x1 = vocab_ids[:len(q1)] x2 = vocab_ids[len(q1):] print("Loading Word embeddings") vocab_dict = vocab_processor.vocabulary_._mapping pretrained_embeddings = data_helper.load_embeddings( FLAGS.embeddings_file, vocab_dict, FLAGS.embedding_dim, FLAGS.use_cached_embeddings) print("Shuffling Data:") np.random.seed(10) shuffled_index = np.random.permutation(np.arange(len(y))) x1_shuffled = x1[shuffled_index] x2_shuffled = x2[shuffled_index] y_shuffled = y[shuffled_index] q1_lenghts_shuffled = x1_length[shuffled_index] q2_lenghts_shuffled = x2_length[shuffled_index] print("Splitting Training/Validation data") validation_index = -1 * int(FLAGS.val_sample_percentage * float(len(y))) x1_training, x1_validation = x1_shuffled[:validation_index], x1_shuffled[ validation_index:]
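# The FLAGS used above (embeddings_file, embedding_dim, use_cached_embeddings, val_sample_percentage)
# are defined elsewhere in that script. A plausible sketch of those definitions with TF 1.x flags;
# the defaults below are assumptions, only the flag names come from the snippet:
import tensorflow as tf

tf.flags.DEFINE_string('embeddings_file', 'glove.840B.300d.txt', 'Path to pretrained word embeddings')
tf.flags.DEFINE_integer('embedding_dim', 300, 'Dimensionality of the word embeddings')
tf.flags.DEFINE_boolean('use_cached_embeddings', True, 'Reuse a cached embedding matrix if available')
tf.flags.DEFINE_float('val_sample_percentage', 0.1, 'Fraction of the data held out for validation')
FLAGS = tf.flags.FLAGS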
type=int, help="length of sentence", default=256) parser.add_argument("--train_data", type=str, help="trianing data", default="data_use.txt") parser.add_argument("--config_file", type=str, help="training config", default="training_config.json") args = parser.parse_args() vocabulary = Vocabulary(args.train_data) word_embedding = data_helper.load_embeddings(vocabulary.String2i) embedding_mat = [ word_embedding[word] for index, word in enumerate(vocabulary.i2String) ] embedding_mat = np.array(embedding_mat, np.float32) ''' print embedding_mat.shape print embedding_mat[20] ''' with open(args.config_file) as f: params = json.load(f) #print params ''' {u'hidden_unit': 300, u'l2_reg_lambda': 0.0, u'dropout_keep_prob': 0.5, u'num_filters': 128, u'max_pool_size': 4, u'embedding_dim': 300, u'batch_size': 256, u'filter_sizes': u'3,4,5', u'evaluate_every': 100, u'non_static': False, u'num_epochs': 1}
def train_cnn_rnn(input_file, training_config): # read data and params x_, y_, vocabulary, vocabulary_inv, df, labels=data_helper.load_data(input_file) params=json.loads(open(training_config).read()) # create a directory, everything related to the training will be saved in this directory timestamp=str(int(time.time())) output_dir=os.path.join('data_path_save','cnn_rnn_'+timestamp) trained_dir=os.path.join(output_dir,'trained_results') if os.path.exists(trained_dir): shutil.rmtree(trained_dir) os.makedirs(trained_dir) # assign a 300 dimension vector to each word word_embeddings=data_helper.load_embeddings(vocabulary) embedding_mat=[word_embeddings[word] for index,word in enumerate(vocabulary_inv)] embedding_mat=np.array(embedding_mat, dtype=np.float32) # split the original dataset into trainset and devset x_train, x_dev, y_train, y_dev=train_test_split(x_, y_, test_size=0.1) # split the trainset into trainset and devset logging.info('x_train: {}, x_dev: {}'.format(len(x_train), len(x_dev))) graph=tf.Graph() with graph.as_default(): session_conf=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess=tf.Session(config=session_conf) with sess.as_default(): cnn_rnn=TextCNNRNN(embedding_mat=embedding_mat, sequence_length=x_train.shape[1], num_classes=y_train.shape[1], non_static=params['non_static'], hidden_unit=params['hidden_unit'], max_pool_size=params['max_pool_size'], filter_sizes=map(int, params['filter_sizes'].split(",")), num_filters=params['num_filters'], embedding_size=params['embedding_dim'], l2_reg_lambda=params['l2_reg_lambda']) global_step=tf.Variable(0, name='global_step', trainable=False) optimizer=tf.train.RMSPropOptimizer(1e-3, decay=0.9) grads_and_vars=optimizer.compute_gradients(cnn_rnn.loss) train_op=optimizer.apply_gradients(grads_and_vars, global_step=global_step) checkpoint_dir=os.path.join(output_dir,'checkpoints') if os.path.exists(checkpoint_dir): shutil.rmtree(checkpoint_dir) os.makedirs(checkpoint_dir) checkpoint_prefix=os.path.join(checkpoint_dir, 'model') def real_len(batches): return [np.ceil(np.argmin(batch+[0])*1.0/params['max_pool_size']) for batch in batches] def train_step(x_batch, y_batch): feed_dict={ cnn_rnn.input_x: x_batch, cnn_rnn.input_y: y_batch, cnn_rnn.dropout_keep_prob: params['dropout_keep_prob'], cnn_rnn.batch_size: len(x_batch), cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]), cnn_rnn.real_len: real_len(x_batch) } _, step, loss, accuracy=sess.run([train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy], feed_dict=feed_dict) def dev_step(x_batch, y_batch): feed_dict={ cnn_rnn.input_x: x_batch, cnn_rnn.input_y: y_batch, cnn_rnn.dropout_keep_prob: 1.0, cnn_rnn.batch_size: len(x_batch), cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]), cnn_rnn.real_len: real_len(x_batch) } step, loss, accuracy, num_correct, predictions=sess.run([global_step, cnn_rnn.loss, cnn_rnn.accuracy, cnn_rnn.num_correct, cnn_rnn.predictions], feed_dict=feed_dict) return accuracy, loss, num_correct, predictions saver=tf.train.Saver() sess.run(tf.global_variables_initializer()) # training starts here train_batches=data_helper.batch_iter(list(zip(x_train, y_train)), params['batch_size'], params['num_epochs']) best_accuracy, best_at_step=0, 0 for train_batch in train_batches: x_train_batch, y_train_batch=zip(*train_batch) train_step(x_train_batch, y_train_batch) current_step=tf.train.global_step(sess, global_step) if current_step%params['evaluate_every']==0: dev_batches=data_helper.batch_iter(list(zip(x_dev, y_dev)), 
params['batch_size'], 1) total_dev_correct=0 for dev_batch in dev_batches: x_dev_batch, y_dev_batch=zip(*dev_batch) acc, loss, num_dev_correct, predictions=dev_step(x_dev_batch, y_dev_batch) total_dev_correct+=num_dev_correct accuracy=float(total_dev_correct)/len(y_dev) logging.info('Accuracy on dev set: {}'.format(accuracy)) if accuracy>=best_accuracy: best_accuracy, best_at_step=accuracy, current_step path=saver.save(sess, checkpoint_prefix, global_step=current_step) logging.critical('Saved model {} at step {}'.format(path, best_at_step)) logging.critical('Best accuracy {} at step {}'.format(best_accuracy, best_at_step)) logging.critical('Training is complete, testing the best model on x_test and y_test') # save trained params and files with open(trained_dir+'/words_index.json', 'w') as outfile: json.dump(vocabulary, outfile, indent=4, ensure_ascii=False) with open(trained_dir+'/embeddings.pickle', 'wb') as outfile: pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL) with open(trained_dir+'/labels.json', 'w') as outfile: json.dump(labels, outfile, indent=4, ensure_ascii=False) params['sequence_length']=x_train.shape[1] with open(trained_dir+'/trained_parameters.json', 'w') as outfile: json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)
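# The four files written above are the same artifacts the other variants save for predict.py. A minimal
# sketch of reloading them on the prediction side (only the file names come from these snippets; the
# helper itself is assumed):
import json
import pickle

def load_trained_artifacts(trained_dir):
    with open(trained_dir + '/words_index.json') as f:
        vocabulary = json.load(f)
    with open(trained_dir + '/embeddings.pickle', 'rb') as f:
        embedding_mat = pickle.load(f)
    with open(trained_dir + '/labels.json') as f:
        labels = json.load(f)
    with open(trained_dir + '/trained_parameters.json') as f:
        params = json.load(f)
    return vocabulary, embedding_mat, labels, params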
def train_cnn_rnn(): input_file = "logstashTemp.dat" output_file = "logstash.csv" dataList = [] with open(input_file, 'r', encoding='utf8') as logFile: for row in logFile: dataList.append(json.loads(row)) keyList = list(dataList[0].keys()) csvList = [[keyItem for keyItem in keyList]] for row in dataList: if "severity" in list(row.keys()): tempRow = [ row[keyItem] for keyItem in keyList if keyItem in list(row.keys()) ] csvList.append(tempRow) with open(output_file, "w+", encoding="utf8") as csvFile: for row in csvList: myWriter = csv.writer(csvFile) myWriter.writerow(row) x_, y_, vocabulary, vocabulary_inv, df, labels = data_helper.load_data( output_file, 20000) training_config = "training_config.json" params = json.loads(open(training_config).read()) # Assign a 300 dimension vector to each word word_embeddings = data_helper.load_embeddings(vocabulary) embedding_mat = [ word_embeddings[word] for index, word in enumerate(vocabulary_inv) ] embedding_mat = np.array(embedding_mat, dtype=np.float32) # Split the original dataset into train set and test set x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1) # Split the train set into train set and dev set x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1) logging.info('x_train: {}, x_dev: {}, x_test: {}'.format( len(x_train), len(x_dev), len(x_test))) logging.info('y_train: {}, y_dev: {}, y_test: {}'.format( len(y_train), len(y_dev), len(y_test))) # Create a directory, everything related to the training will be saved in this directory timestamp = str(int(time.time())) trained_dir = './trained_results_' + timestamp + '/' print(trained_dir) if os.path.exists(trained_dir): shutil.rmtree(trained_dir) os.makedirs(trained_dir) graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat, sequence_length=x_train.shape[1], num_classes=y_train.shape[1], non_static=params['non_static'], hidden_unit=params['hidden_unit'], max_pool_size=params['max_pool_size'], filter_sizes=map( int, params['filter_sizes'].split(",")), num_filters=params['num_filters'], embedding_size=params['embedding_dim'], l2_reg_lambda=params['l2_reg_lambda']) global_step = tf.Variable(0, name='global_step', trainable=False) optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9) grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) # Checkpoint files will be saved in this directory during training checkpoint_dir = './checkpoints_' + timestamp + '/' if os.path.exists(checkpoint_dir): shutil.rmtree(checkpoint_dir) os.makedirs(checkpoint_dir) checkpoint_prefix = os.path.join(checkpoint_dir, 'model') def real_len(batches): return [ np.ceil( np.argmin(batch + [0]) * 1.0 / params['max_pool_size']) for batch in batches ] def train_step(x_batch, y_batch): feed_dict = { cnn_rnn.input_x: x_batch, cnn_rnn.input_y: y_batch, cnn_rnn.dropout_keep_prob: params['dropout_keep_prob'], cnn_rnn.batch_size: len(x_batch), cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]), cnn_rnn.real_len: real_len(x_batch), } _, step, loss, accuracy = sess.run( [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy], feed_dict) def dev_step(x_batch, y_batch): feed_dict = { cnn_rnn.input_x: x_batch, cnn_rnn.input_y: y_batch, cnn_rnn.dropout_keep_prob: 1.0, cnn_rnn.batch_size: len(x_batch), cnn_rnn.pad: 
np.zeros([len(x_batch), 1, params['embedding_dim'], 1]), cnn_rnn.real_len: real_len(x_batch), } step, loss, accuracy, num_correct, predictions = sess.run([ global_step, cnn_rnn.loss, cnn_rnn.accuracy, cnn_rnn.num_correct, cnn_rnn.predictions ], feed_dict) return accuracy, loss, num_correct, predictions saver = tf.train.Saver() sess.run(tf.global_variables_initializer()) # Training starts here train_batches = data_helper.batch_iter(list(zip(x_train, y_train)), \ params['batch_size'], \ params['num_epochs']) best_accuracy, best_at_step = 0, 0 # Train the model with x_train and y_train for train_batch in train_batches: x_train_batch, y_train_batch = zip(*train_batch) train_step(x_train_batch, y_train_batch) current_step = tf.train.global_step(sess, global_step) # Evaluate the model with x_dev and y_dev if current_step % params['evaluate_every'] == 0: dev_batches = data_helper.batch_iter( list(zip(x_dev, y_dev)), params['batch_size'], 1) total_dev_correct = 0 for dev_batch in dev_batches: x_dev_batch, y_dev_batch = zip(*dev_batch) acc, loss, num_dev_correct, predictions = dev_step( x_dev_batch, y_dev_batch) total_dev_correct += num_dev_correct accuracy = float(total_dev_correct) / len(y_dev) logging.info('Accuracy on dev set: {}'.format(accuracy)) if accuracy >= best_accuracy: print("Accuracy:", accuracy) best_accuracy, best_at_step = accuracy, current_step path = saver.save(sess, checkpoint_prefix, global_step=current_step) logging.critical('Saved model {} at step {}'.format( path, best_at_step)) logging.critical('Best accuracy {} at step {}'.format( best_accuracy, best_at_step)) logging.critical( 'Training is complete, testing the best model on x_test and y_test' ) # Save the model files to trained_dir. predict.py needs trained model files. saver.save(sess, trained_dir + "best_model.ckpt") # Evaluate x_test and y_test saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step)) test_batches = data_helper.batch_iter(list(zip(x_test, y_test)), params['batch_size'], 1, shuffle=False) total_test_correct = 0 for test_batch in test_batches: x_test_batch, y_test_batch = zip(*test_batch) acc, loss, num_test_correct, predictions = dev_step( x_test_batch, y_test_batch) total_test_correct += int(num_test_correct) logging.critical('Accuracy on test set: {}'.format( float(total_test_correct) / len(y_test))) # Save trained parameters and files since predict.py needs them with open(trained_dir + 'words_index.json', 'w') as outfile: json.dump(vocabulary, outfile, indent=4, ensure_ascii=False) with open(trained_dir + 'embeddings.pickle', 'wb') as outfile: pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL) with open(trained_dir + 'labels.json', 'w') as outfile: json.dump(labels, outfile, indent=4, ensure_ascii=False) params['sequence_length'] = x_train.shape[1] with open(trained_dir + 'trained_parameters.json', 'w') as outfile: json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)
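# real_len, shared by all of these variants, recovers each sequence's unpadded length and rescales it by
# max_pool_size, presumably so the recurrent part of TextCNNRNN sees lengths on the max-pooled time axis.
# A small worked example (pad id assumed to be 0, matching the argmin trick above):
import numpy as np

batch = [7, 12, 5, 9, 0, 0, 0, 0]        # 4 real tokens followed by padding
max_pool_size = 4
true_length = np.argmin(batch + [0])      # index of the first 0 -> 4
pooled_length = np.ceil(true_length * 1.0 / max_pool_size)
print(true_length, pooled_length)         # 4 1.0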
# load_and_train_ref_neighbor(has_rank=True) load_and_train_ref_abs(has_rank=True) # embedding_path = '../data/glove.42B.300d.50K.w2v.txt' # embeddings, vocab, embedding_size=load_embeddings(embedding_path, 100000) # X_train, y_train, y_train_r, X_val, y_val, y_val_r, X_test, y_test, y_test_r = load_abstract_to_label('../data/dataset_abstract_stat_50.npy', embeddings, vocab) # produce_test_prediciton('../models/abstract_cnn_baseline', X_test, '../outputs/abstract_cnn_baseline') # X_train, y_train, y_train_r, X_val, y_val, y_val_r, X_test, y_test, y_test_r = load_ref_chain_to_label('../data/dataset_ref_chain_stat_50.npy') # produce_test_prediciton('../models/ref_chain_cnn_baseline', X_test, '../outputs/ref_chain_cnn_baseline') # produce_test_prediciton('../models/ref_chain_cnn_with_rank', X_test, '../outputs/ref_chain_cnn_with_rank') # X_train, y_train, y_train_r, X_val, y_val, y_val_r, X_test, y_test, y_test_r = load_ref_nb_to_label('../data/dataset_ref_nb_stat_50.npy') # produce_test_prediciton('../models/ref_neighbor_cnn_baseline', X_test, '../outputs/ref_neighbor_cnn_baseline') # produce_test_prediciton('../models/ref_neighbor_cnn_with_rank', X_test, '../outputs/ref_neighbor_cnn_with_rank') embedding_path = '../data/glove.42B.300d.50K.w2v.txt' embeddings, vocab, embedding_size = load_embeddings(embedding_path, 100000) X_train, y_train, y_train_r, X_val, y_val, y_val_r, X_test, y_test, y_test_r = load_abstract_to_label( '../data/dataset_abstract_stat_50.npy', embeddings, vocab) label2idx = json.load(open('../data/journal2idx.json', 'r')) label_num = len(label2idx) + 1 journal2idx_all = json.load(open('../data/journal2idx_all.json', 'r')) X_train_ref, _, _, X_val_ref, _, _, X_test_ref, _, _ = load_ref_chain_to_label( '../data/dataset_ref_chain_stat_50.npy') produce_test_prediciton( '../models/ref_abs_cnn_with_rank', [np.array(X_test), np.array(X_test_ref)], '../outputs/ref_abs_with_rank')
def train_cnn(): input_dir = sys.argv[1] x_train, x_dev, x_test, pos1_train, pos2_train, pos1_dev, pos2_dev, pos1_test, pos2_test, y_train, y_dev, y_test, vocabulary, vocabulary_inv, labels = data_helper.load_data_split_sents( input_dir) training_config = sys.argv[2] params = json.loads(open(training_config).read()) # Assign a 300 dimension vector to each word word_embeddings = data_helper.load_embeddings(vocabulary) embedding_mat = [ word_embeddings[word] for index, word in enumerate(vocabulary_inv) ] embedding_mat = np.array(embedding_mat, dtype=np.float32) #sentence_length = 200 pos_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor( 200) pos_vocab_processor.fit(pos1_train + pos2_train + pos1_dev + pos2_dev + pos1_test + pos2_test) pos1_train_vec = np.array(list(pos_vocab_processor.transform(pos1_train))) pos2_train_vec = np.array(list(pos_vocab_processor.transform(pos2_train))) pos1_dev_vec = np.array(list(pos_vocab_processor.transform(pos1_dev))) pos2_dev_vec = np.array(list(pos_vocab_processor.transform(pos2_dev))) pos1_test_vec = np.array(list(pos_vocab_processor.transform(pos1_test))) pos2_test_vec = np.array(list(pos_vocab_processor.transform(pos2_test))) # Create a directory, everything related to the training will be saved in this directory timestamp = str(int(time.time())) trained_dir = './trained_results_' + timestamp + '/' if os.path.exists(trained_dir): shutil.rmtree(trained_dir) os.makedirs(trained_dir) graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): cnn = TextLSTMCNN( embedding_mat=embedding_mat, sequence_length=x_train.shape[1], num_classes=y_train.shape[1], non_static=params['non_static'], hidden_unit=params['hidden_unit'], max_pool_size=params['max_pool_size'], filter_sizes=map(int, params['filter_sizes'].split(",")), num_filters=params['num_filters'], embedding_size=params['embedding_dim'], pos_vocab_size=len(pos_vocab_processor.vocabulary_), pos_embedding_size=params['position_embedding_dim'], l2_reg_lambda=params['l2_reg_lambda']) global_step = tf.Variable(0, name='global_step', trainable=False) optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9) grads_and_vars = optimizer.compute_gradients(cnn.loss) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) # Checkpoint files will be saved in this directory during training checkpoint_dir = './checkpoints_' + timestamp + '/' if os.path.exists(checkpoint_dir): shutil.rmtree(checkpoint_dir) os.makedirs(checkpoint_dir) checkpoint_prefix = os.path.join(checkpoint_dir, 'model') def real_len(batches): return [ np.ceil( np.argmin(batch + [0]) * 1.0 / params['max_pool_size']) for batch in batches ] def train_step(x1_batch, pos1_batch, pos2_batch, y_batch): feed_dict = { cnn.input_x1: x1_batch, cnn.input_pos1: pos1_batch, cnn.input_pos2: pos2_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: params['dropout_keep_prob'], cnn.batch_size: len(x1_batch), cnn.pad: np.zeros([len(x1_batch), 1, params['embedding_dim'], 1]), #cnn.pad_pos: np.zeros([len(x1_batch), 1, params['embedding_dim']+2*params['position_embedding_dim'], 1]), cnn.real_len: real_len(x1_batch), } _, step, loss, accuracy = sess.run( [train_op, global_step, cnn.loss, cnn.accuracy], feed_dict) def dev_step(x1_batch, pos1_batch, pos2_batch, y_batch): feed_dict = { cnn.input_x1: x1_batch, cnn.input_pos1: pos1_batch, cnn.input_pos2: pos2_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: 
1.0, cnn.batch_size: len(x1_batch), cnn.pad: np.zeros([len(x1_batch), 1, params['embedding_dim'], 1]), #cnn.pad_pos: np.zeros([len(x1_batch), 1, params['embedding_dim']+2*params['position_embedding_dim'], 1]), cnn.real_len: real_len(x1_batch), } step, loss, accuracy, num_correct, predictions = sess.run([ global_step, cnn.loss, cnn.accuracy, cnn.num_correct, cnn.predictions ], feed_dict) return accuracy, loss, num_correct, predictions saver = tf.train.Saver() sess.run(tf.global_variables_initializer()) # Training starts here train_batches = data_helper.batch_iter( list(zip(x_train, pos1_train_vec, pos2_train_vec, y_train)), params['batch_size'], params['num_epochs']) best_accuracy, best_at_step = 0, 0 # Train the model with x_left_train and y_train print "len(train_batches): ", train_batches prev_test_set_accuracy = 0.0 for train_batch in train_batches: #print train_batch if train_batch.shape[0] > 0: x_train_batch, pos1_train_batch, pos2_train_batch, y_train_batch = zip( *train_batch) train_step(x_train_batch, pos1_train_batch, pos2_train_batch, y_train_batch) current_step = tf.train.global_step(sess, global_step) # Evaluate the model with x_left_dev and y_dev if current_step % params['evaluate_every'] == 0: dev_batches = data_helper.batch_iter( list(zip(x_dev, pos1_dev_vec, pos2_dev_vec, y_dev)), params['batch_size'], 1) total_dev_correct = 0 count_y_dev = 0 for dev_batch in dev_batches: if dev_batch.shape[0] > 0: x_dev_batch, pos1_dev_batch, pos2_dev_batch, y_dev_batch = zip( *dev_batch) acc, loss, num_dev_correct, predictions = dev_step( x_dev_batch, pos1_dev_batch, pos2_dev_batch, y_dev_batch) total_dev_correct += num_dev_correct count_y_dev = count_y_dev + len(dev_batch) accuracy = float(total_dev_correct) / count_y_dev logging.info( 'Accuracy on dev set: {}'.format(accuracy)) test_batches = data_helper.batch_iter( list( zip(x_test, pos1_test_vec, pos2_test_vec, y_test)), params['batch_size'], 1, shuffle=False) total_test_correct = 0 count_y_test = 0 for test_batch in test_batches: if test_batch.shape[0] > 0: x_test_batch, pos1_test_batch, pos2_test_batch, y_test_batch = zip( *test_batch) acc, loss, num_test_correct, predictions = dev_step( x_test_batch, pos1_test_batch, pos2_test_batch, y_test_batch) total_test_correct += int(num_test_correct) count_y_test = count_y_test + len(test_batch) test_set_acc = float(total_test_correct) / count_y_test logging.critical('Accuracy on test set: {}'.format( float(total_test_correct) / count_y_test)) if test_set_acc > prev_test_set_accuracy: prev_test_set_accuracy = test_set_acc best_accuracy, best_at_step = accuracy, current_step path = saver.save(sess, checkpoint_prefix, global_step=current_step) logging.critical( 'Saved model {} at step {}'.format( path, best_at_step)) logging.critical( 'Best accuracy {} at step {}'.format( best_accuracy, best_at_step)) logging.critical('Accuracy on test set: {}'.format( float(total_test_correct) / count_y_test)) logging.critical( 'Training is complete, testing the best model on x_left_test and y_test' ) # Save the model files to trained_dir. predict.py needs trained model files. 
saver.save(sess, trained_dir + "best_model.ckpt") # Evaluate x_left_test and y_test saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step)) test_batches = data_helper.batch_iter(list( zip(x_test, pos1_test_vec, pos2_test_vec, y_test)), params['batch_size'], 1, shuffle=False) total_test_correct = 0 count_y_test = 0 for test_batch in test_batches: if test_batch.shape[0] > 0: x_test_batch, pos1_test_batch, pos2_test_batch, y_test_batch = zip( *test_batch) acc, loss, num_test_correct, predictions = dev_step( x_test_batch, pos1_test_batch, pos2_test_batch, y_test_batch) total_test_correct += int(num_test_correct) count_y_test = count_y_test + len(test_batch) logging.critical('Accuracy on test set: {}'.format( float(total_test_correct) / count_y_test)) # Save trained parameters and files since predict.py needs them with open(trained_dir + 'words_index.json', 'w') as outfile: json.dump(vocabulary, outfile, indent=4, ensure_ascii=False) with open(trained_dir + 'embeddings.pickle', 'wb') as outfile: pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL) with open(trained_dir + 'labels.json', 'w') as outfile: json.dump(labels, outfile, indent=4, ensure_ascii=False) params['sequence_length'] = x_train.shape[1] with open(trained_dir + 'trained_parameters.json', 'w') as outfile: json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)
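# data_helper.batch_iter is called throughout as batch_iter(data, batch_size, num_epochs[, shuffle]) but its
# body is not shown. A minimal sketch consistent with how the training loops above consume it (it yields
# numpy object arrays so that zip(*batch) and batch.shape[0] both work); the implementation details are
# assumed, not taken from the real data_helper:
import numpy as np

def batch_iter(data, batch_size, num_epochs, shuffle=True):
    data = list(data)
    num_batches = int(np.ceil(len(data) / float(batch_size)))
    for _ in range(num_epochs):
        order = np.random.permutation(len(data)) if shuffle else np.arange(len(data))
        for b in range(num_batches):
            idx = order[b * batch_size:(b + 1) * batch_size]
            batch = np.empty(len(idx), dtype=object)
            for j, i in enumerate(idx):
                batch[j] = data[i]
            yield batch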
def train_cnn_rnn(): print('------------------------------++ ', 'begin trainin') input_file = sys.argv[1] x_, y_, vocabulary, vocabulary_inv, df, labels = data_helper.load_data( input_file) training_config = sys.argv[2] params = json.loads(open(training_config, encoding='utf-8').read()) # Assign a 300 dimension vector to each word word_embeddings = data_helper.load_embeddings(vocabulary) embedding_mat = [ word_embeddings[word] for index, word in enumerate(vocabulary_inv) ] embedding_mat = np.array(embedding_mat, dtype=np.float32) # Split the original dataset into train set and test set x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.4) # Split the train set into train set and dev set x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.4) print('------------------------------++ ', 'end loading dataset') logging.info('x_train: {}, x_dev: {}, x_test: {}'.format( len(x_train), len(x_dev), len(x_test))) logging.info('y_train: {}, y_dev: {}, y_test: {}'.format( len(y_train), len(y_dev), len(y_test))) # Create a directory, everything related to the training will be saved in this directory timestamp = str(int(time.time())) #timestamp = '1524692100' builder = tf.saved_model.builder.SavedModelBuilder('./SavedModelB') trained_dir = './SavedModelB/' print('------------------------------++ ', trained_dir, 'created !') #os.makedirs(trained_dir) graph = tf.Graph() with graph.as_default(): print('------------------------------++ ', 'begin building graph') session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): print('------------------------------++ ', 'begin initializing our cnn object') a = max(vocabulary, key=lambda i: vocabulary[i]) nv = np.chararray((vocabulary[a] + 1), itemsize=40, unicode=True) nv[:] = '' for a in vocabulary.keys(): nv[vocabulary[a]] = a cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat, sequence_length=33, num_classes=y_train.shape[1], dictionnaire=nv, filter_sizes=map( int, params['filter_sizes'].split(","))) print('------------------------------++ ', 'End initializing our cnn object') global_step = tf.Variable(0, name='global_step', trainable=False) optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9) grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) print('------------------------------++ ', 'vars was intialized') # Checkpoint files will be saved in this directory during training checkpoint_dir = './SavedModelB/' #os.makedirs(checkpoint_dir) checkpoint_prefix = os.path.join(checkpoint_dir, 'model') def real_len(batches): return [ np.ceil( np.argmin(batch + [0]) * 1.0 / params['max_pool_size']) for batch in batches ] def train_step(x_batch, y_batch): print(len(x_batch), len(x_batch[0])) feed_dict = { cnn_rnn.input_x: x_batch, cnn_rnn.input_y: y_batch } _, step, loss, accuracy = sess.run( [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy], feed_dict) def dev_step(x_batch, y_batch): feed_dict = { cnn_rnn.input_x: x_batch, cnn_rnn.input_y: y_batch } step, loss, accuracy, num_correct, predictions = sess.run([ global_step, cnn_rnn.loss, cnn_rnn.accuracy, cnn_rnn.num_correct, cnn_rnn.predictions ], feed_dict) return accuracy, loss, num_correct, predictions saver = tf.train.Saver() sess.run(tf.global_variables_initializer()) print('------------------------------++ ', 'sess was initilaized with initializing all the variables') # Training starts here train_batches = 
data_helper.batch_iter(list(zip(x_train, y_train)), params['batch_size'], params['num_epochs']) best_accuracy, best_at_step = 0, 0 # Train the model with x_train and y_train print('------------------------------++ ', 'Step traing begin :') i = 0 rrr = 0 for train_batch in train_batches: x_train_batch, y_train_batch = zip(*train_batch) print( '--------------------------------------------------------++ ', 'nv step BEGININS i= ', i) i = i + 1 train_step(x_train_batch, y_train_batch) current_step = tf.train.global_step(sess, global_step) # Evaluate the model with x_dev and y_dev if current_step % params['evaluate_every'] == 0: dev_batches = data_helper.batch_iter( list(zip(x_dev, y_dev)), params['batch_size'], 1) total_dev_correct = 0 j = 0 for dev_batch in dev_batches: x_dev_batch, y_dev_batch = zip(*dev_batch) print( '--------------------------------------------------------++ ', 'nv step ININ j= ', j) acc, loss, num_dev_correct, predictions = dev_step( x_dev_batch, y_dev_batch) j = j + 1 total_dev_correct += num_dev_correct accuracy = float(total_dev_correct) / len(y_dev) logging.info('Accuracy on dev set: {}'.format(accuracy)) if accuracy >= best_accuracy: best_accuracy, best_at_step = accuracy, current_step path = saver.save(sess, checkpoint_prefix, global_step=current_step) logging.critical('Saved model {} at step {}'.format( path, best_at_step)) logging.critical('Best accuracy {} at step {}'.format( best_accuracy, best_at_step)) if rrr == 10: break else: rrr = rrr + 1 ####################################################################################################### logging.critical( 'Training is complete, testing the best model on x_test and y_test' ) # Save the model files to trained_dir. predict.py needs trained model files. saver.save(sess, trained_dir + "best_model.ckpt") # defining the signature of the graph to_input_x = tf.saved_model.utils.build_tensor_info( cnn_rnn.string_to_manipulate) to_predictions = tf.saved_model.utils.build_tensor_info( cnn_rnn.predictions) prediction_signature = ( tf.saved_model.signature_def_utils.build_signature_def( inputs={'to_input_x': to_input_x}, outputs={'to_predictions': to_predictions}, method_name=tf.saved_model.signature_constants. 
PREDICT_METHOD_NAME)) legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op') builder.add_meta_graph_and_variables( sess, [tf.saved_model.tag_constants.SERVING], signature_def_map={ tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: prediction_signature }, assets_collection=None, legacy_init_op=legacy_init_op) builder.save() # Evaluate x_test and y_test saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step)) test_batches = data_helper.batch_iter(list(zip(x_test, y_test)), params['batch_size'], 1, shuffle=False) total_test_correct = 0 for test_batch in test_batches: x_test_batch, y_test_batch = zip(*test_batch) acc, loss, num_test_correct, predictions = dev_step( x_test_batch, y_test_batch) total_test_correct += int(num_test_correct) logging.critical('Accuracy on test set: {}'.format( float(total_test_correct) / len(y_test))) # Save trained parameters and files since predict.py needs them with open(trained_dir + 'words_index.json', 'w', encoding='utf8') as outfile: json.dump(vocabulary, outfile, indent=4, ensure_ascii=False) with open(trained_dir + 'embeddings.pickle', 'wb') as outfile: pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL) with open(trained_dir + 'labels.json', 'w', encoding='utf8') as outfile: json.dump(labels, outfile, indent=4, ensure_ascii=False) params['sequence_length'] = x_train.shape[1] with open(trained_dir + 'trained_parameters.json', 'w', encoding='utf8') as outfile: json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)
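# A sketch of loading the SavedModel exported above for inference. The 'to_input_x' / 'to_predictions' keys
# and the './SavedModelB' export directory come from this snippet; the wrapper function itself is assumed:
import tensorflow as tf

def predict_with_saved_model(export_dir, raw_input):
    with tf.Session(graph=tf.Graph()) as sess:
        meta_graph = tf.saved_model.loader.load(
            sess, [tf.saved_model.tag_constants.SERVING], export_dir)
        signature = meta_graph.signature_def[
            tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
        input_name = signature.inputs['to_input_x'].name
        output_name = signature.outputs['to_predictions'].name
        # raw_input must match whatever cnn_rnn.string_to_manipulate expects in the exported graph.
        return sess.run(output_name, feed_dict={input_name: raw_input})

# e.g. predictions = predict_with_saved_model('./SavedModelB', some_input)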