def predict():
    trained_dir = './trained_results_1575177514/'  # previously taken from sys.argv[1]
    if request.method == 'POST':
        file = request.files['ReceivedFile']
        logging.critical('Received filename from app: {}'.format(file))
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
            test_file = './uploads/' + filename
            df = pd.read_csv(test_file)
            text = df[['Descript']]
            params, words_index, labels, embedding_mat = load_trained_params(trained_dir)
            flag = 0
            x_, y_, df = load_test_data(test_file, labels, flag)
        else:
            # text = request.json["text"]
            text = request.form["query"]
            params, words_index, labels, embedding_mat = load_trained_params(trained_dir)
            flag = 1
            x_, y_, df = load_test_data(text, labels, flag)
        # Both branches pad and index the input the same way.
        x_ = data_helper.pad_sentences(x_, forced_sequence_length=params['sequence_length'])
        x_ = map_word_to_index(x_, words_index)

    x_test, y_test = np.asarray(x_), None
    if y_ is not None:
        y_test = np.asarray(y_)

    timestamp = trained_dir.split('/')[-2].split('_')[-1]
    predicted_dir = './predicted_results_' + timestamp + '/'
    if os.path.exists(predicted_dir):
        shutil.rmtree(predicted_dir)
    os.makedirs(predicted_dir)

    with tf.Graph().as_default():
        session_conf = tf.compat.v1.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False)
        sess = tf.compat.v1.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(
                embedding_mat=embedding_mat,
                non_static=params['non_static'],
                hidden_unit=params['hidden_unit'],
                sequence_length=len(x_test[0]),
                max_pool_size=params['max_pool_size'],
                filter_sizes=list(map(int, params['filter_sizes'].split(","))),
                num_filters=params['num_filters'],
                num_classes=len(labels),
                embedding_size=params['embedding_dim'],
                l2_reg_lambda=params['l2_reg_lambda'])

            def real_len(batches):
                return [np.ceil(np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                        for batch in batches]

            def predict_step(x_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                predictions = sess.run([cnn_rnn.predictions], feed_dict)
                return predictions

            checkpoint_file = trained_dir + 'best_model.ckpt'
            # import_meta_graph returns a Saver; the redundant Saver
            # construction that preceded it has been dropped.
            saver = tf.compat.v1.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)
            logging.critical('{} has been loaded'.format(checkpoint_file))

            batches = data_helper.batch_iter(list(x_test), params['batch_size'], 1, shuffle=False)

            predictions, predict_labels = [], []
            for x_batch in batches:
                batch_predictions = predict_step(x_batch)[0]
                for batch_prediction in batch_predictions:
                    predictions.append(batch_prediction)
                    predict_labels.append(labels[batch_prediction])

            logging.critical('Prediction is complete; class belongs to: {}'.format(predict_labels[0]))
            # if os.path.exists(test_file):
            #     os.remove(test_file)
            return render_template('results.html', prediction=predict_labels[0], name=text)
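# The handler above relies on an allowed_file() helper that is not shown in
# this file. A minimal sketch, assuming the usual Flask upload-whitelist
# pattern; ALLOWED_EXTENSIONS is a hypothetical name, restricted to CSV here
# because the handler reads the upload with pd.read_csv:
ALLOWED_EXTENSIONS = {'csv'}

def allowed_file(filename):
    # Accept only filenames whose extension is in the whitelist.
    return '.' in filename and \
        filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS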
def train():
    config = Config()
    data_loader = Loader()
    x_raw, y_raw, word_to_id, labels = data_loader.load_data('data/companies.jsons')
    word_embeddings = data_loader.load_embeddings(config.embedding_size)
    id_to_word = data_loader.invert_vocab(word_to_id)
    embedding_mat = [word_embeddings[word] for index, word in enumerate(word_to_id)]
    embedding_mat = np.array(embedding_mat, dtype=np.float32)

    x, x_test, y, y_test = train_test_split(x_raw, y_raw, test_size=0.1)
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1)

    timestamp = str(int(time.time()))
    trained_dir = 'save/trained_results' + timestamp + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 sequence_length=x_train.shape[1],
                                 num_classes=y_train.shape[1],
                                 non_static=config.non_static,
                                 hidden_unit=config.hidden_size,
                                 max_pool_size=config.max_pool_size,
                                 filter_sizes=list(map(int, config.filter_sizes.split(","))),
                                 num_filters=config.num_filters,
                                 embedding_size=config.embedding_size,
                                 l2_reg_lambda=config.l2_reg_lambda)

            global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            checkpoint_dir = 'save/checkpoints' + timestamp + '/'
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
                # Position of the first padding token (index 0), scaled to pooled steps.
                return [np.ceil(np.argmin(batch + [0]) * 1.0 / config.max_pool_size)
                        for batch in batches]

            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: config.dropout_keep_prob,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, config.embedding_size, 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                _, step, loss, accuracy, num_correct = sess.run(
                    [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy, cnn_rnn.num_correct],
                    feed_dict)
                return num_correct

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, config.embedding_size, 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                step, loss, accuracy, num_correct, predictions = sess.run(
                    [global_step, cnn_rnn.loss, cnn_rnn.accuracy,
                     cnn_rnn.num_correct, cnn_rnn.predictions], feed_dict)
                return accuracy, loss, num_correct, predictions

            saver = tf.train.Saver(tf.all_variables())
            sess.run(tf.initialize_all_variables())

            train_batches = batch_iter(list(zip(x_train, y_train)), config.batch_size, config.epochs)
            best_accuracy, best_at_step = 0, 0  # fixed typo: was best_at_stp
            total_train_correct = 0

            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                num_correct = train_step(x_train_batch, y_train_batch)
                total_train_correct += num_correct
                current_step = tf.train.global_step(sess, global_step)
                accuracy_train = float(num_correct) / len(y_train_batch)
                print("train accuracy ", accuracy_train)

                # Evaluate the model with x_dev and y_dev
                if current_step % config.evaluate_every == 0:
                    dev_batches = batch_iter(list(zip(x_dev, y_dev)), config.batch_size, 1)
                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        acc, loss, num_dev_correct, predictions = dev_step(x_dev_batch, y_dev_batch)
                        total_dev_correct += num_dev_correct
                    accuracy = float(total_dev_correct) / len(y_dev)
                    print(accuracy)
                    if accuracy >= best_accuracy:
                        best_accuracy, best_at_step = accuracy, current_step
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                    print('Best accuracy : ', best_accuracy)

            saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
            test_batches = batch_iter(list(zip(x_test, y_test)), 1, 1, shuffle=False)
            total_test_correct = 0
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                acc, loss, num_test_correct, predictions = dev_step(x_test_batch, y_test_batch)
                total_test_correct += int(num_test_correct)
            print('Accuracy on test set ', float(total_test_correct) / len(y_test))
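# A note on the real_len() helper that every function in this file repeats:
# it assumes padding tokens have word index 0, so np.argmin(batch + [0])
# returns the position of the first zero, i.e. the unpadded sentence length
# (the appended [0] guarantees a zero exists even for full-length rows), and
# dividing by max_pool_size converts token counts into pooled-step counts.
# A tiny worked example with illustrative values:
import numpy as np

batch = [4, 9, 7, 0, 0, 0]   # 3 real tokens followed by padding (index 0)
max_pool_size = 4
real = np.ceil(np.argmin(batch + [0]) * 1.0 / max_pool_size)
print(real)                  # 1.0 pooled step covers the 3 real tokens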
def test_data():
    trained_dir = sys.argv[1]
    if not trained_dir.endswith('/'):
        trained_dir += '/'
    test_file = sys.argv[2]

    params, words_index, labels, embedding_mat = load_trained_params(trained_dir)
    x_, y_, df = load_test_data(test_file, labels)
    x_ = data_helper.pad_sentences(x_, forced_sequence_length=params['sequence_length'])
    x_ = map_word_to_index(x_, words_index)

    x_test, y_test = np.asarray(x_), None
    if y_ is not None:
        y_test = np.asarray(y_)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # logging.info(params)
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 sequence_length=len(x_test[0]),
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=list(map(int, params['filter_sizes'].split(","))),
                                 num_filters=params['num_filters'],
                                 num_classes=len(labels),
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            def real_len(batches):
                return [np.ceil(np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                        for batch in batches]

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                loss, num_correct, predictions = sess.run(
                    [cnn_rnn.loss, cnn_rnn.num_correct, cnn_rnn.predictions], feed_dict)
                return loss, num_correct, predictions

            checkpoint_file = trained_dir + 'model-4'
            saver = tf.train.Saver(tf.global_variables())
            logging.info(checkpoint_file)
            saver.restore(sess, checkpoint_file)
            logging.critical('{} has been loaded'.format(checkpoint_file))

            total_test_correct = 0
            loss, num_test_correct, predictions = dev_step(x_test, y_test)
            logging.info(num_test_correct)
            logging.info(predictions)
            total_test_correct += int(num_test_correct)
            logging.critical('Accuracy on test set: {}'.format(
                float(total_test_correct) / len(y_test)))
param_length = 40
trained_dir = "trained_results/"
params, words_index, labels, embedding_mat = load_trained_params(trained_dir)

with tf.Graph().as_default():
    session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        cnn_rnn = TextCNNRNN(
            embedding_mat=embedding_mat,
            non_static=params['non_static'],
            hidden_unit=params['hidden_unit'],
            sequence_length=param_length,
            max_pool_size=params['max_pool_size'],
            filter_sizes=list(map(int, params['filter_sizes'].split(","))),
            num_filters=params['num_filters'],
            num_classes=len(labels),
            embedding_size=params['embedding_dim'],
            l2_reg_lambda=params['l2_reg_lambda'])

        checkpoint_file = trained_dir + 'model-3100'
        # import_meta_graph returns a Saver; the redundant Saver
        # construction that preceded it has been dropped.
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)
        logging.critical('{} has been loaded'.format(checkpoint_file))

        def real_len(batches):
            return [np.ceil(np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                    for batch in batches]
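# load_trained_params() is called throughout this file but defined elsewhere.
# A minimal sketch, assuming it simply reads back the four artifacts that the
# training scripts below write into trained_dir (trained_parameters.json,
# words_index.json, labels.json, embeddings.pickle); the function body is an
# assumption, only the file names are confirmed by the training code:
import json
import pickle

def load_trained_params(trained_dir):
    with open(trained_dir + 'trained_parameters.json') as f:
        params = json.load(f)
    with open(trained_dir + 'words_index.json') as f:
        words_index = json.load(f)
    with open(trained_dir + 'labels.json') as f:
        labels = json.load(f)
    with open(trained_dir + 'embeddings.pickle', 'rb') as f:
        embedding_mat = pickle.load(f)
    return params, words_index, labels, embedding_mat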
def predict_unseen_data():
    test_x = []
    # test_input = os.environ.get('TEST_X', None)
    test_input = "What time is the class"
    if test_input is None:
        logging.critical(' TEST_X is not found ')
        sys.exit()
    test_x.append(test_input.split(' '))

    trained_dir = "trained_results_1512435063"  # os.environ.get('TRAINED_RESULTS', None)
    if trained_dir is None:
        logging.critical(' TRAINED_RESULTS is not found ')
        sys.exit()
    if not trained_dir.endswith('/'):
        trained_dir += '/'

    # params, words_index, labels and embedding_mat are module-level globals
    # loaded via load_trained_params in the snippet above.
    x_ = data_helper.pad_sentences(test_x, forced_sequence_length=params['sequence_length'])
    x_ = map_word_to_index(x_, words_index)
    x_test, y_test = np.asarray(x_), None
    timestamp = trained_dir.split('/')[-2].split('_')[-1]

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(
                embedding_mat=embedding_mat,
                non_static=params['non_static'],
                hidden_unit=params['hidden_unit'],
                sequence_length=len(x_test[0]),
                max_pool_size=params['max_pool_size'],
                filter_sizes=list(map(int, params['filter_sizes'].split(","))),
                num_filters=params['num_filters'],
                num_classes=len(labels),
                embedding_size=params['embedding_dim'],
                l2_reg_lambda=params['l2_reg_lambda'])

            def real_len(batches):
                return [np.ceil(np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                        for batch in batches]

            def predict_step(x_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                scores, predictions = sess.run([cnn_rnn.scores, cnn_rnn.predictions], feed_dict)
                return scores, predictions

            checkpoint_file = trained_dir + 'best_model.ckpt'
            # import_meta_graph returns a Saver; the redundant Saver
            # construction that preceded it has been dropped.
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)
            logging.critical('{} has been loaded'.format(checkpoint_file))

            batches = data_helper.batch_iter(list(x_test), params['batch_size'], 1, shuffle=False)

            response = ""
            predictions, predict_labels = [], []
            for x_batch in batches:
                scores, batch_predictions = predict_step(x_batch)
                print(scores)
                score = normalize(scores[0])
                print(score)
                print(score.max())
                mscore = score.max()
                range_perc = 0.01
                max_range = mscore + (mscore * range_perc)
                min_range = mscore - (mscore * range_perc)
                # Collect scores within 1% of the maximum (the original loop
                # body was incomplete; they are gathered but otherwise unused).
                near_max = [s for s in score if min_range < s < max_range]
                max_score = score.max()
                if max_score > 0.1:
                    print(scores)
                    for batch_prediction in batch_predictions:
                        predictions.append(batch_prediction)
                        predict_labels.append(labels[batch_prediction])
                    response = predict_labels[0]
                else:
                    response = "Fall back!"

            sys.stdout.write(response)
            print(response)
            os.environ['PRED_LABEL'] = response
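# normalize() above is not defined in this snippet. A plausible sketch,
# assuming it mirrors the softmax-style proba() helper used by the last
# prediction script in this file (exp-normalise the raw scores to a
# probability-like distribution); this is an assumption, not confirmed code:
import numpy as np

def normalize(x):
    # Softmax with max-subtraction for numerical stability.
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()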
def demo_cnn_rnn(demo_model):
    # Load training parameters
    params, words_index, labels, embedding_mat = load_trained_params(
        'data_path_save/cnn_rnn_' + demo_model + '/trained_results/')
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 sequence_length=params['sequence_length'],
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=list(map(int, params['filter_sizes'].split(","))),
                                 num_filters=params['num_filters'],
                                 num_classes=len(labels),
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            def real_len(batches):
                return [np.ceil(np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                        for batch in batches]

            def predict_step(x_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch)
                }
                predictions = sess.run([cnn_rnn.predictions], feed_dict=feed_dict)
                return predictions

            checkpoint_file = tf.train.latest_checkpoint(
                'data_path_save/cnn_rnn_' + demo_model + '/checkpoints/')
            # import_meta_graph returns a Saver; the redundant Saver
            # construction that preceded it has been dropped.
            saver = tf.train.import_meta_graph('{}.meta'.format(checkpoint_file))
            saver.restore(sess, checkpoint_file)
            logging.critical('{} has been loaded'.format(checkpoint_file))

            while True:
                print('Please input your sentence:')
                input_sentence = input()
                if input_sentence == '' or input_sentence.isspace():
                    print('See you next time!')
                    break
                x_ = data_helper.clean_str(input_sentence).split(' ')
                # Prediction: cut off the sentence if it is longer than the sequence length
                sequence_length = params['sequence_length']
                num_padding = sequence_length - len(x_)
                if num_padding < 0:
                    logging.info('This sentence has to be cut off because it is '
                                 'longer than the trained sequence length')
                    padded_sentence = x_[0:sequence_length]
                else:
                    padded_sentence = x_ + ['<PAD/>'] * num_padding
                # Map words to indices; unknown words map to index 0
                temp = []
                for word in padded_sentence:
                    if word in words_index:
                        temp.append(words_index[word])
                    else:
                        temp.append(0)
                temp = np.asarray(temp)
                x_test = np.expand_dims(temp, axis=0)
                prediction = predict_step(x_test)[0][0]
                predicted_label = labels[prediction]
                print('\nDisease category: ' + predicted_label + '\n')
def train_cnn_rnn():
    input_file = "logstashTemp.dat"
    output_file = "logstash.csv"

    # Convert the JSON-lines log dump into a CSV with a header row.
    dataList = []
    with open(input_file, 'r', encoding='utf8') as logFile:
        for row in logFile:
            dataList.append(json.loads(row))
    keyList = list(dataList[0].keys())
    csvList = [[keyItem for keyItem in keyList]]
    for row in dataList:
        if "severity" in list(row.keys()):
            tempRow = [row[keyItem] for keyItem in keyList if keyItem in list(row.keys())]
            csvList.append(tempRow)
    with open(output_file, "w+", encoding="utf8") as csvFile:
        myWriter = csv.writer(csvFile)  # create the writer once, not per row
        for row in csvList:
            myWriter.writerow(row)

    x_, y_, vocabulary, vocabulary_inv, df, labels = data_helper.load_data(output_file, 20000)

    training_config = "training_config.json"
    params = json.loads(open(training_config).read())

    # Assign a 300 dimension vector to each word
    word_embeddings = data_helper.load_embeddings(vocabulary)
    embedding_mat = [word_embeddings[word] for index, word in enumerate(vocabulary_inv)]
    embedding_mat = np.array(embedding_mat, dtype=np.float32)

    # Split the original dataset into train set and test set
    x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1)
    # Split the train set into train set and dev set
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1)
    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(len(y_train), len(y_dev), len(y_test)))

    # Create a directory; everything related to the training will be saved in it
    timestamp = str(int(time.time()))
    trained_dir = './trained_results_' + timestamp + '/'
    print(trained_dir)
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 sequence_length=x_train.shape[1],
                                 num_classes=y_train.shape[1],
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=list(map(int, params['filter_sizes'].split(","))),
                                 num_filters=params['num_filters'],
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # Checkpoint files will be saved in this directory during training
            checkpoint_dir = './checkpoints_' + timestamp + '/'
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
                return [np.ceil(np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                        for batch in batches]

            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: params['dropout_keep_prob'],
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy], feed_dict)

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                step, loss, accuracy, num_correct, predictions = sess.run(
                    [global_step, cnn_rnn.loss, cnn_rnn.accuracy,
                     cnn_rnn.num_correct, cnn_rnn.predictions], feed_dict)
                return accuracy, loss, num_correct, predictions

            saver = tf.train.Saver()
            sess.run(tf.global_variables_initializer())

            # Training starts here
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                                   params['batch_size'],
                                                   params['num_epochs'])
            best_accuracy, best_at_step = 0, 0

            # Train the model with x_train and y_train
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                # Evaluate the model with x_dev and y_dev
                if current_step % params['evaluate_every'] == 0:
                    dev_batches = data_helper.batch_iter(list(zip(x_dev, y_dev)),
                                                         params['batch_size'], 1)
                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        acc, loss, num_dev_correct, predictions = dev_step(x_dev_batch, y_dev_batch)
                        total_dev_correct += num_dev_correct
                    accuracy = float(total_dev_correct) / len(y_dev)
                    logging.info('Accuracy on dev set: {}'.format(accuracy))

                    if accuracy >= best_accuracy:
                        print("accuracy:", accuracy)  # repaired mojibake; originally Chinese for "accuracy:"
                        best_accuracy, best_at_step = accuracy, current_step
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        logging.critical('Saved model {} at step {}'.format(path, best_at_step))
                        logging.critical('Best accuracy {} at step {}'.format(best_accuracy, best_at_step))

            logging.critical('Training is complete, testing the best model on x_test and y_test')

            # Save the model files to trained_dir. predict.py needs trained model files.
            saver.save(sess, trained_dir + "best_model.ckpt")

            # Evaluate x_test and y_test
            saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)),
                                                  params['batch_size'], 1, shuffle=False)
            total_test_correct = 0
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                acc, loss, num_test_correct, predictions = dev_step(x_test_batch, y_test_batch)
                total_test_correct += int(num_test_correct)
            logging.critical('Accuracy on test set: {}'.format(
                float(total_test_correct) / len(y_test)))

    # Save trained parameters and files since predict.py needs them
    with open(trained_dir + 'words_index.json', 'w') as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
    with open(trained_dir + 'embeddings.pickle', 'wb') as outfile:
        pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
    with open(trained_dir + 'labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4, ensure_ascii=False)
    params['sequence_length'] = x_train.shape[1]
    with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
        json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)
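# data_helper.batch_iter(data, batch_size, num_epochs, shuffle=True) is used
# by every script here but defined elsewhere. A minimal sketch of the assumed
# behaviour (shuffle once per epoch, then yield contiguous slices); the body
# is an assumption based on how the call sites consume it:
import numpy as np

def batch_iter(data, batch_size, num_epochs, shuffle=True):
    data = np.array(data, dtype=object)
    data_size = len(data)
    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for epoch in range(num_epochs):
        if shuffle:
            data = data[np.random.permutation(data_size)]
        for batch_num in range(num_batches_per_epoch):
            start = batch_num * batch_size
            end = min((batch_num + 1) * batch_size, data_size)
            yield data[start:end]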
def train_cnn_rnn():
    x_, y_, x_test, y_test, vocabulary, vocabulary_inv, labels = data_helper.load_data()
    # x_, y_, vocabulary, vocabulary_inv, labels = data_helper.load_data_book()
    training_config = 'training_config.json'
    params = json.loads(open(training_config).read())

    # Assign a 300 dimension vector to each word
    word_embeddings = data_helper.load_embeddings(vocabulary)
    embedding_mat = []
    for i in range(len(vocabulary_inv)):
        embedding_mat.append(word_embeddings[vocabulary_inv[i]])
    embedding_mat = np.array(embedding_mat, dtype=np.float32)

    # Split the train set into train set and dev set
    # IMDB style:
    # x_train, x_dev, y_train, y_dev = train_test_split(x_, y_, test_size=0.1)
    # Book data style:
    # x_, x_test, y_, y_test = train_test_split(x_, y_, test_size=0.1)
    x_train, x_dev, y_train, y_dev = train_test_split(x_, y_, test_size=0.1)

    # Create a directory; everything related to the training will be saved in it
    timestamp = str(int(time.time()))
    trained_dir = './trained_results_' + timestamp + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 sequence_length=x_train.shape[1],
                                 num_classes=y_train.shape[1],
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=[int(x) for x in params['filter_sizes'].split(",")],
                                 num_filters=params['num_filters'],
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            global_step = tf.Variable(0, name='global_step', trainable=False)
            # optimizer = tf.train.MomentumOptimizer(0.1, 0.9)
            optimizer = tf.train.AdamOptimizer()
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name),
                                                         tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn_rnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn_rnn.accuracy)

            # Train summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

            # Checkpoint files will be saved in this directory during training
            checkpoint_dir = './checkpoints_' + timestamp + '/'
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)

            def real_len(batches):
                return [np.ceil(np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                        for batch in batches]

            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: params['dropout_keep_prob'],
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                summaries, _, step, loss, accuracy = sess.run(
                    [train_summary_op, train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                # print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                train_summary_writer.add_summary(summaries, step)
                return accuracy

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                summaries, step, loss, accuracy, num_correct, predictions = sess.run(
                    [dev_summary_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy,
                     cnn_rnn.num_correct, cnn_rnn.predictions], feed_dict)
                dev_summary_writer.add_summary(summaries, step)
                print("step {}, loss {:g}, acc {:g}".format(step, loss, accuracy))
                # Return all four values so the test loop below can unpack them;
                # the original returned only (accuracy, predictions), which broke
                # the final evaluation.
                return accuracy, loss, num_correct, predictions

            sess.run(tf.global_variables_initializer())

            # Training starts here
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                                   params['batch_size'], params['num_epochs'])
            best_dev_accuracy, best_at_step = 0, 0
            best_test_accuracy = 0

            # Train the model with x_train and y_train
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                train_acc = train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                # Evaluate the model with x_dev and y_dev
                if current_step % params['evaluate_every'] == 0:
                    print("Training Accuracy:", train_acc, end=' ')
                    print("Evaluation:", end=' ')
                    dev_acc, _, _, _ = dev_step(x_dev, y_dev)
                    print("Test:", end=' ')
                    test_acc_tmp, _, _, pred__ = dev_step(x_test, y_test)
                    # with open('results/prediction' + str(current_step), 'bw') as f:
                    #     pickle.dump(pred__, f)
                    if dev_acc > best_dev_accuracy:
                        best_dev_accuracy = dev_acc
                        best_test_accuracy = test_acc_tmp
                        # Save a checkpoint and record the step; without this the
                        # restore below would look for a checkpoint that was
                        # never written (best_at_step stayed 0).
                        best_at_step = current_step
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                    print('best dev accuracy is', best_dev_accuracy,
                          'the test is', best_test_accuracy)

            print('Training is complete, testing the best model on x_test and y_test')

            # Evaluate x_test and y_test
            saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)),
                                                  params['batch_size'], 1, shuffle=False)
            total_test_correct = 0
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                acc, loss, num_test_correct, predictions = dev_step(x_test_batch, y_test_batch)
                total_test_correct += int(num_test_correct)
            logging.critical('Accuracy on test set: {}'.format(
                float(total_test_correct) / len(y_test)))

    # Save trained parameters and files since predict.py needs them
    with open(trained_dir + 'words_index.json', 'w') as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
    with open(trained_dir + 'embeddings.pickle', 'wb') as outfile:
        pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
    with open(trained_dir + 'labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4, ensure_ascii=False)
    # os.rename(path, trained_dir + 'best_model.ckpt')
    # os.rename(path + '.meta', trained_dir + 'best_model.meta')
    shutil.rmtree(checkpoint_dir)
    logging.critical('{} has been removed'.format(checkpoint_dir))
    params['sequence_length'] = x_train.shape[1]
    with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
        json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)
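# map_word_to_index() is used by the prediction paths in this file but
# defined elsewhere. A sketch consistent with the inline lookup in
# demo_cnn_rnn above (known words map to their vocabulary index, unknown
# words fall back to 0, the padding index):
def map_word_to_index(examples, words_index):
    x_ = []
    for example in examples:
        x_.append([words_index[word] if word in words_index else 0
                   for word in example])
    return x_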
def train_cnn_rnn():
    input_file = sys.argv[1]
    x_, y_ = data_helper.load_data(input_file)

    config = sys.argv[2]
    params = json.loads(open(config).read())
    params['embedding_dim'] = params['tick_size'] * params['feature_size']

    # Split the original dataset into train set and test set
    x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1)
    # Split the train set into train set and dev set
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1)
    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(len(y_train), len(y_dev), len(y_test)))

    # Create a directory; everything related to the training will be saved in it
    timestamp = str(int(time.time()))
    trained_dir = './result/trained_results_' + timestamp + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Note: this variant feeds raw tick features, so no embedding_mat is passed.
            cnn_rnn = TextCNNRNN(sequence_length=x_train.shape[1],
                                 num_classes=y_train.shape[1],
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=list(map(int, params['filter_sizes'].split(","))),
                                 num_filters=params['num_filters'],
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # Checkpoint files will be saved in this directory during training
            checkpoint_dir = './result/checkpoints_' + timestamp + '/'
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
                return [np.ceil(np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                        for batch in batches]

            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: params['dropout_keep_prob'],
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy], feed_dict)

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                step, loss, accuracy, num_correct, predictions = sess.run(
                    [global_step, cnn_rnn.loss, cnn_rnn.accuracy,
                     cnn_rnn.num_correct, cnn_rnn.predictions], feed_dict)
                return accuracy, loss, num_correct, predictions

            saver = tf.train.Saver(tf.all_variables())
            sess.run(tf.initialize_all_variables())

            # Training starts here
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                                   params['batch_size'], params['num_epochs'])
            best_accuracy, best_at_step = 0, 0

            # Train the model with x_train and y_train
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                # Evaluate the model with x_dev and y_dev
                if current_step % params['evaluate_every'] == 0:
                    dev_batches = data_helper.batch_iter(list(zip(x_dev, y_dev)),
                                                         params['batch_size'], 1)
                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        acc, loss, num_dev_correct, predictions = dev_step(x_dev_batch, y_dev_batch)
                        total_dev_correct += num_dev_correct
                    accuracy = float(total_dev_correct) / len(y_dev)
                    logging.info('Accuracy on dev set: {}'.format(accuracy))
                    if accuracy >= best_accuracy:
                        best_accuracy, best_at_step = accuracy, current_step
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        logging.critical('Saved model {} at step {}'.format(path, best_at_step))
                        logging.critical('Best accuracy {} at step {}'.format(best_accuracy, best_at_step))

            logging.critical('Training is complete, testing the best model on x_test and y_test')

            # Evaluate x_test and y_test
            saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)),
                                                  params['batch_size'], 1, shuffle=False)
            total_test_correct = 0
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                acc, loss, num_test_correct, predictions = dev_step(x_test_batch, y_test_batch)
                total_test_correct += int(num_test_correct)
            logging.critical('Accuracy on test set: {}'.format(
                float(total_test_correct) / len(y_test)))

    os.rename(path + '.index', trained_dir + 'best_model.ckpt')
    os.rename(path + '.meta', trained_dir + 'best_model.meta')
    shutil.rmtree(checkpoint_dir)
    logging.critical('{} has been removed'.format(checkpoint_dir))
    params['sequence_length'] = x_train.shape[1]
    with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
        json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)
def train_cnn_rnn():
    input_file = sys.argv[1]
    # Reuse cached preprocessing artifacts when all of them exist.
    if os.path.exists('./data/x.p') and \
       os.path.exists('./data/y.p') and \
       os.path.exists('./data/vocabulary.p') and \
       os.path.exists('./data/vocabulary_inv.p') and \
       os.path.exists('./data/labels.p'):
        x_ = pickle.load(open("./data/x.p", "rb"))
        y_ = pickle.load(open("./data/y.p", "rb"))
        vocabulary = pickle.load(open("./data/vocabulary.p", "rb"))
        vocabulary_inv = pickle.load(open("./data/vocabulary_inv.p", "rb"))
        labels = pickle.load(open("./data/labels.p", "rb"))
    else:
        x_, y_, vocabulary, vocabulary_inv, _, labels = data_helper.load_data(input_file)

    training_config = sys.argv[2]
    params = json.loads(open(training_config).read())

    # Assign an n dimension vector to each word
    word_embeddings = data_helper.load_embeddings(vocabulary, dim=params['embedding_dim'])
    embedding_mat = [word_embeddings[word] for index, word in enumerate(vocabulary_inv)]
    embedding_mat = np.array(embedding_mat, dtype=np.float32)

    # Split the original dataset into train set and test set
    x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1)
    # Split the train set into train set and dev set
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1)
    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(len(y_train), len(y_dev), len(y_test)))

    # Create a directory; everything related to the training will be saved in it
    timestamp = str(int(time.time()))
    trained_dir = './trained_results_' + timestamp + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 sequence_length=x_train.shape[1],
                                 num_classes=y_train.shape[1],
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=list(map(int, params['filter_sizes'].split(","))),
                                 num_filters=params['num_filters'],
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            global_step = tf.Variable(0, name='global_step', trainable=False)
            # optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            optimizer = tf.train.AdamOptimizer(learning_rate=0.0005, beta1=0.9, beta2=0.999,
                                               epsilon=1e-08, use_locking=False, name='Adam')
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # Checkpoint files will be saved in this directory during training
            checkpoint_dir = './checkpoints_' + timestamp + '/'
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
                return [np.ceil(np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                        for batch in batches]

            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: params['dropout_keep_prob'],
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy], feed_dict)

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                step, loss, accuracy, num_correct, predictions = sess.run(
                    [global_step, cnn_rnn.loss, cnn_rnn.accuracy,
                     cnn_rnn.num_correct, cnn_rnn.predictions], feed_dict)
                return accuracy, loss, num_correct, predictions

            saver = tf.train.Saver(tf.global_variables())
            sess.run(tf.global_variables_initializer())

            # Training starts here
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                                   params['batch_size'], params['num_epochs'])
            best_accuracy, best_at_step = 0, 0

            # Train the model with x_train and y_train
            i = 0
            for train_batch in train_batches:
                logging.info('Training on batch: {}'.format(i))
                x_train_batch, y_train_batch = zip(*train_batch)
                train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                # Evaluate the model with x_dev and y_dev
                if current_step % params['evaluate_every'] == 0:
                    dev_batches = data_helper.batch_iter(list(zip(x_dev, y_dev)),
                                                         params['batch_size'], 1)
                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        acc, loss, num_dev_correct, predictions = dev_step(x_dev_batch, y_dev_batch)
                        total_dev_correct += num_dev_correct
                    accuracy = float(total_dev_correct) / len(y_dev)
                    logging.info('Accuracy on dev set: {}'.format(accuracy))
                    if accuracy >= best_accuracy:
                        best_accuracy, best_at_step = accuracy, current_step
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        logging.critical('Saved model {} at step {}'.format(path, best_at_step))
                        logging.critical('Best accuracy {} at step {}'.format(best_accuracy, best_at_step))
                i += 1

            logging.critical('Training is complete, testing the best model on x_test and y_test')

            # Evaluate x_test and y_test
            saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)),
                                                  params['batch_size'], 1, shuffle=False)
            total_test_correct = 0
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                acc, loss, num_test_correct, predictions = dev_step(x_test_batch, y_test_batch)
                total_test_correct += int(num_test_correct)
            logging.critical('Accuracy on test set: {}'.format(
                float(total_test_correct) / len(y_test)))

    # Save trained parameters and files since predict.py needs them
    with open(trained_dir + 'words_index.json', 'w') as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
    with open(trained_dir + 'embeddings.pickle', 'wb') as outfile:
        pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
    with open(trained_dir + 'labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4, ensure_ascii=False)
    os.rename(path, trained_dir + 'best_model.ckpt')
    os.rename(path + '.meta', trained_dir + 'best_model.meta')
    shutil.rmtree(checkpoint_dir)
    logging.critical('{} has been removed'.format(checkpoint_dir))
    params['sequence_length'] = x_train.shape[1]
    with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
        json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)
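# data_helper.pad_sentences() is assumed to follow the same convention as the
# inline padding in demo_cnn_rnn above: pad with '<PAD/>' up to the forced
# length and truncate sentences that are too long. A minimal sketch under
# that assumption:
def pad_sentences(sentences, forced_sequence_length=None, padding_word='<PAD/>'):
    if forced_sequence_length is None:
        forced_sequence_length = max(len(s) for s in sentences)
    padded = []
    for sentence in sentences:
        num_padding = forced_sequence_length - len(sentence)
        if num_padding < 0:
            # Sentence is longer than the trained sequence length: cut it off.
            padded.append(sentence[:forced_sequence_length])
        else:
            padded.append(sentence + [padding_word] * num_padding)
    return padded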
def train_cnn_rnn():
    input_file = "logstashTemp.dat"
    output_file = "wcData85_1.csv"
    # with open(input_file, "r", encoding="utf8") as datFile:
    #     jsonDict = json.loads(datFile.readline())
    # with open(input_file, "r", encoding="utf8") as datFile:
    #     jsonDf = pd.DataFrame([], columns=list(jsonDict.keys()))
    #     rowNO = 0
    #     for row in datFile.readlines():
    #         try:
    #             jsonDf.loc[rowNO] = list(json.loads(row).values())
    #         except json.decoder.JSONDecodeError as ex:
    #             print(ex)
    #         rowNO += 1
    #     jsonDf.to_csv(output_file)

    print("loading data...")
    x_, y_, vocabulary, vocabulary_inv, df, labels = data_helper.load_data3(
        output_file, ["crit", "err"], 10000)
    # print("y_:", y_)

    training_config = "training_config.json"
    params = json.loads(open(training_config).read())

    # Assign a 300 dimension vector to each word
    word_embeddings = data_helper.load_embeddings(vocabulary)
    embedding_mat = [word_embeddings[word] for index, word in enumerate(vocabulary_inv)]
    embedding_mat = np.array(embedding_mat, dtype=np.float32)

    # Split the original dataset into train set and test set
    x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1)
    # Split the train set into train set and dev set
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1)
    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(len(y_train), len(y_dev), len(y_test)))

    # Create a directory; everything related to the training will be saved in it
    timestamp = str(int(time.time()))
    trained_dir = './trained_results_' + timestamp + '/'
    print(trained_dir)
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 sequence_length=x_train.shape[1],
                                 num_classes=y_train.shape[1],
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=list(map(int, params['filter_sizes'].split(","))),
                                 num_filters=params['num_filters'],
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            # global_step is incremented each time optimizer.apply_gradients() runs.
            global_step = tf.Variable(0, name='global_step', trainable=False)
            # RMSProp starts from a learning rate of 1e-3; with decay it follows
            # the formula: decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)
            optimizer = tf.train.RMSPropOptimizer(learning_rate=1e-3, decay=0.9)
            # Compute the gradients of the loss...
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            # ...and apply them to the variables.
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # Checkpoint files will be saved in this directory during training
            checkpoint_dir = './checkpoints_' + timestamp + '/'
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
                return [np.ceil(np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                        for batch in batches]

            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: params['dropout_keep_prob'],
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy], feed_dict)
                print(step, "trainAccuracy", accuracy)
                with open("trainLogCsv.txt", "a+", encoding="utf8") as trainLogFile:
                    trainLogFile.write("=========" + str(step) + "=========\n")
                    trainLogFile.write("acc:" + str(accuracy) + "\n")
                    trainLogFile.write("loss:" + str(loss) + "\n")

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                step, loss, accuracy, num_correct, predictions = sess.run(
                    [global_step, cnn_rnn.loss, cnn_rnn.accuracy,
                     cnn_rnn.num_correct, cnn_rnn.predictions], feed_dict)
                return accuracy, loss, num_correct, predictions

            saver = tf.train.Saver()
            sess.run(tf.global_variables_initializer())
            filter_writer = tf.summary.FileWriter('/path/to/logs', sess.graph)

            # Training starts here
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                                   params['batch_size'],
                                                   params['num_epochs'])
            best_accuracy, best_at_step = 0, 0

            # Train the model with x_train and y_train
            for train_batch in train_batches:
                if len(train_batch) > 0:
                    x_train_batch, y_train_batch = zip(*train_batch)
                    train_step(x_train_batch, y_train_batch)
                    current_step = tf.train.global_step(sess, global_step)

                    # Evaluate the model with x_dev and y_dev
                    if current_step % params['evaluate_every'] == 0:
                        dev_batches = data_helper.batch_iter(list(zip(x_dev, y_dev)),
                                                             params['batch_size'], 1)
                        total_dev_correct = 0
                        # Renamed from y_dev to avoid clobbering the outer dev set.
                        y_dev_list = []
                        y_pre = []
                        for dev_batch in dev_batches:
                            if len(dev_batch) > 0:
                                x_dev_batch, y_dev_batch = zip(*dev_batch)
                                acc, loss, num_dev_correct, predictions = dev_step(
                                    x_dev_batch, y_dev_batch)
                                y_pre += predictions.tolist()
                                y_dev_list += list(y_dev_batch)
                                total_dev_correct += num_dev_correct
                        # One-hot rows -> integer class labels
                        y_devs = [item.tolist().index(max(item.tolist()))
                                  for item in y_dev_list]
                        # print("y_pre:", y_pre)
                        # print("y_devs:", y_devs)
                        devRecall, devPrecision = getRP(y_pre, y_devs)
                        logging.info('Recall and precision on dev set: {}, {}'.format(
                            devRecall, devPrecision))
                        accuracy = float(total_dev_correct) / len(y_dev_list)
                        logging.info('Accuracy on dev set: {}'.format(accuracy))
                        lossItem = loss
                        accuracyItem = accuracy
                        with open("devCsv.csv", "a+", encoding="utf8") as csvFile:
                            myWriter = csv.writer(csvFile)
                            myWriter.writerow([lossItem, accuracyItem, devRecall, devPrecision])
                        if accuracy >= best_accuracy:
                            best_accuracy, best_at_step = accuracy, current_step
                            path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                            logging.critical('Saved model {} at step {}'.format(path, best_at_step))
                            logging.critical('Best accuracy {} at step {}'.format(
                                best_accuracy, best_at_step))

            logging.critical('Training is complete, testing the best model on x_test and y_test')

            # Save the model files to trained_dir. predict.py needs trained model files.
            saver.save(sess, trained_dir + "best_model.ckpt")

            # Evaluate x_test and y_test
            saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)),
                                                  params['batch_size'], 1, shuffle=False)
            total_test_correct = 0
            for test_batch in test_batches:
                if len(test_batch) > 0:
                    x_test_batch, y_test_batch = zip(*test_batch)
                    acc, loss, num_test_correct, predictions = dev_step(x_test_batch, y_test_batch)
                    total_test_correct += int(num_test_correct)
            logging.critical('Accuracy on test set: {}'.format(
                float(total_test_correct) / len(y_test)))

    # Save trained parameters and files since predict.py needs them
    with open(trained_dir + 'words_index.json', 'w') as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
    with open(trained_dir + 'embeddings.pickle', 'wb') as outfile:
        pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
    with open(trained_dir + 'labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4, ensure_ascii=False)
    params['sequence_length'] = x_train.shape[1]
    with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
        json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)
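# getRP(y_pre, y_devs) above is not defined in this snippet. A hedged sketch,
# assuming it returns (recall, precision) for a positive class given two
# lists of integer class labels; the signature and the positive-class
# convention are assumptions:
def getRP(y_pred, y_true, positive=1):
    tp = sum(1 for p, t in zip(y_pred, y_true) if p == positive and t == positive)
    fn = sum(1 for p, t in zip(y_pred, y_true) if p != positive and t == positive)
    fp = sum(1 for p, t in zip(y_pred, y_true) if p == positive and t != positive)
    recall = tp / (tp + fn) if tp + fn else 0.0
    precision = tp / (tp + fp) if tp + fp else 0.0
    return recall, precision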
def train_cnn_rnn():
    # input_file = sys.argv[1]
    input_file = './data/simple3.csv'
    x_, y_, vocabulary, vocabulary_inv, df, labels = data_helper.load_data(input_file)
    # print(x_.shape)  # (27404, 489)
    # print(y_.shape)  # (27404, 10)

    # training_config = sys.argv[2]
    training_config = './training_config.json'
    params = json.loads(open(training_config).read())
    # print(params)
    """
    {'num_epochs': 1, 'num_filters': 32, 'max_pool_size': 4, 'l2_reg_lambda': 0.0,
     'filter_sizes': '3,4,5', 'dropout_keep_prob': 0.5, 'non_static': False,
     'evaluate_every': 200, 'hidden_unit': 300, 'batch_size': 128, 'embedding_dim': 300}
    """

    word_embeddings = data_helper.load_embeddings(vocabulary)
    embedding_mat = [word_embeddings[word] for index, word in enumerate(vocabulary_inv)]
    embedding_mat = np.array(embedding_mat, dtype=np.float32)

    # Split the original dataset into train set and test set
    x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1)
    # Split the train set into train set and dev set
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1)
    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(len(y_train), len(y_dev), len(y_test)))

    # Create a directory to store the parameters, the vocabulary and other
    # intermediate artifacts (translated comment).
    timestamp = str(int(time.time()))
    trained_dir = './trained_results_' + 'test' + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 sequence_length=x_train.shape[1],
                                 num_classes=y_train.shape[1],
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=list(map(int, params['filter_sizes'].split(","))),
                                 num_filters=params['num_filters'],
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            # Set up the optimizer op and the training op
            global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # Save the model during training (checkpointing disabled in this variant)
            # checkpoint_dir = 'checkpoints_' + timestamp + '/'
            # if os.path.exists(checkpoint_dir):
            #     shutil.rmtree(checkpoint_dir)
            # os.makedirs(checkpoint_dir)
            # checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
                # batches: padded word-index sequences (index 0 = padding)
                return [np.ceil(np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                        for batch in batches]

            # Training step
            def train_step(x_batch, y_batch):
                # print(x_batch[1])
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: params['dropout_keep_prob'],
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                # print("real_len:", len(real_len(x_batch)))
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy], feed_dict)

            # Evaluation step
            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                step, loss, accuracy, num_correct, predictions = sess.run(
                    [global_step, cnn_rnn.loss, cnn_rnn.accuracy,
                     cnn_rnn.num_correct, cnn_rnn.predictions], feed_dict)
                return accuracy, loss, num_correct, predictions

            # saver = tf.train.Saver()
            sess.run(tf.global_variables_initializer())

            # Prepare training: build batches of batch_size elements
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                                   params['batch_size'], params['num_epochs'])
            best_accuracy, best_at_step = 0, 0

            # Train the model with x_train and y_train
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                # print("y_train_batch:", y_train_batch[0])
                train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                # Evaluate the model with x_dev and y_dev
                if current_step % params['evaluate_every'] == 0:
                    dev_batches = data_helper.batch_iter(list(zip(x_dev, y_dev)),
                                                         params['batch_size'], 1)
                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        acc, loss, num_dev_correct, predictions = dev_step(x_dev_batch, y_dev_batch)
                        total_dev_correct += num_dev_correct
                    accuracy = float(total_dev_correct) / len(y_dev)
                    logging.info('Accuracy on dev set: {}'.format(accuracy))
                    if accuracy >= best_accuracy:
                        best_accuracy, best_at_step = accuracy, current_step
                        # path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        # logging.critical('Saved model {} at step {}'.format(path, best_at_step))
                        # logging.critical('Best accuracy {} at step {}'.format(best_accuracy, best_at_step))

            logging.critical('Training is complete, testing the best model on x_test and y_test')

            # Save the model files to trained_dir. predict.py needs trained model files.
            # saver.save(sess, trained_dir + "best_model.ckpt")

            # Evaluate x_test and y_test
            # saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)),
                                                  params['batch_size'], 1, shuffle=False)
            total_test_correct = 0
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                acc, loss, num_test_correct, predictions = dev_step(x_test_batch, y_test_batch)
                total_test_correct += int(num_test_correct)
            logging.critical('Accuracy on test set: {}'.format(
                float(total_test_correct) / len(y_test)))
            print('Accuracy on test set: {}'.format(
                float(total_test_correct) / len(y_test)))

    # Save trained parameters and files since predict.py needs them
    with open(trained_dir + 'words_index.json', 'w') as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
    with open(trained_dir + 'embeddings.pickle', 'wb') as outfile:
        pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
    with open(trained_dir + 'labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4, ensure_ascii=False)
    params['sequence_length'] = x_train.shape[1]
    with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
        json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)
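# For reference, a training_config.json matching the parameter dump quoted in
# the docstring above could be produced like this (values copied verbatim
# from that dump; only the file-writing wrapper is new):
import json

params = {
    "num_epochs": 1, "num_filters": 32, "max_pool_size": 4,
    "l2_reg_lambda": 0.0, "filter_sizes": "3,4,5",
    "dropout_keep_prob": 0.5, "non_static": False,
    "evaluate_every": 200, "hidden_unit": 300,
    "batch_size": 128, "embedding_dim": 300,
}
with open("training_config.json", "w") as f:
    json.dump(params, f, indent=4, sort_keys=True)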
def train_rnn_cnn():
    training_config = "./training_config.json"
    params = json.loads(open(training_config).read())

    config = "./data"
    trainfile = os.path.join(config, "travelogueData")
    stopwordfile = os.path.join(config, "stopwords.txt")
    w2vmodel_file = os.path.join(config, "googleVectorMR.bin")
    x_train, y_train, x_dev, y_dev, embedding_mat = data_help2.load_data2(
        w2vmodel_file, params['dev_size'])
    logging.info('x_train: {}, x_dev: {}'.format(len(x_train), len(x_dev)))
    logging.info('y_train: {}, y_dev: {}'.format(len(y_train), len(y_dev)))

    # Create a directory; everything related to the training will be saved in it
    timestamp = str(int(time.time()))
    trained_dir = './runs/trained_results_' + timestamp + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 sequence_length=x_train.shape[1],
                                 num_classes=y_train.shape[1],
                                 non_static=params['non_static'],
                                 num_layers=params['num_layers'],
                                 max_pool_size=params['max_pool_size'],
                                 hidden_unit=params['hidden_unit'],
                                 filter_sizes=list(map(int, params['filter_sizes'].split(","))),
                                 num_filters=params['num_filters'],
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # Checkpoint files will be saved in this directory during training
            checkpoint_dir = './runs/checkpoints_' + timestamp + '/'
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
                return [np.ceil(np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                        for batch in batches]

            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: params['dropout_keep_prob'],
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy], feed_dict)

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                step, loss, accuracy, num_correct, predictions = sess.run(
                    [global_step, cnn_rnn.loss, cnn_rnn.accuracy,
                     cnn_rnn.num_correct, cnn_rnn.predictions], feed_dict)
                return accuracy, loss, num_correct, predictions

            saver = tf.train.Saver(tf.all_variables())
            sess.run(tf.initialize_all_variables())

            # Training starts here
            train_batches = data_help2.batch_iter(list(zip(x_train, y_train)),
                                                  params['batch_size'], params['num_epochs'])
            best_accuracy, best_at_step = 0, 0  # fixed typo: was best_at_stp

            # Train the model with x_train and y_train
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                # Evaluate the model with x_dev and y_dev
                if current_step % params['evaluate_every'] == 0:
                    dev_batches = data_help2.batch_iter(list(zip(x_dev, y_dev)),
                                                        params['batch_size'], 1)
                    total_dev_correct = 0
                    predictList = list()
                    real_label = list()
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        acc, loss, num_dev_correct, predictions = dev_step(x_dev_batch, y_dev_batch)
                        total_dev_correct += num_dev_correct
                        predictList.extend(list(predictions))
                        real_label.extend(list(y_dev_batch))
                    # TP = len([i for i in range(len(predictList)) if predictList[i] == 1 and real_label[i][1] == 1])
                    # TN = len([i for i in range(len(predictList)) if predictList[i] == 0 and real_label[i][1] == 1])
                    # FP = len([i for i in range(len(predictList)) if predictList[i] == 1 and real_label[i][1] == 0])
                    # right = float(TP) / (TP + FP)
                    # recall = float(TP) / (TP + TN)
                    # F_value = 2 * right * recall / (right + recall)
                    accuracy = float(total_dev_correct) / len(y_dev)
                    logging.info('Accuracy on dev set: {}'.format(accuracy))
                    if accuracy >= best_accuracy:
                        best_accuracy, best_at_step = accuracy, current_step
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        logging.critical('Saved model {} at step {}'.format(path, best_at_step))
                        logging.critical('Best accuracy {} at step {}'.format(best_accuracy, best_at_step))

            logging.critical('Training is complete, testing the best model on x_test and y_test')
def predict_unseen_data():
    trained_dir = sys.argv[1]
    if not trained_dir.endswith('/'):
        trained_dir += '/'
    filepath = './data/predict'
    params, words_index, labels, embedding_mat = load_trained_params(
        trained_dir)
    x_, filename_ = load_test_data(filepath)
    x_ = parser.pad_sentences(
        x_, forced_sequence_length=params['sequence_length'])
    x_ = map_word_to_index(x_, words_index)  # one index array per line
    x_test, filename_test = np.asarray(x_), np.asarray(filename_)
    # x_test, y_test = np.asarray(x_), None
    # if y_ is not None:
    #     y_test = np.asarray(y_)

    timestamp = trained_dir.split('/')[-2].split('_')[-1]
    predicted_dir = './predicted_results_' + timestamp + '/'
    if os.path.exists(predicted_dir):
        shutil.rmtree(predicted_dir)
    os.makedirs(predicted_dir)

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 sequence_length=len(x_test[0]),
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=map(
                                     int, params['filter_sizes'].split(",")),
                                 num_filters=params['num_filters'],
                                 num_classes=len(labels),
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            def real_len(batches):
                return [
                    np.ceil(np.argmin(batch + [0]) * 1.0 /
                            params['max_pool_size']) for batch in batches
                ]

            def predict_step(x_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1,
                                           params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                score, predictions = sess.run(
                    [cnn_rnn.scores, cnn_rnn.predictions], feed_dict)
                return score, predictions

            def proba(x):
                # numerically stable softmax over the raw scores
                e_x = np.exp(x - np.max(x))
                return e_x / e_x.sum()

            # The Saver that wrote best_model.ckpt also wrote
            # best_model.ckpt.meta, so the meta graph is imported from there.
            checkpoint_file = trained_dir + 'best_model.ckpt'
            saver = tf.train.import_meta_graph(
                "{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)
            logging.critical('{} has been loaded'.format(checkpoint_file))

            predict_labels_index, predict_labels = [], []
            predict_filename, probabs = [], []
            # version that handles batches of arbitrary size
            batches = parser.batch_iter(list(x_test), params['batch_size'],
                                        1, shuffle=False)
            for x_batch in batches:
                scores, predictions = predict_step(x_batch)
                for index_of_label in predictions:
                    predict_labels_index.append(index_of_label)
                    predict_labels.append(labels[index_of_label])
                for score in scores:
                    probabs.append(proba(score).max())
            batches2 = parser.batch_iter(list(filename_test),
                                         params['batch_size'], 1,
                                         shuffle=False)
            for tmp in batches2:
                for filename in tmp:
                    predict_filename.append(filename)
            # simplified version when everything fits in a single batch:
            # scores, predictions = predict_step(x_test)
            # for index_of_label in predictions:
            #     predict_labels_index.append(index_of_label)
            #     predict_labels.append(labels[index_of_label])
            # for score in scores:
            #     probabs.append(proba(score).max())
            # for filename in filename_test:
            #     predict_filename.append(filename)

            infoList = []
            for i in range(len(predict_labels_index)):
                info = {}
                info["prob"] = float(probabs[i])
                info["sampleId"] = predict_filename[i]
                info["label"] = predict_labels[i]
                infoList.append(info)
            allJson = {}
            allJson["type"] = "Finance Product Classification"
            allJson["result"] = infoList
            with codecs.open(predicted_dir + 'predictions_all.json', 'w',
                             encoding="utf-8") as outfile:
                json.dump(allJson, outfile, indent=4, ensure_ascii=False)
            # df['PREDICTED'] = predict_labels
            # df.to_json(path_or_buf=predicted_dir + 'predictions_all.json',
            #            orient='records', lines=True)
            # if y_test is not None:
            #     y_test = np.array(np.argmax(y_test, axis=1))
            #     accuracy = sum(np.array(predictions) == y_test) / float(len(y_test))
            #     logging.critical('The prediction accuracy is: {}'.format(accuracy))
    logging.critical(
        'Prediction is complete, all files have been saved: {}'.format(
            predicted_dir))
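
# --- Illustrative sketch (not part of the original sources) ----------------
# The proba() helper in predict_unseen_data above is a numerically
# stabilised softmax: subtracting the max logit before exponentiating
# avoids overflow without changing the resulting distribution.
import numpy as np

def proba_sketch(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

scores = np.array([2.0, 1.0, 0.1])
print(proba_sketch(scores))        # ~[0.659, 0.242, 0.099]
print(proba_sketch(scores).max())  # the per-sample confidence stored as "prob"
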
def train_cnn_rnn(input_file, training_config):
    # input_file = sys.argv[1]
    x_, y_, vocabulary, vocabulary_inv, df, labels = data_helper.load_data(
        input_file)
    # training_config = sys.argv[2]
    params = json.loads(open(training_config).read())

    # Assign a 300 dimension vector to each word
    word_embeddings = data_helper.load_embeddings(vocabulary)
    embedding_mat = [word_embeddings[word]
                     for index, word in enumerate(vocabulary_inv)]
    embedding_mat = np.array(embedding_mat, dtype=np.float32)

    # Split the original dataset into train set and test set
    x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1,
                                            random_state=16)
    # Split the train set into train set and dev set
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1,
                                                      random_state=16)
    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(
        len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(
        len(y_train), len(y_dev), len(y_test)))

    # Create a directory; everything related to this training run is saved there
    timestamp = str(int(time.time()))
    trained_dir = './trained_results_' + timestamp + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(
                embedding_mat=embedding_mat,
                sequence_length=x_train.shape[1],
                num_classes=y_train.shape[1],
                non_static=params['non_static'],
                hidden_unit=params['hidden_unit'],
                max_pool_size=params['max_pool_size'],
                filter_sizes=map(int, params['filter_sizes'].split(",")),
                num_filters=params['num_filters'],
                embedding_size=params['embedding_dim'],
                l2_reg_lambda=params['l2_reg_lambda'])
            global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Checkpoint files will be saved in this directory during training
            checkpoint_dir = './checkpoints_' + timestamp + '/'
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
                return [np.ceil(np.argmin(batch + [0]) * 1.0 /
                                params['max_pool_size']) for batch in batches]

            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: params['dropout_keep_prob'],
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1,
                                           params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy],
                    feed_dict)

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1,
                                           params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                step, loss, accuracy, num_correct, predictions = sess.run(
                    [global_step, cnn_rnn.loss, cnn_rnn.accuracy,
                     cnn_rnn.num_correct, cnn_rnn.predictions], feed_dict)
                return accuracy, loss, num_correct, predictions

            saver = tf.train.Saver(tf.global_variables())
            sess.run(tf.global_variables_initializer())

            # Training starts here. batch_iter already cycles through
            # params['num_epochs'] epochs, so no outer epoch loop is needed
            # (the generator would be exhausted after the first pass anyway).
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                                   params['batch_size'],
                                                   params['num_epochs'])
            best_accuracy, best_at_step = 0, 0

            # Train the model with x_train and y_train
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                # Evaluate the model with x_dev and y_dev
                if current_step % params['evaluate_every'] == 0:
                    dev_batches = data_helper.batch_iter(
                        list(zip(x_dev, y_dev)), params['batch_size'], 1)
                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        acc, loss, num_dev_correct, predictions = dev_step(
                            x_dev_batch, y_dev_batch)
                        total_dev_correct += num_dev_correct
                    accuracy = float(total_dev_correct) / len(y_dev)
                    logging.info('Accuracy on dev set: {}'.format(accuracy))
                    if accuracy >= best_accuracy:
                        best_accuracy, best_at_step = accuracy, current_step
                        path = saver.save(sess, checkpoint_prefix,
                                          global_step=current_step)
                        logging.critical('Saved model {} at step {}'.format(
                            path, best_at_step))
                        logging.critical('Best accuracy {} at step {}'.format(
                            best_accuracy, best_at_step))
            logging.critical('Training is complete, testing the best model '
                             'on x_test and y_test')

            # Save the model files to trained_dir. predict.py needs trained model files.
            saver.save(sess, trained_dir + "best_model.ckpt")

            # Evaluate x_test and y_test with the best checkpoint
            saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
            acc, loss, num_test_correct, predictions = dev_step(x_test, y_test)

            from sklearn.metrics import accuracy_score, f1_score, recall_score
            y_test = [np.argmax(y_t) for y_t in y_test]
            print(sorted(list(set(y_test))))
            # Result names are kept distinct from the imported functions so
            # the sklearn helpers are not shadowed by their own outputs.
            recall_l = recall_score(y_test, predictions, average=None)
            f1_l = f1_score(y_test, predictions, average=None)
            acc_score = accuracy_score(y_test, predictions)
            total_test_correct = int(num_test_correct)
            logging.critical('Recall on test set: ' + str(recall_l))
            logging.critical('Acc on test set: ' + str(acc_score))
            logging.critical('F1 on test set: ' + str(f1_l))
            logging.critical('Accuracy on test set: {}'.format(
                float(total_test_correct) / len(y_test)))
            print(len(labels))
            print(len(recall_l))
            print(len(f1_l))
            labels_ = [labels[n] for n in sorted(list(set(y_test)))]
            df_ = pd.DataFrame()
            df_["labels"] = labels_
            df_["recall"] = recall_l
            df_["f1"] = f1_l
            df_.to_csv("metrics.csv", index=False)

            # Save trained parameters and files since predict.py needs them
            with open(trained_dir + 'words_index.json', 'w') as outfile:
                json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
            with open(trained_dir + 'embeddings.pickle', 'wb') as outfile:
                pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
            with open(trained_dir + 'labels.json', 'w') as outfile:
                json.dump(labels, outfile, indent=4, ensure_ascii=False)
            params['sequence_length'] = x_train.shape[1]
            with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
                json.dump(params, outfile, indent=4, sort_keys=True,
                          ensure_ascii=False)
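
# --- Illustrative sketch (not part of the original sources) ----------------
# With average=None, sklearn's recall_score and f1_score return one value
# per class rather than a single aggregate, which is what the per-label
# metrics.csv export above relies on.
from sklearn.metrics import accuracy_score, f1_score, recall_score

y_true = [0, 0, 1, 1, 2]
y_pred = [0, 1, 1, 1, 2]
print(recall_score(y_true, y_pred, average=None))  # [0.5 1.  1. ]
print(f1_score(y_true, y_pred, average=None))      # ~[0.667 0.8 1.0]
print(accuracy_score(y_true, y_pred))              # 0.8
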
def predict_raw_data(argv):
    ###################################################################
    # Make sure to enter the trained_dir you want to load.            #
    #                                                                 #
    # At predicted_dir there is a predict_labels.txt file which       #
    # stores the prediction results for the test file.                #
    ###################################################################
    in_file = ''
    out_file = ''
    try:
        # -h is a bare flag, so it takes no argument in the optstring
        opts, args = getopt.getopt(
            argv, "ht:i:o:",
            ["trained_dir=", "in_filepath=", "out_filepath="])
    except getopt.GetoptError:
        print("python main.py -i <in_filepath> -o <out_filepath> "
              "-t <trained_dir>")
        sys.exit(2)
    trained_dir = './trained_results/'
    for opt, arg in opts:
        if opt == '-h':
            print("python main.py -i <in_filepath> -o <out_filepath>")
            sys.exit()
        elif opt in ("-i", "--in_filepath"):
            in_file = arg
        elif opt in ("-o", "--out_filepath"):
            out_file = arg
        elif opt in ("-t", "--trained_dir"):
            trained_dir = arg

    params, words_index, labels, embedding_mat = load_trained_params(
        trained_dir)
    original_x, x_, y_ = load_test_data(in_file)
    x_ = data_helper.pad_sentences(
        x_, forced_sequence_length=params['sequence_length'])
    x_ = map_word_to_index(x_, words_index)
    x_test, y_test = np.asarray(x_), None
    if y_ is not None:
        y_test = np.asarray(y_)

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 sequence_length=len(x_test[0]),
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=map(
                                     int, params['filter_sizes'].split(",")),
                                 num_filters=params['num_filters'],
                                 num_classes=len(labels),
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            def real_len(batches):
                return [
                    np.ceil(np.argmin(batch + [0]) * 1.0 /
                            params['max_pool_size']) for batch in batches
                ]

            def predict_step(x_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1,
                                           params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                predictions, scores = sess.run(
                    [cnn_rnn.predictions, cnn_rnn.scores], feed_dict)
                return predictions, scores

            checkpoint_file = trained_dir + 'best_model.ckpt'
            saver = tf.train.import_meta_graph(
                "{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)
            logging.critical('{} has been loaded'.format(checkpoint_file))

            batches = data_helper.batch_iter(list(x_test),
                                             params['batch_size'], 1,
                                             shuffle=False)
            predictions, predict_labels, predict_probs = [], [], []
            for x_test_batch in batches:
                # one forward pass per batch (the original ran predict_step
                # twice per batch and threw half of the work away)
                batch_predictions, batch_scores = predict_step(x_test_batch)
                for batch_prediction, batch_score in zip(batch_predictions,
                                                         batch_scores):
                    predictions.append(batch_prediction)
                    predict_labels.append(labels[batch_prediction])
                    # note: scores are raw logits, not normalised probabilities
                    predict_probs.append(batch_score[batch_prediction])
            with open(out_file, "w", encoding='utf-8') as f:
                for original_x_, predict_label, predict_prob in zip(
                        original_x, predict_labels, predict_probs):
                    print_prob = round(predict_prob * 100, 2)
                    f.write(str(original_x_) + '\t' + str(predict_label) +
                            '\t' + str(print_prob) + '\n')
            if y_test is not None:
                y_test = np.array(np.argmax(y_test, axis=1))
                accuracy = sum(np.array(predictions) == y_test) / float(
                    len(y_test))
                logging.critical(
                    'The prediction accuracy is: {}'.format(accuracy))
            logging.critical('Prediction is complete')
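
# --- Illustrative sketch (not part of the original sources) ----------------
# Every model wrapper above feeds a real_len() value per sentence. The
# helper assumes each padded sentence is a Python list of word indices with
# 0 as the PAD id: np.argmin(batch + [0]) finds the first pad position (the
# appended 0 guarantees a minimum even for rows with no padding), and
# dividing by max_pool_size gives the number of time steps left after the
# max-pooling layer, i.e. how far the RNN should actually read.
import numpy as np

def real_len_sketch(batches, max_pool_size=4):
    return [np.ceil(np.argmin(batch + [0]) * 1.0 / max_pool_size)
            for batch in batches]

# 6 real tokens padded to length 12 -> ceil(6 / 4) = 2 pooled steps
print(real_len_sketch([[5, 8, 2, 9, 4, 7, 0, 0, 0, 0, 0, 0]]))  # [2.0]
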
def train_cnn_rnn(input_file, training_config): # read data and params x_, y_, vocabulary, vocabulary_inv, df, labels=data_helper.load_data(input_file) params=json.loads(open(training_config).read()) # create a directory, everything related to the training will be saved in this directory timestamp=str(int(time.time())) output_dir=os.path.join('data_path_save','cnn_rnn_'+timestamp) trained_dir=os.path.join(output_dir,'trained_results') if os.path.exists(trained_dir): shutil.rmtree(trained_dir) os.makedirs(trained_dir) # assign a 300 dimension vector to each word word_embeddings=data_helper.load_embeddings(vocabulary) embedding_mat=[word_embeddings[word] for index,word in enumerate(vocabulary_inv)] embedding_mat=np.array(embedding_mat, dtype=np.float32) # split the original dataset into trainset and devset x_train, x_dev, y_train, y_dev=train_test_split(x_, y_, test_size=0.1) # split the trainset into trainset and devset logging.info('x_train: {}, x_dev: {}'.format(len(x_train), len(x_dev))) graph=tf.Graph() with graph.as_default(): session_conf=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess=tf.Session(config=session_conf) with sess.as_default(): cnn_rnn=TextCNNRNN(embedding_mat=embedding_mat, sequence_length=x_train.shape[1], num_classes=y_train.shape[1], non_static=params['non_static'], hidden_unit=params['hidden_unit'], max_pool_size=params['max_pool_size'], filter_sizes=map(int, params['filter_sizes'].split(",")), num_filters=params['num_filters'], embedding_size=params['embedding_dim'], l2_reg_lambda=params['l2_reg_lambda']) global_step=tf.Variable(0, name='global_step', trainable=False) optimizer=tf.train.RMSPropOptimizer(1e-3, decay=0.9) grads_and_vars=optimizer.compute_gradients(cnn_rnn.loss) train_op=optimizer.apply_gradients(grads_and_vars, global_step=global_step) checkpoint_dir=os.path.join(output_dir,'checkpoints') if os.path.exists(checkpoint_dir): shutil.rmtree(checkpoint_dir) os.makedirs(checkpoint_dir) checkpoint_prefix=os.path.join(checkpoint_dir, 'model') def real_len(batches): return [np.ceil(np.argmin(batch+[0])*1.0/params['max_pool_size']) for batch in batches] def train_step(x_batch, y_batch): feed_dict={ cnn_rnn.input_x: x_batch, cnn_rnn.input_y: y_batch, cnn_rnn.dropout_keep_prob: params['dropout_keep_prob'], cnn_rnn.batch_size: len(x_batch), cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]), cnn_rnn.real_len: real_len(x_batch) } _, step, loss, accuracy=sess.run([train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy], feed_dict=feed_dict) def dev_step(x_batch, y_batch): feed_dict={ cnn_rnn.input_x: x_batch, cnn_rnn.input_y: y_batch, cnn_rnn.dropout_keep_prob: 1.0, cnn_rnn.batch_size: len(x_batch), cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]), cnn_rnn.real_len: real_len(x_batch) } step, loss, accuracy, num_correct, predictions=sess.run([global_step, cnn_rnn.loss, cnn_rnn.accuracy, cnn_rnn.num_correct, cnn_rnn.predictions], feed_dict=feed_dict) return accuracy, loss, num_correct, predictions saver=tf.train.Saver() sess.run(tf.global_variables_initializer()) # training starts here train_batches=data_helper.batch_iter(list(zip(x_train, y_train)), params['batch_size'], params['num_epochs']) best_accuracy, best_at_step=0, 0 for train_batch in train_batches: x_train_batch, y_train_batch=zip(*train_batch) train_step(x_train_batch, y_train_batch) current_step=tf.train.global_step(sess, global_step) if current_step%params['evaluate_every']==0: dev_batches=data_helper.batch_iter(list(zip(x_dev, y_dev)), 
params['batch_size'], 1) total_dev_correct=0 for dev_batch in dev_batches: x_dev_batch, y_dev_batch=zip(*dev_batch) acc, loss, num_dev_correct, predictions=dev_step(x_dev_batch, y_dev_batch) total_dev_correct+=num_dev_correct accuracy=float(total_dev_correct)/len(y_dev) logging.info('Accuracy on dev set: {}'.format(accuracy)) if accuracy>=best_accuracy: best_accuracy, best_at_step=accuracy, current_step path=saver.save(sess, checkpoint_prefix, global_step=current_step) logging.critical('Saved model {} at step {}'.format(path, best_at_step)) logging.critical('Best accuracy {} at step {}'.format(best_accuracy, best_at_step)) logging.critical('Training is complete, testing the best model on x_test and y_test') # save trained params and files with open(trained_dir+'/words_index.json', 'w') as outfile: json.dump(vocabulary, outfile, indent=4, ensure_ascii=False) with open(trained_dir+'/embeddings.pickle', 'wb') as outfile: pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL) with open(trained_dir+'/labels.json', 'w') as outfile: json.dump(labels, outfile, indent=4, ensure_ascii=False) params['sequence_length']=x_train.shape[1] with open(trained_dir+'/trained_parameters.json', 'w') as outfile: json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)
def train_cnn_rnn(): input_file = sys.argv[1] x_, y_, vocabulary, vocabulary_inv, df, labels = data_helper.load_data( input_file) training_config = sys.argv[2] params = json.loads(open(training_config).read()) # Assign a 300 dimension vector to each word word_embeddings = data_helper.load_embeddings(vocabulary) embedding_mat = [ word_embeddings[word] for index, word in enumerate(vocabulary_inv) ] embedding_mat = np.array(embedding_mat, dtype=np.float32) # Split the original dataset into train set and test set x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1) # Split the train set into train set and dev set x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1) logging.info("x_train: {}, x_dev: {}, x_test: {}".format( len(x_train), len(x_dev), len(x_test))) logging.info("y_train: {}, y_dev: {}, y_test: {}".format( len(y_train), len(y_dev), len(y_test))) # Create a directory, everything related to the training will be saved in this directory timestamp = str(int(time.time())) trained_dir = "./trained_results_" + timestamp + "/" if os.path.exists(trained_dir): shutil.rmtree(trained_dir) os.makedirs(trained_dir) graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): cnn_rnn = TextCNNRNN( embedding_mat=embedding_mat, sequence_length=x_train.shape[1], num_classes=y_train.shape[1], non_static=params["non_static"], hidden_unit=params["hidden_unit"], max_pool_size=params["max_pool_size"], filter_sizes=map(int, params["filter_sizes"].split(",")), num_filters=params["num_filters"], embedding_size=params["embedding_dim"], l2_reg_lambda=params["l2_reg_lambda"], ) global_step = tf.Variable(0, name="global_step", trainable=False) optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9) grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) # Checkpoint files will be saved in this directory during training checkpoint_dir = "./checkpoints_" + timestamp + "/" if os.path.exists(checkpoint_dir): shutil.rmtree(checkpoint_dir) os.makedirs(checkpoint_dir) checkpoint_prefix = os.path.join(checkpoint_dir, "model") def real_len(batches): return [ np.ceil( np.argmin(batch + [0]) * 1.0 / params["max_pool_size"]) for batch in batches ] def train_step(x_batch, y_batch): feed_dict = { cnn_rnn.input_x: x_batch, cnn_rnn.input_y: y_batch, cnn_rnn.dropout_keep_prob: params["dropout_keep_prob"], cnn_rnn.batch_size: len(x_batch), cnn_rnn.pad: np.zeros([len(x_batch), 1, params["embedding_dim"], 1]), cnn_rnn.real_len: real_len(x_batch), } _, step, loss, accuracy = sess.run( [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy], feed_dict) def dev_step(x_batch, y_batch): feed_dict = { cnn_rnn.input_x: x_batch, cnn_rnn.input_y: y_batch, cnn_rnn.dropout_keep_prob: 1.0, cnn_rnn.batch_size: len(x_batch), cnn_rnn.pad: np.zeros([len(x_batch), 1, params["embedding_dim"], 1]), cnn_rnn.real_len: real_len(x_batch), } step, loss, accuracy, num_correct, predictions = sess.run( [ global_step, cnn_rnn.loss, cnn_rnn.accuracy, cnn_rnn.num_correct, cnn_rnn.predictions, ], feed_dict, ) return accuracy, loss, num_correct, predictions saver = tf.train.Saver() sess.run(tf.global_variables_initializer()) # Training starts here train_batches = data_helper.batch_iter(list(zip(x_train, y_train)), params["batch_size"], params["num_epochs"]) best_accuracy, best_at_step = 0, 0 # Train the model with x_train and 
y_train for train_batch in train_batches: x_train_batch, y_train_batch = zip(*train_batch) train_step(x_train_batch, y_train_batch) current_step = tf.train.global_step(sess, global_step) # Evaluate the model with x_dev and y_dev if current_step % params["evaluate_every"] == 0: dev_batches = data_helper.batch_iter( list(zip(x_dev, y_dev)), params["batch_size"], 1) total_dev_correct = 0 for dev_batch in dev_batches: x_dev_batch, y_dev_batch = zip(*dev_batch) acc, loss, num_dev_correct, predictions = dev_step( x_dev_batch, y_dev_batch) total_dev_correct += num_dev_correct accuracy = float(total_dev_correct) / len(y_dev) logging.info("Accuracy on dev set: {}".format(accuracy)) if accuracy >= best_accuracy: best_accuracy, best_at_step = accuracy, current_step path = saver.save(sess, checkpoint_prefix, global_step=current_step) logging.critical("Saved model {} at step {}".format( path, best_at_step)) logging.critical("Best accuracy {} at step {}".format( best_accuracy, best_at_step)) logging.critical( "Training is complete, testing the best model on x_test and y_test" ) # Save the model files to trained_dir. predict.py needs trained model files. saver.save(sess, trained_dir + "best_model.ckpt") # Evaluate x_test and y_test saver.restore(sess, checkpoint_prefix + "-" + str(best_at_step)) test_batches = data_helper.batch_iter(list(zip(x_test, y_test)), params["batch_size"], 1, shuffle=False) total_test_correct = 0 for test_batch in test_batches: x_test_batch, y_test_batch = zip(*test_batch) acc, loss, num_test_correct, predictions = dev_step( x_test_batch, y_test_batch) total_test_correct += int(num_test_correct) logging.critical("Accuracy on test set: {}".format( float(total_test_correct) / len(y_test))) # Save trained parameters and files since predict.py needs them with open(trained_dir + "words_index.json", "w") as outfile: json.dump(vocabulary, outfile, indent=4, ensure_ascii=False) with open(trained_dir + "embeddings.pickle", "wb") as outfile: pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL) with open(trained_dir + "labels.json", "w") as outfile: json.dump(labels, outfile, indent=4, ensure_ascii=False) params["sequence_length"] = x_train.shape[1] with open(trained_dir + "trained_parameters.json", "w") as outfile: json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)
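
# --- Illustrative example (an assumption; the real file ships with each ----
# project and is not shown in these sources) ---------------------------------
# The train_cnn_rnn variants above read their hyperparameters with
# json.loads(open(training_config).read()). A training_config.json with
# placeholder values covering every params[...] key they look up:
#
# {
#     "batch_size": 128,
#     "dropout_keep_prob": 0.5,
#     "embedding_dim": 300,
#     "evaluate_every": 200,
#     "filter_sizes": "3,4,5",
#     "hidden_unit": 300,
#     "l2_reg_lambda": 0.0,
#     "max_pool_size": 4,
#     "non_static": false,
#     "num_epochs": 10,
#     "num_filters": 32
# }
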
def train_cnn_rnn(): # TRAIN print("Entering function train_cnn_rnn") x_, y_, vocabulary, vocabulary_inv, df, labels = load_data(TRAIN_FILE_PATH) # Assign a 300 dimension vector to each word # word_embeddings = load_embeddings(vocabulary) # embedding_mat = [word_embeddings[word] for index, word in enumerate(vocabulary_inv)] # embedding_mat = np.array(embedding_mat, dtype=np.float32) # print(len(embedding_mat)) gl_word_to_emb_mat_ind, emb_mat = load_emb(EMB_FILE_PATH) embedding_mat = emb_mat # Split the original dataset into train set and test set x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1) # Split the train set into train set and dev set x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1) print('x_train: {}, x_dev: {}, x_test: {}'.format(len(x_train), len(x_dev), len(x_test))) print('y_train: {}, y_dev: {}, y_test: {}'.format(len(y_train), len(y_dev), len(y_test))) # Create a directory, everything related to the training will be saved in this directory timestamp = str(int(time.time())) if USE_TMP_FOLDER: timestamp = "temp" trained_dir = PRO_FLD + 'trained_results_' + timestamp + '/' if os.path.exists(trained_dir): shutil.rmtree(trained_dir) os.makedirs(trained_dir) graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat, sequence_length=x_train.shape[1], num_classes=y_train.shape[1], non_static=params['non_static'], hidden_unit=params['hidden_unit'], max_pool_size=params['max_pool_size'], filter_sizes=map( int, params['filter_sizes'].split(",")), num_filters=params['num_filters'], embedding_size=params['embedding_dim'], l2_reg_lambda=params['l2_reg_lambda']) global_step = tf.Variable(0, name='global_step', trainable=False) # optimizer = tf.train.RMSPropOptimizer(0.001, decay=0.9) optimizer = tf.train.AdamOptimizer(0.001, beta1=0.9, beta2=0.999, epsilon=1e-08) grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) # Checkpoint files will be saved in this directory during training checkpoint_dir = PRO_FLD + 'checkpoints_' + timestamp + '/' if os.path.exists(checkpoint_dir): shutil.rmtree(checkpoint_dir) os.makedirs(checkpoint_dir) checkpoint_prefix = os.path.join(checkpoint_dir, 'model') def real_len(batches): return [ np.ceil( np.argmin(batch + [0]) * 1.0 / params['max_pool_size']) for batch in batches ] def train_step(x_batch, y_batch): feed_dict = { cnn_rnn.input_x: x_batch, cnn_rnn.input_y: y_batch, cnn_rnn.dropout_keep_prob: params['dropout_keep_prob'], cnn_rnn.batch_size: len(x_batch), cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]), cnn_rnn.real_len: real_len(x_batch), } _, _, l_loss, l_acc = sess.run( [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy], feed_dict) return l_loss, l_acc def dev_step(x_batch, y_batch): feed_dict = { cnn_rnn.input_x: x_batch, cnn_rnn.input_y: y_batch, cnn_rnn.dropout_keep_prob: 1.0, cnn_rnn.batch_size: len(x_batch), cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]), cnn_rnn.real_len: real_len(x_batch), } loss_l, accuracy_l, num_correct, predictions_l = sess.run([ cnn_rnn.loss, cnn_rnn.accuracy, cnn_rnn.num_correct, cnn_rnn.predictions ], feed_dict) return accuracy_l, loss_l, num_correct, predictions_l def print_stats(stat_dict_total, stat_dict_correct): longest_key = 0 for key in stat_dict_total: if len(key) > 
longest_key: longest_key = len(key) for key in stat_dict_total: my_msg = " Class {:{}s}: ({}/{}) -> accuracy: {:.4f}%" temp = 0 if key in stat_dict_correct: temp = stat_dict_correct[key] my_acc_l = (float(temp) / float(stat_dict_total[key])) * 100 print( my_msg.format(key, longest_key, temp, stat_dict_total[key], my_acc_l)) return saver = tf.train.Saver() sess.run(tf.global_variables_initializer()) # Training starts here train_batches = batch_iter(list(zip(x_train, y_train)), params['batch_size'], params['num_epochs']) best_accuracy, best_at_step, current_step = 0, 0, 0 trn_loss_over_steps, trn_acc_over_steps, dev_loss_over_steps, dev_acc_over_steps = [], [], [], [] trn_loss_tmp, trn_acc_tmp, dev_loss_tmp, dev_acc_tmp, trn_iters, dev_iters = 0, 0, 0, 0, 0, 0 number_of_steps_in_total = int( (len(x_train) / params['batch_size'] + 1) * params['num_epochs']) # steps print("***There will be {} steps total".format( number_of_steps_in_total)) stat_dict_all_total, stat_dict_all_correct = defaultdict( int), defaultdict(int) # Train the model with x_train and y_train temp_start_time = time.time() # measure epoch time for train_batch in train_batches: stat_dict_step_total, stat_dict_step_correct = defaultdict( int), defaultdict(int) x_train_batch, y_train_batch = zip(*train_batch) step_loss, step_acc = train_step(x_train_batch, y_train_batch) trn_loss_tmp += step_loss trn_acc_tmp += step_acc trn_iters += 1 current_step = tf.train.global_step(sess, global_step) # Evaluate the model with x_dev and y_dev if current_step % params['evaluate_every'] == 0: dev_batches = batch_iter(list(zip(x_dev, y_dev)), params['batch_size'], 1) total_dev_correct = 0 for dev_batch in dev_batches: x_dev_batch, y_dev_batch = zip(*dev_batch) acc, loss, num_dev_correct, predictions = dev_step( x_dev_batch, y_dev_batch) dev_loss_tmp += loss dev_acc_tmp += acc dev_iters += 1 ind = 0 for p in predictions: real_class_value = int(np.argmax(y_dev_batch[ind])) real_class_label = labels[real_class_value] stat_dict_step_total[real_class_label] += 1 if p == real_class_value: stat_dict_step_correct[real_class_label] += 1 ind += 1 total_dev_correct += num_dev_correct accuracy = float(total_dev_correct) / len(y_dev) trn_1_eval_loss = float(trn_loss_tmp) / float(trn_iters) trn_1_eval_acc = float(trn_acc_tmp) / float(trn_iters) trn_loss_over_steps.append(trn_1_eval_loss) trn_acc_over_steps.append(trn_1_eval_acc) trn_loss_tmp, trn_acc_tmp, trn_iters = 0, 0, 0 dev_1_eval_loss = float(dev_loss_tmp) / float(dev_iters) dev_1_eval_acc = float(dev_acc_tmp) / float(dev_iters) dev_loss_over_steps.append(dev_1_eval_loss) dev_acc_over_steps.append(dev_1_eval_acc) dev_loss_tmp, dev_acc_tmp, dev_iters = 0, 0, 0 # Stats prints mes = "STEP {} - ({}/{}) -> accuracy: {:.4f}%" print( mes.format(current_step, int(total_dev_correct), len(y_dev), accuracy * 100)) if current_step % PRINT_CLASSES_STATS_EACH_X_STEPS == 0: print_stats(stat_dict_step_total, stat_dict_step_correct) temp_end_time = time.time() - temp_start_time temp_start_time = time.time() # measure epoch time hours, rem = divmod(temp_end_time, 3600) minutes, seconds = divmod(rem, 60) print( "temp run time(from last eval): {:0>2}:{:0>2}:{:0>2}" .format(int(hours), int(minutes), int(seconds))) if accuracy > best_accuracy: best_accuracy, best_at_step = accuracy, current_step if SHOULD_SAVE: path = saver.save(sess, checkpoint_prefix, global_step=current_step) logging.info( ' Saved model {} at step {}'.format( path, best_at_step)) msg = ' Best accuracy {:.4f}% at step {}/{} ({}/{})' logging.info( 
msg.format(best_accuracy * 100, best_at_step, number_of_steps_in_total, int(total_dev_correct), len(y_dev))) stat_dict_all_total = dict( Counter(stat_dict_all_total) + Counter(stat_dict_step_total)) stat_dict_all_correct = dict( Counter(stat_dict_all_correct) + Counter(stat_dict_step_correct)) train_msg = '***Training is complete. Best accuracy {:.4f}% at step {}/{}' print( train_msg.format(best_accuracy * 100, best_at_step, current_step)) # Stats prints print_stats(stat_dict_all_total, stat_dict_all_correct) # Save the model files to trained_dir. predict.py needs trained model files. if SHOULD_SAVE: saver.save(sess, trained_dir + "best_model.ckpt") # Evaluate x_test and y_test if RUN_TEST_AFTER_TRAIN: print('***Testing...') saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step)) test_batches = batch_iter(list(zip(x_test, y_test)), params['batch_size'], 1, shuffle=False) total_test_correct = 0 test_stat_dict_total, test_dict_correct = defaultdict( int), defaultdict(int) tst_loss_total, tst_iters = 0, 0 for test_batch in test_batches: x_test_batch, y_test_batch = zip(*test_batch) acc, loss, num_test_correct, predictions = dev_step( x_test_batch, y_test_batch) tst_loss_total += loss tst_iters += 1 ind = 0 for p in predictions: real_class_value = int(np.argmax(y_test_batch[ind])) real_class_label = labels[real_class_value] test_stat_dict_total[real_class_label] += 1 if p == real_class_value: test_dict_correct[real_class_label] += 1 ind += 1 total_test_correct += int(num_test_correct) my_acc = (float(total_test_correct) / float(len(y_test))) * 100 acc_msg = 'Accuracy on test set - ({}/{}) -> accuracy: {:.4f}%' tst_loss_total = float(tst_loss_total) / float(tst_iters) tst_loss_total, tst_acc_total = [ tst_loss_total ] * len(dev_acc_over_steps), [ (float(total_test_correct) / float(len(y_test))) ] * len(dev_acc_over_steps) print(acc_msg.format(total_test_correct, len(y_test), my_acc)) print_graph( 'loss/epochs(train in red, validation in green, test(constant) in blue)', 'epochs', 'loss', trn_loss_over_steps, dev_loss_over_steps, tst_loss_total) print_graph( 'acc/epochs(train in red, validation in green, test(constant) in blue)', 'epochs', 'acc', trn_acc_over_steps, dev_acc_over_steps, tst_acc_total) # Stats prints print_stats(test_stat_dict_total, test_dict_correct) if PRINT_WORD_PARAGRAPH: mdiff = 'data file={}. us and spain 45-150 tokens. BasicLSTMCell'.format( CSV_FULL_PATH) last_out = 7 print('Difference from out{}: {}'.format(last_out, mdiff)) m1 = 'Training best acc {:.4f}% at step {}/{}' print( m1.format(best_accuracy * 100, best_at_step, current_step)) m2 = 'Test results: Accuracy on test set - ({}/{}) -> accuracy: {:.4f}%' print(m2.format(total_test_correct, len(y_test), my_acc)) print_stats(test_stat_dict_total, test_dict_correct) # # Save trained parameters and files since predict.py needs them # with open(trained_dir + 'words_index.json', 'w') as outfile: # json.dump(vocabulary, outfile, indent=4, ensure_ascii=False) # with open(trained_dir + 'embeddings.pickle', 'wb') as outfile: # pickle.dump(embedding_mat, outfile) # with open(trained_dir + 'labels.json', 'w') as outfile: # json.dump(labels, outfile, indent=4, ensure_ascii=False) # # params['sequence_length'] = x_train.shape[1] # with open(trained_dir + 'trained_parameters.json', 'w') as outfile: # json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False) print("Leaving function train_cnn_rnn") return
def predict_cnn_rnn(x_test, y_test, out_dir='trained_results'):
    ###################################################################
    # ARGS   : x_test / y_test (test data / label matrix)             #
    #                                                                 #
    # RETURN : predict_labels, accuracy                               #
    #          (predicted label vector)                               #
    #                                                                 #
    # At predicted_dir there is a predict_labels.txt file which       #
    # stores the prediction results for the test file.                #
    ###################################################################
    trained_dir = out_dir if out_dir else 'trained_results'
    # The artifacts are addressed as trained_dir + '<file>' below, so the
    # path must end with a separator.
    if not trained_dir.endswith('/'):
        trained_dir += '/'
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            params = json.loads(
                open(trained_dir + 'trained_parameters.json',
                     encoding='utf-8').read())
            words_index = json.loads(
                open(trained_dir + 'words_index.json',
                     encoding='utf-8').read())
            labels = json.loads(
                open(trained_dir + 'labels.json', encoding='utf-8').read())
            with open(trained_dir + 'embeddings.pickle', 'rb') as input_file:
                fetched_embedding = pickle.load(input_file)
            embedding_mat = np.array(fetched_embedding, dtype=np.float32)
            cnn_rnn2 = TextCNNRNN(embedding_mat=embedding_mat,
                                  non_static=params['non_static'],
                                  hidden_unit=params['hidden_unit'],
                                  sequence_length=len(x_test[0]),
                                  max_pool_size=params['max_pool_size'],
                                  filter_sizes=map(
                                      int, params['filter_sizes'].split(",")),
                                  num_filters=params['num_filters'],
                                  num_classes=len(labels),
                                  embedding_size=params['embedding_dim'],
                                  l2_reg_lambda=params['l2_reg_lambda'])

            def real_len(batches):
                return [
                    np.ceil(np.argmin(batch + [0]) * 1.0 /
                            params['max_pool_size']) for batch in batches
                ]

            def predict_step(x_batch):
                feed_dict = {
                    cnn_rnn2.input_x: x_batch,
                    cnn_rnn2.dropout_keep_prob: 1.0,
                    cnn_rnn2.batch_size: len(x_batch),
                    cnn_rnn2.pad: np.zeros([len(x_batch), 1,
                                            params['embedding_dim'], 1]),
                    cnn_rnn2.real_len: real_len(x_batch),
                }
                predictions = sess.run([cnn_rnn2.predictions], feed_dict)
                return predictions

            checkpoint_file = trained_dir + 'best_model.ckpt'
            saver = tf.train.import_meta_graph(
                "{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)
            logging.critical('{} has been loaded'.format(checkpoint_file))

            batches = data_helper.batch_iter(list(x_test),
                                             params['batch_size'], 1,
                                             shuffle=False)
            predictions, predict_labels = [], []
            for x_test_batch in batches:
                batch_predictions = predict_step(x_test_batch)[0]
                for batch_prediction in batch_predictions:
                    predictions.append(batch_prediction)
                    predict_labels.append(labels[batch_prediction])
            accuracy = None  # stays None when no ground-truth labels are given
            if y_test is not None:
                y_test = np.array(np.argmax(y_test, axis=1))
                accuracy = sum(np.array(predictions) == y_test) / float(
                    len(y_test))
                logging.critical(
                    'The prediction accuracy is: {}'.format(accuracy))
            logging.critical('Prediction is complete')
            return predict_labels, accuracy
def train(x_train, y_train, vocab_processor, x_dev, y_dev, x_real_len_train, x_real_len_dev, sorted_label): # Training # ================================================== with tf.Graph().as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) sess = tf.Session(config=session_conf) with sess.as_default(): if FLAGS.model_type == "cnnrnn": obj = TextCNNRNN(sequence_length=FLAGS.max_document_length, num_classes=y_train.shape[1], vocab_size=len(vocab_processor.vocabulary_), hidden_unit=FLAGS.hidden_unit, embedding_size=FLAGS.embedding_dim, filter_sizes=list( map(int, FLAGS.filter_sizes.split(","))), num_filters=FLAGS.num_filters, l2_reg_lambda=FLAGS.l2_reg_lambda) elif FLAGS.model_type == "rnncnn": obj = TextRNNCNN(sequence_length=FLAGS.max_document_length, num_classes=y_train.shape[1], vocab_size=len(vocab_processor.vocabulary_), hidden_unit=FLAGS.hidden_unit, embedding_size=FLAGS.embedding_dim, filter_sizes=list( map(int, FLAGS.filter_sizes.split(","))), num_filters=FLAGS.num_filters, l2_reg_lambda=FLAGS.l2_reg_lambda) elif FLAGS.model_type == "rnnandcnn": obj = TextRNNandCNN( sequence_length=FLAGS.max_document_length, num_classes=y_train.shape[1], vocab_size=len(vocab_processor.vocabulary_), hidden_unit=FLAGS.hidden_unit, embedding_size=FLAGS.embedding_dim, filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))), num_filters=FLAGS.num_filters, l2_reg_lambda=FLAGS.l2_reg_lambda) elif FLAGS.model_type == "rnn": obj = TextRNN(sequence_length=FLAGS.max_document_length, num_classes=y_train.shape[1], vocab_size=len(vocab_processor.vocabulary_), hidden_unit=FLAGS.hidden_unit, embedding_size=FLAGS.embedding_dim, l2_reg_lambda=FLAGS.l2_reg_lambda) elif FLAGS.model_type == "dan": obj = TextDAN(sequence_length=FLAGS.max_document_length, num_classes=y_train.shape[1], vocab_size=len(vocab_processor.vocabulary_), embedding_size=FLAGS.embedding_dim, filter_sizes=list( map(int, FLAGS.filter_sizes.split(","))), num_filters=FLAGS.num_filters, l2_reg_lambda=FLAGS.l2_reg_lambda) elif FLAGS.model_type == "attn_cnn": obj = TextAttnCNN(sequence_length=FLAGS.max_document_length, num_classes=y_train.shape[1], vocab_size=len(vocab_processor.vocabulary_), embedding_size=FLAGS.embedding_dim, num_heads=FLAGS.num_heads, filter_sizes=list( map(int, FLAGS.filter_sizes.split(","))), num_filters=FLAGS.num_filters, l2_reg_lambda=FLAGS.l2_reg_lambda) elif FLAGS.model_type == "dpcnn": obj = TextDPCNN(sequence_length=FLAGS.max_document_length, num_classes=y_train.shape[1], vocab_size=len(vocab_processor.vocabulary_), embedding_size=FLAGS.embedding_dim, filter_sizes=list( map(int, FLAGS.filter_sizes.split(","))), num_filters=FLAGS.num_filters, num_blocks=FLAGS.num_blocks, l2_reg_lambda=FLAGS.l2_reg_lambda) else: obj = TextCNN(sequence_length=FLAGS.max_document_length, num_classes=y_train.shape[1], vocab_size=len(vocab_processor.vocabulary_), embedding_size=FLAGS.embedding_dim, filter_sizes=list( map(int, FLAGS.filter_sizes.split(","))), num_filters=FLAGS.num_filters, l2_reg_lambda=FLAGS.l2_reg_lambda) # Define Training procedure global_step = tf.Variable(0, name="global_step", trainable=False) optimizer = tf.train.AdamOptimizer(1e-3) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): grads_and_vars = optimizer.compute_gradients(obj.loss) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) # Keep track of gradient values and sparsity (optional) grad_summaries = [] for 
g, v in grads_and_vars: if g is not None: grad_hist_summary = tf.summary.histogram( "{}/grad/hist".format(v.name), g) sparsity_summary = tf.summary.scalar( "{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) grad_summaries.append(grad_hist_summary) grad_summaries.append(sparsity_summary) grad_summaries_merged = tf.summary.merge(grad_summaries) # Output directory for models and summaries timestamp = str(int(time.time())) out_dir = os.path.abspath( os.path.join(os.path.curdir, "runs", FLAGS.model_version)) print("Writing to {}\n".format(out_dir)) # Summaries for loss and accuracy loss_summary = tf.summary.scalar("loss", obj.loss) acc_summary = tf.summary.scalar("accuracy", obj.accuracy) # Train Summaries train_summary_op = tf.summary.merge( [loss_summary, acc_summary, grad_summaries_merged]) train_summary_dir = os.path.join(out_dir, "summaries", "train") train_summary_writer = tf.summary.FileWriter( train_summary_dir, sess.graph) # Dev summaries dev_summary_op = tf.summary.merge([loss_summary, acc_summary]) dev_summary_dir = os.path.join(out_dir, "summaries", "dev") dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph) # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it checkpoint_dir = os.path.abspath( os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints) # Save train params since eval.py needs them trained_dir = os.path.abspath( os.path.join(out_dir, "trained_results")) if not os.path.exists(trained_dir): os.makedirs(trained_dir) with open(trained_dir + '/sorted_label.json', 'w') as outfile: json.dump(sorted_label, outfile, indent=4, ensure_ascii=False) with open(trained_dir + '/train_params.json', 'w') as outfile: json.dump({"max_document_length": FLAGS.max_document_length}, outfile, indent=4, ensure_ascii=False) # Write vocabulary vocab_processor.save(os.path.join(out_dir, "vocab")) # Initialize all variables sess.run(tf.global_variables_initializer()) def train_step(x_batch, y_batch, x_real_len_batch): """ A single training step """ if FLAGS.model_type == "cnn" or FLAGS.model_type == "dan" or FLAGS.model_type == "attn_cnn" or FLAGS.model_type == "dpcnn": feed_dict = { obj.input_x: x_batch, obj.input_y: y_batch, obj.dropout_keep_prob: FLAGS.dropout_keep_prob, obj.is_training: True } else: feed_dict = { obj.input_x: x_batch, obj.input_y: y_batch, obj.dropout_keep_prob: FLAGS.dropout_keep_prob, obj.real_len: x_real_len_batch } _, step, summaries, loss, accuracy = sess.run([ train_op, global_step, train_summary_op, obj.loss, obj.accuracy ], feed_dict) time_str = datetime.datetime.now().isoformat() print("{}: step {}, loss {:g}, acc {:g}".format( time_str, step, loss, accuracy)) train_summary_writer.add_summary(summaries, step) def overfit(dev_loss, eva_num=3): n = len(dev_loss) if n < eva_num: return False for i in range(n - eva_num + 1, n): if dev_loss[i] > dev_loss[i - 1]: return False return True def dev_step(x_batch, y_batch, x_real_len_batch, writer=None): """ Evaluates model on a dev set """ dev_batches = data_helpers.batch_iter(list( zip(x_batch, y_batch, x_real_len_batch)), FLAGS.batch_size, 1, shuffle=False) all_pred = [] correct_total_num = 0 for batch in dev_batches: x_dev_batch, y_dev_batch, x_real_len_dev_batch = zip( *batch) if FLAGS.model_type == "cnn" or FLAGS.model_type == "dan" or FLAGS.model_type == "attn_cnn" or 
FLAGS.model_type == "dpcnn": feed_dict = { obj.input_x: x_dev_batch, obj.input_y: y_dev_batch, obj.dropout_keep_prob: 1.0, obj.is_training: False } else: feed_dict = { obj.input_x: x_dev_batch, obj.input_y: y_dev_batch, obj.dropout_keep_prob: 1.0, obj.real_len: x_real_len_dev_batch } step, summaries, pred, correct_pred_num = sess.run([ global_step, dev_summary_op, obj.predictions, obj.correct_pred_num ], feed_dict) all_pred = np.concatenate([all_pred, pred]) correct_total_num += correct_pred_num if writer: writer.add_summary(summaries, step) dev_acc = 1.0 * correct_total_num / len(y_batch) print("right_sample {}, dev_sample {}, dev_acc {:g}".format( correct_total_num, len(y_batch), dev_acc)) return dev_acc # Generate batches batches = data_helpers.batch_iter( list(zip(x_train, y_train, x_real_len_train)), FLAGS.batch_size, FLAGS.num_epochs) # Training loop. For each batch... dev_acc = [] for batch in batches: x_batch, y_batch, x_real_len_batch = zip(*batch) train_step(x_batch, y_batch, x_real_len_batch) current_step = tf.train.global_step(sess, global_step) if current_step % FLAGS.evaluate_every == 0: print("\nEvaluation:", current_step) cur_acc = dev_step(x_dev, y_dev, x_real_len_dev, writer=dev_summary_writer) path = saver.save(sess, checkpoint_prefix, global_step=current_step) print("Saved model checkpoint to {}\n".format(path)) dev_acc.append(cur_acc) if overfit(dev_acc): print("current accuracy drop and stop train..\n") sys.exit(0) print("")
def train_cnn_rnn(embedding_mat, embedding_pre, x_train, x_dev, y_train, y_dev, pre_y_train, pre_y_dev, labels, vocabulary, out_dir='./trained_results/'): if out_dir == '': trained_dir = './trained_results/' else: trained_dir = out_dir params = { "batch_size": 128, "dropout_keep_prob": 0.5, "embedding_dim": 64, "evaluate_every": 500, "filter_sizes": "3,4,5", "hidden_unit": 64, "l2_reg_lambda": 0.0, "max_pool_size": 4, "non_static": True, "num_epochs": 100, "num_filters": 32, "attention_size": 66 } graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat, embedding_pre=embedding_pre, sequence_length=x_train.shape[1], num_classes=y_train.shape[1], non_static=params['non_static'], hidden_unit=params['hidden_unit'], max_pool_size=params['max_pool_size'], filter_sizes=map( int, params['filter_sizes'].split(",")), num_filters=params['num_filters'], embedding_size=params['embedding_dim'], l2_reg_lambda=params['l2_reg_lambda']) global_step = tf.Variable(0, name='global_step', trainable=False) optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9) grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) def real_len(batches): return [ np.ceil( np.argmin(batch + [0]) * 1.0 / params['max_pool_size']) for batch in batches ] def train_step(x_batch, y_batch, pre_y_batch): feed_dict = { cnn_rnn.input_x: x_batch, cnn_rnn.input_y: y_batch, cnn_rnn.input_pre_y: pre_y_batch, cnn_rnn.dropout_keep_prob: params['dropout_keep_prob'], cnn_rnn.batch_size: len(x_batch), cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]), cnn_rnn.real_len: real_len(x_batch), } _, step, loss, accuracy, embedding_mat = sess.run([ train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy, cnn_rnn.Word ], feed_dict) return embedding_mat def dev_step(x_batch, y_batch, pre_y_batch): feed_dict = { cnn_rnn.input_x: x_batch, cnn_rnn.input_y: y_batch, cnn_rnn.input_pre_y: pre_y_batch, cnn_rnn.dropout_keep_prob: 1.0, cnn_rnn.batch_size: len(x_batch), cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]), cnn_rnn.real_len: real_len(x_batch), } step, loss, accuracy, num_correct, predictions = sess.run([ global_step, cnn_rnn.loss, cnn_rnn.accuracy, cnn_rnn.num_correct, cnn_rnn.predictions ], feed_dict) return accuracy, loss, num_correct, predictions saver = tf.train.Saver() sess.run(tf.global_variables_initializer()) # Training starts here train_batches = data_helper.batch_iter( list(zip(x_train, y_train, pre_y_train)), params['batch_size'], params['num_epochs']) best_accuracy, best_at_step = 0, 0 # Train the model with x_train and y_train for train_batch in train_batches: x_train_batch, y_train_batch, pre_y_train_batch = zip( *train_batch) embedding_mat = train_step(x_train_batch, y_train_batch, pre_y_train_batch) current_step = tf.train.global_step(sess, global_step) # Evaluate the model with x_dev and y_dev if current_step % params['evaluate_every'] == 0: dev_batches = data_helper.batch_iter( list(zip(x_dev, y_dev, pre_y_dev)), params['batch_size'], 1) total_dev_correct = 0 for dev_batch in dev_batches: x_dev_batch, y_dev_batch, pre_y_dev_batch = zip( *dev_batch) acc, loss, num_dev_correct, predictions = dev_step( x_dev_batch, y_dev_batch, pre_y_dev_batch) total_dev_correct += num_dev_correct accuracy = float(total_dev_correct) / len(y_dev) 
logging.info('Accuracy on dev set: {}'.format(accuracy)) if accuracy >= best_accuracy: best_accuracy, best_at_step = accuracy, current_step logging.critical('Best accuracy {} at step {}'.format( best_accuracy, best_at_step)) logging.critical( 'Training is complete, testing the best model on x_test and y_test' ) # Save the model files to out_dir. predict.py needs trained model files. saver.save(sess, trained_dir + "best_model.ckpt") with open(trained_dir + 'words_index.json', 'w', encoding='utf-8') as outfile: json.dump(vocabulary, outfile, indent=4, ensure_ascii=False) with open(trained_dir + 'embedding_mat.pickle', 'wb') as outfile: pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL) with open(trained_dir + 'embedding_pre.pickle', 'wb') as outfile: pickle.dump(embedding_pre, outfile, pickle.HIGHEST_PROTOCOL) with open(trained_dir + 'labels.json', 'w', encoding='utf-8') as outfile: json.dump(labels, outfile, indent=4, ensure_ascii=False) params['sequence_length'] = x_train.shape[1] with open(trained_dir + 'trained_parameters.json', 'w', encoding='utf-8') as outfile: json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)
def predict_unseen_data(): trained_dir = sys.argv[1] if not trained_dir.endswith('/'): trained_dir += '/' test_file = sys.argv[2] params, words_index, labels, embedding_mat = load_trained_params( trained_dir) x_, y_, df = load_test_data(test_file, labels) x_ = data_helper.pad_sentences( x_, forced_sequence_length=params['sequence_length']) x_ = map_word_to_index(x_, words_index) x_test, y_test = np.asarray(x_), None if y_ is not None: y_test = np.asarray(y_) timestamp = trained_dir.split('/')[-2].split('_')[-1] predicted_dir = './predicted_results_' + timestamp + '/' if os.path.exists(predicted_dir): shutil.rmtree(predicted_dir) os.makedirs(predicted_dir) with tf.Graph().as_default(): session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat, non_static=params['non_static'], hidden_unit=params['hidden_unit'], sequence_length=len(x_test[0]), max_pool_size=params['max_pool_size'], filter_sizes=map( int, params['filter_sizes'].split(",")), num_filters=params['num_filters'], num_classes=len(labels), embedding_size=params['embedding_dim'], l2_reg_lambda=params['l2_reg_lambda']) def real_len(batches): return [ np.ceil( np.argmin(batch + [0]) * 1.0 / params['max_pool_size']) for batch in batches ] def predict_step(x_batch): feed_dict = { cnn_rnn.input_x: x_batch, cnn_rnn.dropout_keep_prob: 1.0, cnn_rnn.batch_size: len(x_batch), cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]), cnn_rnn.real_len: real_len(x_batch), } predictions = sess.run([cnn_rnn.predictions], feed_dict) return predictions checkpoint_file = trained_dir + 'best_model.ckpt' saver = tf.train.Saver(tf.all_variables()) saver = tf.train.import_meta_graph( "{}.meta".format(checkpoint_file)) saver.restore(sess, checkpoint_file) logging.critical('{} has been loaded'.format(checkpoint_file)) batches = data_helper.batch_iter(list(x_test), params['batch_size'], 1, shuffle=False) predictions, predict_labels = [], [] for x_batch in batches: batch_predictions = predict_step(x_batch)[0] for batch_prediction in batch_predictions: predictions.append(batch_prediction) predict_labels.append(labels[batch_prediction]) df['PREDICTED'] = predict_labels columns = sorted(df.columns, reverse=True) df.to_csv(predicted_dir + 'predictions_all.csv', index=False, columns=columns, sep='|') if y_test is not None: y_test = np.array(np.argmax(y_test, axis=1)) accuracy = sum(np.array(predictions) == y_test) / float( len(y_test)) logging.critical( 'The prediction accuracy is: {}'.format(accuracy)) logging.critical( 'Prediction is complete, all files have been saved: {}'.format( predicted_dir))
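
# --- Illustrative sketch (an assumption; the real helper lives elsewhere ---
# in each project and is not shown in these sources) --------------------------
# The predict functions above call load_trained_params(trained_dir). Given
# the four files the training functions write out, a matching loader would
# mirror the inline loading code in predict_cnn_rnn:
import json
import pickle
import numpy as np

def load_trained_params_sketch(trained_dir):
    params = json.loads(
        open(trained_dir + 'trained_parameters.json',
             encoding='utf-8').read())
    words_index = json.loads(
        open(trained_dir + 'words_index.json', encoding='utf-8').read())
    labels = json.loads(
        open(trained_dir + 'labels.json', encoding='utf-8').read())
    with open(trained_dir + 'embeddings.pickle', 'rb') as input_file:
        embedding_mat = np.array(pickle.load(input_file), dtype=np.float32)
    return params, words_index, labels, embedding_mat
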
def train_model():
    datafile = 'toxic_comments.csv'
    # x_: data, y_: labels, vocabulary: word -> index, vocabulary_inv:
    # index -> word, df: pandas DataFrame, labels: label names
    x_, y_, vocabulary, vocabulary_inv, df, labels = Data_preprocess.load_data(
        datafile)
    params = {
        "batch_size": 16,
        "dropout_keep_prob": 0.5,
        "embedding_dim": len(vocabulary),
        "evaluate_every": 200,
        "filter_sizes": "3,4,5",
        "hidden_unit": 300,
        "l2_reg_lambda": 0.0,
        "max_pool_size": 4,
        "non_static": False,
        "num_epochs": 128,  # raised from 1 to 128
        "num_filters": 32
    }

    # Assign a 149998 dimension vector to each word.
    word_embeddings = Data_preprocess.load_embeddings(vocabulary)
    embedding_mat = [
        word_embeddings[word] for index, word in enumerate(vocabulary_inv)
    ]
    embedding_mat = np.array(embedding_mat, dtype=np.float32)

    # Split the original dataset into a train set and a test set.
    # y_ here holds only a single class -- the first one, "toxic".
    x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1)
    # Split the train set further into a train set and a dev set.
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1)
    print('x_train', x_train)
    print('y_train', y_train)
    print('y_train.shape', y_train.shape[1])
    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(
        len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(
        len(y_train), len(y_dev), len(y_test)))

    # Create a directory; everything related to this training run is saved there
    timestamp = str(int(time.time()))
    trained_dir = './trained_results_' + timestamp + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(
                embedding_mat=embedding_mat,
                sequence_length=x_train.shape[1],
                num_classes=y_train.shape[1],
                non_static=params['non_static'],
                hidden_unit=params['hidden_unit'],
                max_pool_size=params['max_pool_size'],
                filter_sizes=map(int, params['filter_sizes'].split(",")),
                num_filters=params['num_filters'],
                embedding_size=params['embedding_dim'],
                l2_reg_lambda=params['l2_reg_lambda'])
            global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Checkpoint files will be saved in this directory during training
            checkpoint_dir = './checkpoints_' + timestamp + '/'
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
                return [
                    np.ceil(np.argmin(batch + [0]) * 1.0 /
                            params['max_pool_size']) for batch in batches
                ]

            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: params['dropout_keep_prob'],
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1,
                                           params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy],
                    feed_dict)

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1,
                                           params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                step, loss, accuracy, num_correct, predictions = sess.run([
                    global_step, cnn_rnn.loss, cnn_rnn.accuracy,
                    cnn_rnn.num_correct, cnn_rnn.predictions
                ], feed_dict)
                return accuracy, loss, num_correct, predictions

            saver = tf.train.Saver()
            sess.run(tf.global_variables_initializer())

            # Training starts here
            train_batches = Data_preprocess.batch_iter(
                list(zip(x_train, y_train)), params['batch_size'],
                params['num_epochs'])
            best_accuracy, best_at_step = 0, 0

            # Train the model with x_train and y_train
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                # Evaluate the model with x_dev and y_dev
                if current_step % params['evaluate_every'] == 0:
                    dev_batches = Data_preprocess.batch_iter(
                        list(zip(x_dev, y_dev)), params['batch_size'], 1)
                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        acc, loss, num_dev_correct, predictions = dev_step(
                            x_dev_batch, y_dev_batch)
                        total_dev_correct += num_dev_correct
                    accuracy = float(total_dev_correct) / len(y_dev)
                    logging.info('Accuracy on dev set: {}'.format(accuracy))
                    if accuracy >= best_accuracy:
                        best_accuracy, best_at_step = accuracy, current_step
                        path = saver.save(sess, checkpoint_prefix,
                                          global_step=current_step)
                        logging.critical('Saved model {} at step {}'.format(
                            path, best_at_step))
                        logging.critical('Best accuracy {} at step {}'.format(
                            best_accuracy, best_at_step))
            logging.critical('Training is complete, testing the best model '
                             'on x_test and y_test')

            # Save the model files to trained_dir. predict.py needs trained model files.
            saver.save(sess, trained_dir + "best_model.ckpt")

            # Evaluate x_test and y_test
            saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
            test_batches = Data_preprocess.batch_iter(
                list(zip(x_test, y_test)), params['batch_size'], 1,
                shuffle=False)
            total_test_correct = 0
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                acc, loss, num_test_correct, predictions = dev_step(
                    x_test_batch, y_test_batch)
                total_test_correct += int(num_test_correct)
            logging.critical('Accuracy on test set: {}'.format(
                float(total_test_correct) / len(y_test)))

            # Save trained parameters and files since predict.py needs them
            with open(trained_dir + 'words_index.json', 'w') as outfile:
                json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
            with open(trained_dir + 'embeddings.pickle', 'wb') as outfile:
                pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
            with open(trained_dir + 'labels.json', 'w') as outfile:
                json.dump(labels, outfile, indent=4, ensure_ascii=False)
            params['sequence_length'] = x_train.shape[1]
            with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
                json.dump(params, outfile, indent=4, sort_keys=True,
                          ensure_ascii=False)