def do_eval(sess, train_q, train_a, train_lab):
    """Evaluate the sentence-pair model on one data split.

    Iterates over shuffled mini-batches of ``(train_q, train_a, train_lab)``,
    runs ``loss_``, ``accuracy_`` and ``logits_`` with dropout disabled
    (``keep_prob`` fed as 1.0) and accumulates loss, accuracy and the
    confusion-matrix counts returned by ``compute_confuse_matrix``.

    Args:
        sess: session object whose ``run`` method evaluates the graph tensors.
        train_q: first sentences of each pair (indexable by example id).
        train_a: second sentences of each pair (indexable by example id).
        train_lab: labels; used to index ``np.eye(opt.category)``, so they
            must be integer class ids.

    Returns:
        Tuple ``(mean_loss, mean_accuracy, f1_score, precision, recall,
        weights_label)``. The means are per mini-batch. Precision, recall,
        F1 and the means fall back to 0.0 when their denominator is zero
        instead of raising ``ZeroDivisionError``.
    """
    eval_loss, eval_accc, eval_counter = 0.0, 0.0, 0
    eval_true_positive, eval_false_positive = 0, 0
    eval_true_negative, eval_false_negative = 0, 0
    # Per the original note: weights_label[label_index] = (number, correct).
    weights_label = {}
    weights = np.ones((opt.batch_size))

    kf_train = get_minibatches_idx(len(train_q), opt.batch_size, shuffle=True)
    for _, train_index in kf_train:
        train_sents_1 = [train_q[t] for t in train_index]
        train_sents_2 = [train_a[t] for t in train_index]
        train_labels_array = np.array([train_lab[t] for t in train_index])
        # One-hot encode the integer labels for the y_ placeholder.
        train_labels = np.eye(opt.category)[train_labels_array]

        x_train_batch_1, x_train_mask_1 = prepare_data_for_emb(
            train_sents_1, opt)
        x_train_batch_2, x_train_mask_2 = prepare_data_for_emb(
            train_sents_2, opt)

        curr_eval_loss, curr_accc, logits = sess.run(
            [loss_, accuracy_, logits_],
            feed_dict={
                x_1_: x_train_batch_1,
                x_2_: x_train_batch_2,
                x_mask_1_: x_train_mask_1,
                x_mask_2_: x_train_mask_2,
                y_: train_labels,
                opt.weights_label: weights,
                keep_prob: 1.0
            })

        # logits: [batch_size, label_size]
        (true_positive, false_positive,
         true_negative, false_negative) = compute_confuse_matrix(
             logits, train_labels)

        # Loss and accuracy are accumulated here and normalised at the end.
        eval_loss += curr_eval_loss
        eval_accc += curr_accc
        eval_counter += 1

        weights_label = compute_labels_weights(
            weights_label, logits, train_labels_array)

        eval_true_positive += true_positive
        eval_false_positive += false_positive
        eval_true_negative += true_negative
        eval_false_negative += false_negative

    print("true_positive:", eval_true_positive, ";false_positive:",
          eval_false_positive, ";true_negative:", eval_true_negative,
          ";false_negative:", eval_false_negative)

    # Guard every ratio: a split without predicted/actual positives (or an
    # empty split) must not crash the evaluation.
    predicted_positive = eval_true_positive + eval_false_positive
    actual_positive = eval_true_positive + eval_false_negative
    if predicted_positive:
        p = float(eval_true_positive) / float(predicted_positive)
    else:
        p = 0.0
    if actual_positive:
        r = float(eval_true_positive) / float(actual_positive)
    else:
        r = 0.0
    f1_score = (2 * p * r) / (p + r) if (p + r) else 0.0

    print("eval_counter:", eval_counter, ";eval_acc:", eval_accc)
    if eval_counter == 0:
        return 0.0, 0.0, f1_score, p, r, weights_label
    return (eval_loss / float(eval_counter), eval_accc / float(eval_counter),
            f1_score, p, r, weights_label)
def predict(args):
    """Read reviews from stdin and write one prediction line per review.

    Loads a ``LEAM`` model from the checkpoint at ``args.save_path`` (key
    ``'model_state_dict'``), tokenises every stdin line with ``jieba`` and,
    depending on ``args.predict``:

    * ``'key_words'``: writes the tokens with the highest ``beta`` scores
      (between 1 and 5 tokens, roughly one per five tokens of the review);
    * ``'pos'``: writes the class name (``'pos'`` / ``'neg'``) of the
      arg-max logit;
    * anything else: prints ``"oops, mistake!"``.

    A line that tokenises to nothing produces the message
    ``'empty review cannot be predicted'`` and is skipped.

    Args:
        args: options object; this function reads ``save_path``,
            ``word2idx`` and ``predict`` and passes the object to ``LEAM``
            and ``prepare_data_for_emb``.
    """
    print("loading model")
    model = LEAM(args)
    model.eval()
    checkpoint = torch.load(args.save_path)
    model.load_state_dict(checkpoint['model_state_dict'])

    predicted_sents = sys.stdin.readlines()
    class_name = ['pos', 'neg']
    for sent in predicted_sents:
        temp = list(jieba.cut(sent.strip('\n'), cut_all=False))
        if not temp:
            sys.stdout.write('empty review cannot be predicted\n')
            # Previously execution fell through and fed an empty token list
            # to the model; skip the line instead.
            continue

        x = convert_word2idx([temp], args.word2idx)
        x, x_mask = prepare_data_for_emb(x, args)
        logits, _, beta = model(x, x_mask)

        if args.predict == 'key_words':
            n = len(temp)
            beta = np.array(beta.squeeze().data)
            idx = beta.argsort()
            num_key = max(min(n // 5, 5), 1)
            key_words = []
            # Walk positions from highest to lowest score; positions >= n
            # do not correspond to a token of this review and are ignored.
            for position in idx[::-1]:
                if len(key_words) >= num_key:
                    break
                if position < n:
                    key_words.append(temp[position])
            sys.stdout.write(' '.join(key_words) + '\n')
        elif args.predict == 'pos':
            class_x = torch.max(logits, 1)[1]
            sys.stdout.write(class_name[class_x[0]] + '\n')
        else:
            print("oops, mistake!")
def main():
    """Train and evaluate the embedding classifier on ``./data/yahoo.p``.

    Loads the pickled dataset, optionally sub-samples the training set
    (``opt.part_data`` / ``opt.portion``), loads pre-trained embeddings from
    ``./param_g.npz`` when their shape matches, builds the graph through
    ``emb_classifier`` and trains for ``opt.max_epochs`` epochs. Every
    ``opt.valid_freq`` updates it reports accuracy on up to 500 training and
    up to 20000 validation examples; when validation accuracy improves, the
    full test set is evaluated. After each epoch the embedding matrix is
    dumped to ``yahoo_emb_max_300.p`` (once a test batch is available to
    feed, see the note in the loop).
    """
    num_class = 10  # width of the label vectors fed to y_

    # Prepare training and testing data
    loadpath = "./data/yahoo.p"
    # NOTE(review): unpickling executes arbitrary code; only load trusted files.
    with open(loadpath, "rb") as data_file:
        x = cPickle.load(data_file)
    train, val, test = x[0], x[1], x[2]
    train_lab, val_lab, test_lab = x[3], x[4], x[5]
    wordtoix, ixtoword = x[6], x[7]

    train_lab = np.array(train_lab, dtype='float32')
    val_lab = np.array(val_lab, dtype='float32')
    test_lab = np.array(test_lab, dtype='float32')

    opt = Options()
    opt.n_words = len(ixtoword)
    del x

    print(dict(opt))
    print('Total words: %d' % opt.n_words)

    if opt.part_data:
        # Fixed seed so the retained training subset is reproducible.
        np.random.seed(123)
        train_ind = np.random.choice(len(train),
                                     int(len(train) * opt.portion),
                                     replace=False)
        train = [train[t] for t in train_ind]
        train_lab = [train_lab[t] for t in train_ind]

    try:
        params = np.load('./param_g.npz')
        if params['Wemb'].shape == (opt.n_words, opt.embed_size):
            print('Use saved embedding.')
            opt.W_emb = params['Wemb']
        else:
            print('Emb Dimension mismatch: param_g.npz:' +
                  str(params['Wemb'].shape) + ' opt: ' +
                  str((opt.n_words, opt.embed_size)))
            opt.fix_emb = False
    except IOError:
        print('No embedding file found.')
        opt.fix_emb = False

    with tf.device('/gpu:1'):
        x_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.maxlen])
        x_mask_ = tf.placeholder(tf.float32,
                                 shape=[opt.batch_size, opt.maxlen])
        keep_prob = tf.placeholder(tf.float32)
        y_ = tf.placeholder(tf.float32, shape=[opt.batch_size, num_class])
        accuracy_, loss_, train_op, W_emb_ = emb_classifier(
            x_, x_mask_, y_, keep_prob, opt)

    uidx = 0
    max_val_accuracy = 0.
    max_test_accuracy = 0.
    config = tf.ConfigProto(log_device_placement=False,
                            allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    np.set_printoptions(precision=3)
    np.set_printoptions(threshold=np.inf)
    saver = tf.train.Saver()

    with tf.Session(config=config) as sess:
        train_writer = tf.summary.FileWriter(opt.log_path + '/train',
                                             sess.graph)
        test_writer = tf.summary.FileWriter(opt.log_path + '/test',
                                            sess.graph)
        sess.run(tf.global_variables_initializer())

        def _accuracy(sents_all, labels_all, n_examples):
            """Return (accuracy on the first n_examples, last input batch)."""
            correct = 0.0
            last_batch = None
            kf_eval = get_minibatches_idx(n_examples, opt.batch_size,
                                          shuffle=True)
            for _, index in kf_eval:
                batch_sents = [sents_all[t] for t in index]
                batch_labels = np.array([labels_all[t] for t in index])
                batch_labels = batch_labels.reshape(
                    (len(batch_labels), num_class))
                x_batch, x_batch_mask = prepare_data_for_emb(
                    batch_sents, opt)
                accuracy = sess.run(accuracy_,
                                    feed_dict={
                                        x_: x_batch,
                                        x_mask_: x_batch_mask,
                                        y_: batch_labels,
                                        keep_prob: 1.0
                                    })
                correct += accuracy * len(index)
                last_batch = x_batch
            mean = correct / n_examples if n_examples else 0.0
            return mean, last_batch

        if opt.restore:
            try:
                t_vars = tf.trainable_variables()
                save_keys = tensors_key_in_file(opt.save_path)
                ss = set([var.name for var in t_vars]) & set(
                    [s + ":0" for s in save_keys.keys()])
                cc = {var.name: var for var in t_vars}
                # only restore variables with correct shape
                ss_right_shape = set([
                    s for s in ss
                    if cc[s].get_shape() == save_keys[s[:-2]]
                ])
                loader = tf.train.Saver(var_list=[
                    var for var in t_vars if var.name in ss_right_shape
                ])
                loader.restore(sess, opt.save_path)
                print("Loading variables from '%s'." % opt.save_path)
                print("Loaded variables:" + str(ss))
            except Exception as err:
                # ``Exception`` (not a bare except) so Ctrl-C still aborts.
                print("No saving session, using random initialization")
                print("Restore failed with: %r" % (err,))
                sess.run(tf.global_variables_initializer())

        # Evaluation sample sizes, capped so short datasets are not indexed
        # out of range.
        n_train_eval = min(500, len(train))
        n_val_eval = min(20000, len(val))
        # Last test batch seen; fed when fetching W_emb_ after each epoch.
        x_test_batch = None

        try:
            for epoch in range(opt.max_epochs):
                print("Starting epoch %d" % epoch)
                kf = get_minibatches_idx(len(train), opt.batch_size,
                                         shuffle=True)
                for _, train_index in kf:
                    uidx += 1
                    sents = [train[t] for t in train_index]
                    x_labels = np.array([train_lab[t] for t in train_index])
                    x_labels = x_labels.reshape((len(x_labels), num_class))
                    x_batch, x_batch_mask = prepare_data_for_emb(sents, opt)

                    _, loss = sess.run(
                        [train_op, loss_],
                        feed_dict={
                            x_: x_batch,
                            x_mask_: x_batch_mask,
                            y_: x_labels,
                            keep_prob: opt.drop_rate
                        })

                    if uidx % opt.valid_freq != 0:
                        continue

                    train_accuracy, _ = _accuracy(train, train_lab,
                                                  n_train_eval)
                    print("Iteration %d: Training loss %f " % (uidx, loss))
                    print("Train accuracy %f " % train_accuracy)

                    val_accuracy, _ = _accuracy(val, val_lab, n_val_eval)
                    print("Validation accuracy %f " % val_accuracy)

                    if val_accuracy > max_val_accuracy:
                        max_val_accuracy = val_accuracy
                        test_accuracy, last_batch = _accuracy(
                            test, test_lab, len(test))
                        if last_batch is not None:
                            x_test_batch = last_batch
                        print("Test accuracy %f " % test_accuracy)
                        max_test_accuracy = test_accuracy

                print("Epoch %d: Max Test accuracy %f" %
                      (epoch, max_test_accuracy))

                # The original fed the last test batch here and raised
                # NameError when no test pass had happened yet; skip the
                # dump until such a batch exists.
                if x_test_batch is not None:
                    emb = sess.run(W_emb_, feed_dict={x_: x_test_batch})
                    with open("yahoo_emb_max_300.p", "wb") as emb_file:
                        cPickle.dump([emb], emb_file)

            print("Max Test accuracy %f " % max_test_accuracy)

        except KeyboardInterrupt:
            print('Training interupted')
            print("Max Test accuracy %f " % max_test_accuracy)
def main():
    """Train and evaluate the multi-label classifier on ``./data/mimic3.p``.

    Loads the pickled dataset and the embedding file ``mimic3_emb.p``,
    builds the graph through ``emb_classifier`` (50 classes) and trains for
    ``opt.max_epochs`` epochs. Every ``opt.valid_freq`` updates it prints
    the accuracy on up to 500 training examples, micro/macro ROC-AUC on the
    validation set, and micro/macro ROC-AUC, micro/macro F1 (threshold 0.5
    on ``prob_``) and precision@5 on the test set. A checkpoint is saved to
    ``opt.save_path`` after every epoch.
    """
    # Prepare training and testing data
    opt = Options()
    # load data
    loadpath = "./data/mimic3.p"
    embpath = "mimic3_emb.p"
    opt.num_class = 50

    # NOTE(review): unpickling executes arbitrary code; only load trusted files.
    with open(loadpath, "rb") as data_file:
        x = cPickle.load(data_file)
    train, train_text, train_lab = x[0], x[1], x[2]
    val, val_text, val_lab = x[3], x[4], x[5]
    test, test_text, test_lab = x[6], x[7], x[8]
    wordtoix, ixtoword = x[10], x[9]
    del x
    print("load data finished")

    train_lab = np.array(train_lab, dtype='float32')
    val_lab = np.array(val_lab, dtype='float32')
    test_lab = np.array(test_lab, dtype='float32')
    opt.n_words = len(ixtoword)

    if opt.part_data:
        train_ind = np.random.choice(len(train),
                                     int(len(train) * opt.portion),
                                     replace=False)
        train = [train[t] for t in train_ind]
        train_lab = [train_lab[t] for t in train_ind]

    os.environ['CUDA_VISIBLE_DEVICES'] = str(opt.GPUID)

    print(dict(opt))
    print('Total words: %d' % opt.n_words)

    try:
        with open(embpath, 'rb') as emb_file:
            opt.W_emb = np.array(cPickle.load(emb_file), dtype='float32')
        opt.W_class_emb = load_class_embedding(wordtoix, opt)
    except IOError:
        print('No embedding file found.')
        opt.fix_emb = False

    with tf.device('/gpu:1'):
        x_ = tf.placeholder(tf.int32,
                            shape=[opt.batch_size, opt.maxlen],
                            name='x_')
        x_mask_ = tf.placeholder(tf.float32,
                                 shape=[opt.batch_size, opt.maxlen],
                                 name='x_mask_')
        keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        y_ = tf.placeholder(tf.float32,
                            shape=[opt.batch_size, opt.num_class],
                            name='y_')
        class_penalty_ = tf.placeholder(tf.float32, shape=())
        (accuracy_, loss_, train_op, W_norm_, global_step, logits_,
         prob_) = emb_classifier(x_, x_mask_, y_, keep_prob, opt,
                                 class_penalty_)

    uidx = 0
    max_test_accuracy = 0.
    max_test_auc_mean = 0.

    config = tf.ConfigProto(
        log_device_placement=False,
        allow_soft_placement=True,
    )
    config.gpu_options.allow_growth = True
    np.set_printoptions(precision=3)
    np.set_printoptions(threshold=np.inf)
    saver = tf.train.Saver()

    def _per_class_auc_mean(true_array, score_array):
        """Mean ROC-AUC over the classes with at least one positive label."""
        auc_list = [
            roc_auc_score(y_true=true_array[:, i],
                          y_score=score_array[:, i])
            for i in range(opt.num_class)
            if np.max(true_array[:, i] > 0)
        ]
        return float(np.mean(auc_list)) if auc_list else 0.0

    with tf.Session(config=config) as sess:
        train_writer = tf.summary.FileWriter(opt.log_path + '/train',
                                             sess.graph)
        test_writer = tf.summary.FileWriter(opt.log_path + '/test',
                                            sess.graph)
        sess.run(tf.global_variables_initializer())

        def _evaluate(sents_all, labels_all):
            """Run one full pass; return (accuracy, logits, probs, labels)."""
            correct = 0.0
            logits_list, prob_list, true_list = [], [], []
            kf_eval = get_minibatches_idx(len(sents_all), opt.batch_size,
                                          shuffle=True)
            for _, index in kf_eval:
                batch_sents = [sents_all[t] for t in index]
                batch_labels = np.array([labels_all[t] for t in index])
                batch_labels = batch_labels.reshape(
                    (len(batch_labels), opt.num_class))
                x_batch, x_batch_mask = prepare_data_for_emb(
                    batch_sents, opt)
                accuracy, logits, probs = sess.run(
                    [accuracy_, logits_, prob_],
                    feed_dict={
                        x_: x_batch,
                        x_mask_: x_batch_mask,
                        y_: batch_labels,
                        keep_prob: 1.0,
                        class_penalty_: 0.0
                    })
                # Counted once per batch (the test loop used to add this
                # twice, doubling the reported accuracy).
                correct += accuracy * len(index)
                logits_list += logits.tolist()
                prob_list += probs.tolist()
                true_list += batch_labels.tolist()
            n_examples = len(sents_all)
            mean = correct / n_examples if n_examples else 0.0
            return (mean, np.asarray(logits_list), np.asarray(prob_list),
                    np.asarray(true_list))

        if opt.restore:
            try:
                t_vars = tf.trainable_variables()
                save_keys = tensors_key_in_file(opt.save_path)
                ss = set([var.name for var in t_vars]) & set(
                    [s + ":0" for s in save_keys.keys()])
                cc = {var.name: var for var in t_vars}
                # only restore variables with correct shape
                ss_right_shape = set([
                    s for s in ss
                    if cc[s].get_shape() == save_keys[s[:-2]]
                ])
                loader = tf.train.Saver(var_list=[
                    var for var in t_vars if var.name in ss_right_shape
                ])
                loader.restore(sess, opt.save_path)
                print("Loading variables from '%s'." % opt.save_path)
                print("Loaded variables:" + str(ss))
            except Exception as err:
                # ``Exception`` (not a bare except) so Ctrl-C still aborts.
                print("No saving session, using random initialization")
                print("Restore failed with: %r" % (err,))
                sess.run(tf.global_variables_initializer())

        # Training accuracy is estimated on a small sample, capped so a
        # short training set is not indexed out of range.
        n_train_eval = min(500, len(train))

        try:
            for epoch in range(opt.max_epochs):
                print("Starting epoch %d" % epoch)
                kf = get_minibatches_idx(len(train), opt.batch_size,
                                         shuffle=True)
                for _, train_index in kf:
                    uidx += 1
                    sents = [train[t] for t in train_index]
                    x_labels = np.array([train_lab[t] for t in train_index])
                    x_labels = x_labels.reshape(
                        (len(x_labels), opt.num_class))
                    x_batch, x_batch_mask = prepare_data_for_emb(sents, opt)

                    _, loss, step, = sess.run(
                        [train_op, loss_, global_step],
                        feed_dict={
                            x_: x_batch,
                            x_mask_: x_batch_mask,
                            y_: x_labels,
                            keep_prob: opt.dropout,
                            class_penalty_: opt.class_penalty
                        })

                    if uidx % opt.valid_freq != 0:
                        continue

                    train_correct = 0.0
                    kf_train = get_minibatches_idx(n_train_eval,
                                                   opt.batch_size,
                                                   shuffle=True)
                    for _, sample_index in kf_train:
                        train_sents = [train[t] for t in sample_index]
                        train_labels = np.array(
                            [train_lab[t] for t in sample_index])
                        train_labels = train_labels.reshape(
                            (len(train_labels), opt.num_class))
                        x_train_batch, x_train_batch_mask = (
                            prepare_data_for_emb(train_sents, opt))
                        train_accuracy = sess.run(
                            accuracy_,
                            feed_dict={
                                x_: x_train_batch,
                                x_mask_: x_train_batch_mask,
                                y_: train_labels,
                                keep_prob: 1.0,
                                class_penalty_: 0.0
                            })
                        train_correct += train_accuracy * len(sample_index)
                    train_accuracy = (train_correct / n_train_eval
                                      if n_train_eval else 0.0)

                    print("Iteration %d: Training loss %f " % (uidx, loss))
                    print("Train accuracy %f " % train_accuracy)

                    # ---- validation ----
                    (val_accuracy, val_logits_array, val_prob_array,
                     val_true_array) = _evaluate(val, val_lab)
                    val_auc_micro = roc_auc_score(y_true=val_true_array,
                                                  y_score=val_logits_array,
                                                  average='micro')
                    val_auc_macro = roc_auc_score(y_true=val_true_array,
                                                  y_score=val_logits_array,
                                                  average='macro')
                    # Mean over all scored classes (previously only the last
                    # class's AUC was averaged).
                    val_auc_mean = _per_class_auc_mean(
                        val_true_array, val_logits_array)
                    print("val auc macro %f micro %f " %
                          (val_auc_macro, val_auc_micro))

                    # ---- test (run on every validation step) ----
                    (test_accuracy, test_logits_array, test_prob_array,
                     test_true_array) = _evaluate(test, test_lab)
                    test_auc_micro = roc_auc_score(
                        y_true=test_true_array,
                        y_score=test_logits_array,
                        average='micro')
                    test_auc_macro = roc_auc_score(
                        y_true=test_true_array,
                        y_score=test_logits_array,
                        average='macro')
                    test_f1_micro = micro_f1(
                        test_prob_array.ravel() > 0.5,
                        test_true_array.ravel(),
                    )
                    test_f1_macro = macro_f1(
                        test_prob_array > 0.5,
                        test_true_array,
                    )
                    test_p5 = precision_at_k(test_logits_array,
                                             test_true_array, 5)
                    test_auc_mean = _per_class_auc_mean(
                        test_true_array, test_logits_array)

                    print("Test auc macro %f micro %f " %
                          (test_auc_macro, test_auc_micro))
                    print("Test f1 macro %f micro %f " %
                          (test_f1_macro, test_f1_micro))
                    print("P5 %f" % test_p5)
                    # Despite the name this holds the most recent test AUC,
                    # not a running maximum (kept as in the original).
                    max_test_auc_mean = test_auc_mean

                print("Epoch %d: Max Test auc %f" %
                      (epoch, max_test_auc_mean))
                saver.save(sess, opt.save_path, global_step=epoch)
            print("Max Test accuracy %f " % max_test_accuracy)

        except KeyboardInterrupt:
            print('Training interupted')
            print("Max Test accuracy %f " % max_test_accuracy)
def main():
    """Train the classifier on the dataset named by ``opt.dataset``.

    Supported datasets: ``Tweet``, ``N20short``, ``agnews``, ``dbpedia`` and
    ``yelp_full``. The function loads the pickled data and embeddings,
    builds the graph through ``emb_classifier`` and trains for
    ``opt.max_epochs`` epochs, printing training/validation accuracy every
    ``opt.valid_freq`` updates and saving checkpoints after each epoch.
    Afterwards it predicts the test set in order, keeps the
    ``opt.topnlabel`` largest class probabilities of every document (all
    other entries are 0) and writes them to the file ``test``: first line is
    the number of test documents, then one space-separated row per document.

    Raises:
        ValueError: if ``opt.dataset`` is not one of the supported names
            (previously this surfaced later as an unbound-variable error).
    """
    # Prepare training and testing data
    opt = Options()
    # load data
    if opt.dataset == 'Tweet':
        loadpath = "./data/langdetect_tweet0.7.p"
        embpath = "./data/langdetect_tweet_emb.p"
        opt.num_class = 4
        opt.class_name = ['apple', 'google', 'microsoft', 'twitter']
    elif opt.dataset == 'N20short':
        loadpath = "./data/N20short.p"
        embpath = "./data/N20short_emb.p"
        opt.class_name = [
            'rec.autos', 'talk.politics.misc', 'sci.electronics',
            'comp.sys.ibm.pc.hardware', 'talk.politics.guns', 'sci.med',
            'rec.motorcycles', 'soc.religion.christian',
            'comp.sys.mac.hardware', 'comp.graphics', 'sci.space',
            'alt.atheism', 'rec.sport.baseball', 'comp.windows.x',
            'talk.religion.misc', 'comp.os.ms-windows.misc', 'misc.forsale',
            'talk.politics.mideast', 'sci.crypt', 'rec.sport.hockey'
        ]
        opt.num_class = len(opt.class_name)
    elif opt.dataset == 'agnews':
        loadpath = "./data/ag_news.p"
        embpath = "./data/ag_news_glove.p"
        opt.num_class = 4
        opt.class_name = ['World', 'Sports', 'Business', 'Science']
    elif opt.dataset == 'dbpedia':
        loadpath = "./data/dbpedia.p"
        embpath = "./data/dbpedia_glove.p"
        opt.num_class = 14
        opt.class_name = [
            'Company',
            'Educational Institution',
            'Artist',
            'Athlete',
            'Office Holder',
            'Mean Of Transportation',
            'Building',
            'Natural Place',
            'Village',
            'Animal',
            'Plant',
            'Album',
            'Film',
            'Written Work',
        ]
    elif opt.dataset == 'yelp_full':
        loadpath = "./data/yelp_full.p"
        embpath = "./data/yelp_full_glove.p"
        opt.num_class = 5
        opt.class_name = ['worst', 'bad', 'middle', 'good', 'best']
    else:
        raise ValueError("Unsupported dataset: %r" % (opt.dataset,))

    # NOTE(review): unpickling executes arbitrary code; only load trusted files.
    with open(loadpath, "rb") as data_file:
        x = cPickle.load(data_file, encoding='iso-8859-1')
    train, val, test = x[0], x[1], x[2]
    print(len(val))
    train_lab, val_lab, test_lab = x[3], x[4], x[5]
    wordtoix, ixtoword = x[6], x[7]
    del x
    print("len of train,val,test:", len(train), len(val), len(test))
    print("load data finished")

    train_lab = np.array(train_lab, dtype='float32')
    val_lab = np.array(val_lab, dtype='float32')
    test_lab = np.array(test_lab, dtype='float32')
    opt.n_words = len(ixtoword)

    if opt.part_data:
        train_ind = np.random.choice(len(train),
                                     int(len(train) * opt.portion),
                                     replace=False)
        train = [train[t] for t in train_ind]
        train_lab = [train_lab[t] for t in train_ind]

    os.environ['CUDA_VISIBLE_DEVICES'] = str(opt.GPUID)

    print(dict(opt))
    print('Total words: %d' % opt.n_words)

    try:
        with open(embpath, 'rb') as emb_file:
            opt.W_emb = np.array(cPickle.load(emb_file,
                                              encoding='iso-8859-1'),
                                 dtype='float32')
        opt.W_class_emb = load_class_embedding(wordtoix, opt)
    except IOError:
        print('No embedding file found.')
        opt.fix_emb = False

    with tf.device('/cpu:0'):
        x_ = tf.placeholder(tf.int32,
                            shape=[opt.batch_size, opt.maxlen],
                            name='x_')
        x_mask_ = tf.placeholder(tf.float32,
                                 shape=[opt.batch_size, opt.maxlen],
                                 name='x_mask_')
        keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        y_ = tf.placeholder(tf.float32,
                            shape=[opt.batch_size, opt.num_class],
                            name='y_')
        class_penalty_ = tf.placeholder(tf.float32, shape=())
        (accuracy_, loss_, train_op, W_norm_, global_step,
         prob_) = emb_classifier(x_, x_mask_, y_, keep_prob, opt,
                                 class_penalty_)

    uidx = 0
    max_val_accuracy = 0.
    max_test_accuracy = 0.

    config = tf.ConfigProto(
        log_device_placement=False,
        allow_soft_placement=True,
    )
    config.gpu_options.allow_growth = True
    np.set_printoptions(precision=3)
    np.set_printoptions(threshold=np.inf)
    saver = tf.train.Saver()

    with tf.Session(config=config) as sess:
        train_writer = tf.summary.FileWriter(opt.log_path + '/train',
                                             sess.graph)
        test_writer = tf.summary.FileWriter(opt.log_path + '/test',
                                            sess.graph)
        sess.run(tf.global_variables_initializer())

        def _accuracy(sents_all, labels_all, n_examples):
            """Return the accuracy over the first ``n_examples`` examples."""
            correct = 0.0
            kf_eval = get_minibatches_idx(n_examples, opt.batch_size,
                                          shuffle=True)
            for _, index in kf_eval:
                batch_sents = [sents_all[t] for t in index]
                batch_labels = np.array([labels_all[t] for t in index])
                batch_labels = batch_labels.reshape(
                    (len(batch_labels), opt.num_class))
                x_batch, x_batch_mask = prepare_data_for_emb(
                    batch_sents, opt)
                accuracy = sess.run(accuracy_,
                                    feed_dict={
                                        x_: x_batch,
                                        x_mask_: x_batch_mask,
                                        y_: batch_labels,
                                        keep_prob: 1.0,
                                        class_penalty_: 0.0
                                    })
                correct += accuracy * len(index)
            return correct / n_examples if n_examples else 0.0

        if opt.restore:
            try:
                t_vars = tf.trainable_variables()
                save_keys = tensors_key_in_file(opt.save_path)
                ss = set([var.name for var in t_vars]) & set(
                    [s + ":0" for s in save_keys.keys()])
                cc = {var.name: var for var in t_vars}
                # only restore variables with correct shape
                ss_right_shape = set([
                    s for s in ss
                    if cc[s].get_shape() == save_keys[s[:-2]]
                ])
                loader = tf.train.Saver(var_list=[
                    var for var in t_vars if var.name in ss_right_shape
                ])
                loader.restore(sess, opt.save_path)
                print("Loading variables from '%s'." % opt.save_path)
                print("Loaded variables:" + str(ss))
            except Exception as err:
                # ``Exception`` (not a bare except) so Ctrl-C still aborts.
                print("No saving session, using random initialization")
                print("Restore failed with: %r" % (err,))
                sess.run(tf.global_variables_initializer())

        # Training accuracy is estimated on a small sample, capped so a
        # short training set is not indexed out of range.
        n_train_eval = min(500, len(train))

        try:
            for epoch in range(opt.max_epochs):
                print("Starting epoch %d" % epoch)
                kf = get_minibatches_idx(len(train), opt.batch_size,
                                         shuffle=True)
                for _, train_index in kf:
                    uidx += 1
                    sents = [train[t] for t in train_index]
                    x_labels = np.array([train_lab[t] for t in train_index])
                    x_labels = x_labels.reshape(
                        (len(x_labels), opt.num_class))
                    x_batch, x_batch_mask = prepare_data_for_emb(sents, opt)

                    _, loss, step, = sess.run(
                        [train_op, loss_, global_step],
                        feed_dict={
                            x_: x_batch,
                            x_mask_: x_batch_mask,
                            y_: x_labels,
                            keep_prob: opt.dropout,
                            class_penalty_: opt.class_penalty
                        })

                    if uidx % opt.valid_freq != 0:
                        continue

                    train_accuracy = _accuracy(train, train_lab,
                                               n_train_eval)
                    print("Iteration %d: Training loss %f " % (uidx, loss))
                    print("Train accuracy %f " % train_accuracy)

                    val_accuracy = _accuracy(val, val_lab, len(val))
                    print("Validation accuracy %f " % val_accuracy)

                    if val_accuracy > max_val_accuracy:
                        max_val_accuracy = val_accuracy

                saver.save(sess, opt.save_path, global_step=epoch)
                saver.save(sess, "save_model/model.ckpt")

            # NOTE(review): SOURCE's indentation was lost; the prediction
            # pass is assumed to run once, after the last epoch - confirm.
            topnlabel_docwithoutlabel = []  # was never initialised before
            test_correct = 0.0
            # shuffle=False keeps the output rows in test-set order.
            kf_test = get_minibatches_idx(len(test), opt.batch_size,
                                          shuffle=False)
            for _, test_index in kf_test:
                test_sents = [test[t] for t in test_index]
                test_labels = np.array([test_lab[t] for t in test_index])
                test_labels = test_labels.reshape(
                    (len(test_labels), opt.num_class))
                x_test_batch, x_test_batch_mask = prepare_data_for_emb(
                    test_sents, opt)
                test_accuracy, predict_prob = sess.run(
                    [accuracy_, prob_],
                    feed_dict={
                        x_: x_test_batch,
                        x_mask_: x_test_batch_mask,
                        y_: test_labels,
                        keep_prob: 1.0,
                        class_penalty_: 0.0
                    })

                for prob in predict_prob:
                    topnlabel_onedoc = [0] * opt.num_class
                    remaining = np.array(prob, copy=True)
                    # Pick the largest remaining probability each round and
                    # mask only that entry, so tied values are not dropped.
                    for _ in range(min(opt.topnlabel, opt.num_class)):
                        best = int(np.argmax(remaining))
                        topnlabel_onedoc[best] = prob[best]
                        remaining[best] = -1
                    topnlabel_docwithoutlabel.append(topnlabel_onedoc)
                test_correct += test_accuracy * len(test_index)

            print(topnlabel_docwithoutlabel)
            test_accuracy = test_correct / len(test) if len(test) else 0.0
            print("Predict accuracy %f " % test_accuracy)
            max_test_accuracy = test_accuracy

            filename = 'test'
            with open(filename, 'w') as out_file:
                out_file.write(str(len(test)))
                out_file.write('\n')
                for topic_prob in topnlabel_docwithoutlabel:
                    print(topic_prob)
                    for prob_each_label in topic_prob:
                        out_file.write(str(prob_each_label))
                        out_file.write(" ")
                    out_file.write('\n')

        except KeyboardInterrupt:
            print('Training interupted')
            print("Max Test accuracy %f " % max_test_accuracy)
def main():
    """Train and evaluate the classifier on the dataset in ``opt.dataset``.

    Supported datasets: ``yahoo``, ``agnews``, ``dbpedia``, ``yelp_full``,
    ``Positive`` and ``Negative``. The function loads the pickled data and
    embeddings, builds the graph through ``emb_classifier`` and trains for
    ``opt.max_epochs`` epochs. Every ``opt.valid_freq`` updates it reports
    accuracy on up to 50 training examples, dumps ``W_norm_`` and
    ``W_class`` to ``weights.pkl``, reports validation accuracy and, when
    that improves, test accuracy. A checkpoint is saved to ``opt.save_path``
    after every epoch.

    Raises:
        ValueError: if ``opt.dataset`` is not one of the supported names
            (previously this surfaced later as an unbound-variable error).
    """
    # Prepare training and testing data
    opt = Options()
    # load data
    if opt.dataset == 'yahoo':
        loadpath = "./data/yahoo.p"
        embpath = "./data/yahoo_glove.p"
        opt.num_class = 10
        opt.class_name = [
            'Society Culture', 'Science Mathematics', 'Health',
            'Education Reference', 'Computers Internet', 'Sports',
            'Business Finance', 'Entertainment Music',
            'Family Relationships', 'Politics Government'
        ]
    elif opt.dataset == 'agnews':
        loadpath = "./data/ag_news.p"
        embpath = "./data/ag_news_glove.p"
        opt.num_class = 4
        opt.class_name = ['World', 'Sports', 'Business', 'Science']
    elif opt.dataset == 'dbpedia':
        loadpath = "./data/dbpedia.p"
        embpath = "./data/dbpedia_glove.p"
        opt.num_class = 14
        opt.class_name = [
            'Company',
            'Educational Institution',
            'Artist',
            'Athlete',
            'Office Holder',
            'Mean Of Transportation',
            'Building',
            'Natural Place',
            'Village',
            'Animal',
            'Plant',
            'Album',
            'Film',
            'Written Work',
        ]
    elif opt.dataset == 'yelp_full':
        loadpath = "./data/yelp_full.p"
        embpath = "./data/yelp_full_glove.p"
        opt.num_class = 5
        opt.class_name = ['worst', 'bad', 'middle', 'good', 'best']
    elif opt.dataset == 'Positive':
        loadpath = "./data/Positive.p"
        embpath = "./data/glove.p"
        opt.num_class = 2
        opt.class_name = ['Control', 'Other']
    elif opt.dataset == 'Negative':
        loadpath = "./data/Negative.p"
        embpath = "./data/glove.p"
        opt.num_class = 2
        opt.class_name = ['Control', 'Other']
    else:
        raise ValueError("Unsupported dataset: %r" % (opt.dataset,))

    # NOTE(review): unpickling executes arbitrary code; only load trusted files.
    with open(loadpath, "rb") as data_file:
        x = cPickle.load(data_file)
    train, val, test = x[0], x[1], x[2]
    train_lab, val_lab, test_lab = x[3], x[4], x[5]
    wordtoix, ixtoword = x[6], x[7]
    del x
    print("load data finished")

    train_lab = np.array(train_lab, dtype='float32')
    val_lab = np.array(val_lab, dtype='float32')
    test_lab = np.array(test_lab, dtype='float32')
    opt.n_words = len(ixtoword)

    if opt.part_data:
        train_ind = np.random.choice(len(train),
                                     int(len(train) * opt.portion),
                                     replace=False)
        train = [train[t] for t in train_ind]
        train_lab = [train_lab[t] for t in train_ind]

    os.environ['CUDA_VISIBLE_DEVICES'] = str(opt.GPUID)

    print(dict(opt))
    print('Total words: %d' % opt.n_words)

    try:
        with open(embpath, 'rb') as emb_file:
            opt.W_emb = np.array(cPickle.load(emb_file), dtype='float32')
    except IOError:
        print('No embedding file found.')
        opt.fix_emb = False

    with tf.device('/gpu:1'):
        x_ = tf.placeholder(tf.int32,
                            shape=[opt.batch_size, opt.maxlen],
                            name='x_')
        x_mask_ = tf.placeholder(tf.float32,
                                 shape=[opt.batch_size, opt.maxlen],
                                 name='x_mask_')
        keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        y_ = tf.placeholder(tf.float32,
                            shape=[opt.batch_size, opt.num_class],
                            name='y_')
        class_penalty_ = tf.placeholder(tf.float32, shape=())
        (accuracy_, loss_, train_op, W_norm_, W_class,
         global_step) = emb_classifier(x_, x_mask_, y_, keep_prob, opt,
                                       class_penalty_)

    uidx = 0
    max_val_accuracy = 0.
    max_test_accuracy = 0.

    config = tf.ConfigProto(
        log_device_placement=False,
        allow_soft_placement=True,
    )
    config.gpu_options.allow_growth = True
    np.set_printoptions(precision=3)
    np.set_printoptions(threshold=np.inf)
    saver = tf.train.Saver()

    with tf.Session(config=config) as sess:
        train_writer = tf.summary.FileWriter(opt.log_path + '/train',
                                             sess.graph)
        test_writer = tf.summary.FileWriter(opt.log_path + '/test',
                                            sess.graph)
        sess.run(tf.global_variables_initializer())

        def _accuracy(sents_all, labels_all, n_examples):
            """Return the accuracy over the first ``n_examples`` examples."""
            correct = 0.0
            kf_eval = get_minibatches_idx(n_examples, opt.batch_size,
                                          shuffle=True)
            for _, index in kf_eval:
                batch_sents = [sents_all[t] for t in index]
                batch_labels = np.array([labels_all[t] for t in index])
                batch_labels = batch_labels.reshape(
                    (len(batch_labels), opt.num_class))
                x_batch, x_batch_mask = prepare_data_for_emb(
                    batch_sents, opt)
                accuracy = sess.run(accuracy_,
                                    feed_dict={
                                        x_: x_batch,
                                        x_mask_: x_batch_mask,
                                        y_: batch_labels,
                                        keep_prob: 1.0,
                                        class_penalty_: 0.0
                                    })
                correct += accuracy * len(index)
            # Divide by the true count; the old ``len(...) + 0.1`` avoided a
            # zero division but slightly under-reported every accuracy.
            return correct / n_examples if n_examples else 0.0

        if opt.restore:
            try:
                t_vars = tf.trainable_variables()
                save_keys = tensors_key_in_file(opt.save_path)
                ss = set([var.name for var in t_vars]) & set(
                    [s + ":0" for s in save_keys.keys()])
                cc = {var.name: var for var in t_vars}
                # only restore variables with correct shape
                ss_right_shape = set([
                    s for s in ss
                    if cc[s].get_shape() == save_keys[s[:-2]]
                ])
                loader = tf.train.Saver(var_list=[
                    var for var in t_vars if var.name in ss_right_shape
                ])
                loader.restore(sess, opt.save_path)
                print("Loading variables from '%s'." % opt.save_path)
                print("Loaded variables:" + str(ss))
            except Exception as err:
                # ``Exception`` (not a bare except) so Ctrl-C still aborts.
                print("No saving session, using random initialization")
                print("Restore failed with: %r" % (err,))
                sess.run(tf.global_variables_initializer())

        # Training accuracy is estimated on a 50-example sample, capped so a
        # short training set is not indexed out of range.
        n_train_eval = min(50, len(train))

        try:
            for epoch in range(opt.max_epochs):
                print("Starting epoch %d" % epoch)
                kf = get_minibatches_idx(len(train), opt.batch_size,
                                         shuffle=True)
                for _, train_index in kf:
                    uidx += 1
                    sents = [train[t] for t in train_index]
                    x_labels = np.array([train_lab[t] for t in train_index])
                    x_labels = x_labels.reshape(
                        (len(x_labels), opt.num_class))
                    x_batch, x_batch_mask = prepare_data_for_emb(sents, opt)

                    _, loss, step, = sess.run(
                        [train_op, loss_, global_step],
                        feed_dict={
                            x_: x_batch,
                            x_mask_: x_batch_mask,
                            y_: x_labels,
                            keep_prob: opt.dropout,
                            class_penalty_: opt.class_penalty
                        })

                    if uidx % opt.valid_freq != 0:
                        continue

                    train_accuracy = _accuracy(train, train_lab,
                                               n_train_eval)

                    # Snapshot the current weights next to the metrics.
                    with open("weights.pkl", "wb") as handle:
                        pickle.dump(sess.run(W_norm_), handle)
                        pickle.dump(sess.run(W_class), handle)

                    print("Iteration %d: Training loss %f " % (uidx, loss))
                    print("Train accuracy %f " % train_accuracy)

                    val_accuracy = _accuracy(val, val_lab, len(val))
                    print("Validation accuracy %f " % val_accuracy)

                    if val_accuracy > max_val_accuracy:
                        max_val_accuracy = val_accuracy
                        test_accuracy = _accuracy(test, test_lab, len(test))
                        print("Test accuracy %f " % test_accuracy)
                        max_test_accuracy = test_accuracy

                print("Epoch %d: Max Test accuracy %f" %
                      (epoch, max_test_accuracy))
                saver.save(sess, opt.save_path, global_step=epoch)
            print("Max Test accuracy %f " % max_test_accuracy)

        except KeyboardInterrupt:
            print('Training interupted')
            print("Max Test accuracy %f " % max_test_accuracy)
def main():
    """Train and evaluate the sentence-pair classifier on the ATEC data.

    Steps:
      1. Build the vocabulary from the training CSV and load the cached
         train/valid/test split.
      2. Build the graph through ``auto_encoder`` (two sentence inputs, two
         masks, one-hot labels).
      3. Train for ``opt.max_epochs`` epochs.  After each epoch the model is
         scored with ``do_eval``; the per-label statistics it returns are
         converted into the weight dictionary used for the next epoch's
         batches, and a checkpoint is written when both accuracy and F1
         improve.
      4. Score the test split once after the last epoch.

    Changes compared with the previous revision:
      * the pickle cache is opened in binary mode;
      * the per-epoch "Validation" step scores the validation split instead
        of the training split;
      * precision / recall / F1 no longer divide by zero when a split has no
        (predicted) positives;
      * a failed restore no longer swallows ``KeyboardInterrupt``.
    """
    opt = Options()
    (vocabulary_word2index, vocabulary_index2word, vocabulary_label2index,
     vocabulary_index2label) = create_vocabulary(
         "data/atec_nlp_sim_train2.csv",
         opt.vocab_size,
         name_scope=opt.name_scope,
         tokenize_style=opt.tokenize_style)
    vocab_size = len(vocabulary_word2index)
    print("vocab_size:", vocab_size)
    num_classes = len(vocabulary_index2label)
    print("num_classes:", num_classes)

    # Pickle streams are binary; text mode fails (or corrupts data) on
    # Python 3, so the cache must be opened with "rb".
    with open("./cache_SWEM_1/train_valid_test.pik", "rb") as f:
        train, valid, test, true_label_percent = pickle.load(f)
    train_q, train_a, _, train_lab = train
    print("train_nums:", len(train_q))
    val_q, val_a, _, val_lab = valid
    test_q, test_a, _, test_lab = test

    opt.n_words = len(vocabulary_index2word)
    print(dict(opt))
    print('Total words: %d' % opt.n_words)

    # With part_data set, keep only a fixed-seed random subset of the
    # training examples (opt.portion is the fraction kept).
    if opt.part_data:
        np.random.seed(123)
        train_ind = np.random.choice(len(train_q),
                                     int(len(train_q) * opt.portion),
                                     replace=False)
        train_q = [train_q[t] for t in train_ind]
        train_a = [train_a[t] for t in train_ind]
        train_lab = [train_lab[t] for t in train_ind]

    # Use the saved embedding only if its shape matches the vocabulary.
    try:
        params = np.load('./data/snli_emb.p')
        if params[0].shape == (opt.n_words, opt.embed_size):
            print('Use saved embedding.')
            opt.W_emb = np.array(params[0], dtype='float32')
        else:
            print('Emb Dimension mismatch: param_g.npz:' +
                  str(params[0].shape) + ' opt: ' + str(
                      (opt.n_words, opt.embed_size)))
            opt.fix_emb = False
    except IOError:
        print('No embedding file found.')
        opt.fix_emb = False

    with tf.device('/gpu:0'):
        # Every example is a pair of sentences, so inputs and masks come in
        # pairs.
        x_1_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.maxlen])
        x_2_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.maxlen])
        x_mask_1_ = tf.placeholder(tf.float32,
                                   shape=[opt.batch_size, opt.maxlen])
        x_mask_2_ = tf.placeholder(tf.float32,
                                   shape=[opt.batch_size, opt.maxlen])
        y_ = tf.placeholder(tf.float32, shape=[opt.batch_size, opt.category])
        keep_prob = tf.placeholder(tf.float32)
        # auto_encoder builds the model and hands back the tensors that are
        # fetched through sess.run below.
        accuracy_, loss_, train_op_, W_emb, logits_ = auto_encoder(
            x_1_, x_2_, x_mask_1_, x_mask_2_, y_, keep_prob, opt)
        merged = tf.summary.merge_all()

    def do_eval(sess, sents_q, sents_a, labels):
        """Score the model on one split, batch by batch.

        Returns ``(mean_loss, mean_accuracy, f1, precision, recall,
        weights_label)``; loss and accuracy are averaged over batches and
        ``weights_label`` is whatever ``compute_labels_weights`` accumulated.

        Raises ``ValueError`` if ``get_minibatches_idx`` yields no batch.
        """
        total_loss, total_acc, n_batches = 0.0, 0.0, 0
        tp, fp, tn, fn = 0, 0, 0, 0
        weights_label = {}  # weights_label[label_index] = (number, correct)
        weights = np.ones((opt.batch_size))  # uniform weights at eval time
        batches = get_minibatches_idx(len(sents_q), opt.batch_size,
                                      shuffle=True)
        for _, index in batches:
            labels_array = np.array([labels[t] for t in index])
            # Row lookup in the identity matrix turns class ids into
            # one-hot rows.
            labels_one_hot = np.eye(opt.category)[labels_array]
            x_batch_1, x_mask_1 = prepare_data_for_emb(
                [sents_q[t] for t in index], opt)
            x_batch_2, x_mask_2 = prepare_data_for_emb(
                [sents_a[t] for t in index], opt)
            curr_loss, curr_acc, logits = sess.run(
                [loss_, accuracy_, logits_],
                feed_dict={
                    x_1_: x_batch_1,
                    x_2_: x_batch_2,
                    x_mask_1_: x_mask_1,
                    x_mask_2_: x_mask_2,
                    y_: labels_one_hot,
                    opt.weights_label: weights,
                    keep_prob: 1.0
                })
            b_tp, b_fp, b_tn, b_fn = compute_confuse_matrix(
                logits, labels_one_hot)
            total_loss += curr_loss
            total_acc += curr_acc
            n_batches += 1
            weights_label = compute_labels_weights(weights_label, logits,
                                                   labels_array)
            tp, fp = tp + b_tp, fp + b_fp
            tn, fn = tn + b_tn, fn + b_fn

        print("true_positive:", tp, ";false_positive:", fp,
              ";true_negative:", tn, ";false_negative:", fn)
        if not n_batches:
            raise ValueError("do_eval received no batches to evaluate")

        # Guard every ratio: a split without (predicted) positives would
        # otherwise raise ZeroDivisionError and abort training.
        predicted_pos = float(tp + fp)
        actual_pos = float(tp + fn)
        p = float(tp) / predicted_pos if predicted_pos else 0.0
        r = float(tp) / actual_pos if actual_pos else 0.0
        f1_score = (2 * p * r) / (p + r) if (p + r) else 0.0
        print("eval_counter:", n_batches, ";eval_acc:", total_acc)
        return (total_loss / float(n_batches), total_acc / float(n_batches),
                f1_score, p, r, weights_label)

    max_test_accuracy = 0.
    weights_dict = init_weights_dict(vocabulary_label2index)

    config = tf.ConfigProto(log_device_placement=False,
                            allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    np.set_printoptions(precision=3)
    np.set_printoptions(threshold=np.inf)
    saver = tf.train.Saver()

    with tf.Session(config=config) as sess:
        train_writer = tf.summary.FileWriter(opt.log_path + '/train',
                                             sess.graph)
        test_writer = tf.summary.FileWriter(opt.log_path + '/test',
                                            sess.graph)
        sess.run(tf.global_variables_initializer())

        if opt.restore:
            # Restore only variables whose name and shape both match the
            # checkpoint; everything else keeps its fresh initialisation.
            try:
                t_vars = tf.trainable_variables()
                save_keys = tensors_key_in_file(opt.save_path)
                ss = set([var.name for var in t_vars]) & set(
                    [s + ":0" for s in save_keys.keys()])
                cc = {var.name: var for var in t_vars}
                ss_right_shape = set([
                    s for s in ss
                    if cc[s].get_shape() == save_keys[s[:-2]]
                ])
                loader = tf.train.Saver(var_list=[
                    var for var in t_vars if var.name in ss_right_shape
                ])
                loader.restore(sess, opt.save_path)
                print("Loading variables from '%s'." % opt.save_path)
                # Report what was really restored, not every name match.
                print("Loaded variables:" + str(ss_right_shape))
            except Exception as err:  # not bare: Ctrl-C must still abort
                print("No saving session, using random initialization")
                print("Restore failed: %r" % (err,))
                sess.run(tf.global_variables_initializer())

        try:
            best_acc = 0
            best_f1_score = 0
            for epoch in range(opt.max_epochs):
                print("Starting epoch %d" % epoch)
                loss, acc, uidx = 0.0, 0.0, 0
                kf = get_minibatches_idx(len(train_q), opt.batch_size,
                                         shuffle=True)
                for _, train_index in kf:
                    uidx += 1
                    sents_1 = [train_q[t] for t in train_index]
                    sents_2 = [train_a[t] for t in train_index]
                    x_labels_array = np.array(
                        [train_lab[t] for t in train_index])
                    # One-hot encode the class ids.
                    x_labels = np.eye(opt.category)[x_labels_array]
                    x_batch_1, x_batch_mask_1 = prepare_data_for_emb(
                        sents_1, opt)
                    x_batch_2, x_batch_mask_2 = prepare_data_for_emb(
                        sents_2, opt)
                    weights = get_weights_for_current_batch(
                        list(x_labels_array), weights_dict)
                    _, curr_loss, curr_accuracy = sess.run(
                        [train_op_, loss_, accuracy_],
                        feed_dict={
                            x_1_: x_batch_1,
                            x_2_: x_batch_2,
                            x_mask_1_: x_batch_mask_1,
                            x_mask_2_: x_batch_mask_2,
                            y_: x_labels,
                            opt.weights_label: weights,
                            keep_prob: opt.dropout_ratio
                        })
                    loss, acc = loss + curr_loss, acc + curr_accuracy
                    if uidx % 100 == 0:
                        print(
                            "Epoch %d\tBatch %d\tTrain Loss:%.3f\tAcc:%.3f\t"
                            % (epoch, uidx, loss / float(uidx),
                               acc / float(uidx)))

                # Score the held-out validation split after every epoch.
                # The previous revision passed the training split here,
                # which made the reported numbers and the checkpoint
                # selection measure training fit.
                (eval_loss, eval_accc, f1_scoree, precision, recall,
                 weights_label) = do_eval(sess, val_q, val_a, val_lab)
                weights_dict = get_weights_label_as_standard_dict(
                    weights_label)
                print(
                    "【Validation】Epoch %d\t Loss:%.3f\tAcc %.3f\tF1 Score:%.3f\tPrecision:%.3f\tRecall:%.3f"
                    % (epoch, eval_loss, eval_accc, f1_scoree, precision,
                       recall))

                # Checkpoint only when accuracy and F1 both improve.
                if eval_accc > best_acc and f1_scoree > best_f1_score:
                    save_path = opt.ckpt_dir + "/model.ckpt"
                    print("going to save model. eval_f1_score:", f1_scoree,
                          ";previous best f1 score:", best_f1_score,
                          ";eval_acc", str(eval_accc), ";previous best_acc:",
                          str(best_acc))
                    saver.save(sess, save_path, global_step=epoch)
                    best_acc = eval_accc
                    best_f1_score = f1_scoree

            (test_loss, acc_t, f1_score_t, precision, recall,
             weights_label) = do_eval(sess, test_q, test_a, test_lab)
            max_test_accuracy = acc_t
            print(
                "Test Loss:%.3f\tAcc:%.3f\tF1 Score:%.3f\tPrecision:%.3f\tRecall:%.3f:"
                % (test_loss, acc_t, f1_score_t, precision, recall))
        except KeyboardInterrupt:
            print('Training interupted')
            print("Max Test accuracy %f " % max_test_accuracy)
def main():
    """Train the sentence-pair classifier on the pickled SNLI data.

    Loads ``./data/snli.p``, optionally subsamples the training set, builds
    the graph through ``auto_encoder`` and trains for ``opt.max_epochs``
    epochs.  Every ``opt.valid_freq`` updates it reports accuracy on a
    training sample and on the validation split; the test split is scored
    only when validation accuracy reaches a new maximum.

    Changes compared with the previous revision:
      * the pickle file is closed after loading;
      * the training sample is capped at the training-set size (it used a
        fixed 3070, which indexes out of range on a smaller set);
      * the three copy-pasted evaluation loops share one helper;
      * a failed restore no longer swallows ``KeyboardInterrupt``.
    """
    loadpath = "./data/snli.p"
    with open(loadpath, "rb") as f:
        x = cPickle.load(f)

    train, val, test = x[0], x[1], x[2]
    ixtoword = x[5]

    train_q, train_a, train_lab = train[0], train[1], train[2]
    val_q, val_a, val_lab = val[0], val[1], val[2]
    test_q, test_a, test_lab = test[0], test[1], test[2]

    train_lab = np.array(train_lab, dtype='float32')
    val_lab = np.array(val_lab, dtype='float32')
    test_lab = np.array(test_lab, dtype='float32')

    opt = Options()
    opt.n_words = len(ixtoword)

    del x

    print(dict(opt))
    print('Total words: %d' % opt.n_words)

    # Optionally keep a fixed-seed random fraction of the training set.
    if opt.part_data:
        np.random.seed(123)
        train_ind = np.random.choice(len(train_q),
                                     int(len(train_q) * opt.portion),
                                     replace=False)
        train_q = [train_q[t] for t in train_ind]
        train_a = [train_a[t] for t in train_ind]
        train_lab = [train_lab[t] for t in train_ind]

    # Number of training examples used for the periodic train-accuracy
    # estimate; never larger than the (possibly subsampled) training set.
    train_sample = min(3070, len(train_q))

    try:
        params = np.load('./data/snli_emb.p')
        if params[0].shape == (opt.n_words, opt.embed_size):
            print('Use saved embedding.')
            opt.W_emb = np.array(params[0], dtype='float32')
        else:
            print('Emb Dimension mismatch: param_g.npz:' +
                  str(params[0].shape) + ' opt: ' + str(
                      (opt.n_words, opt.embed_size)))
            opt.fix_emb = False
    except IOError:
        print('No embedding file found.')
        opt.fix_emb = False

    with tf.device('/gpu:1'):
        x_1_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.maxlen])
        x_2_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.maxlen])
        x_mask_1_ = tf.placeholder(tf.float32,
                                   shape=[opt.batch_size, opt.maxlen])
        x_mask_2_ = tf.placeholder(tf.float32,
                                   shape=[opt.batch_size, opt.maxlen])
        y_ = tf.placeholder(tf.float32, shape=[opt.batch_size, opt.category])
        keep_prob = tf.placeholder(tf.float32)
        accuracy_, loss_, train_op_, W_emb_ = auto_encoder(
            x_1_, x_2_, x_mask_1_, x_mask_2_, y_, keep_prob, opt)
        merged = tf.summary.merge_all()

    def accuracy_on(sess, sents_q, sents_a, labels, n_examples):
        """Return accuracy over the batches drawn for ``n_examples`` items.

        Batch accuracies are weighted by batch length and divided by
        ``n_examples``.  Dropout is disabled (``keep_prob`` = 1.0).
        Returns 0.0 when ``n_examples`` is not positive.
        """
        if n_examples <= 0:
            return 0.0
        correct = 0.0
        batches = get_minibatches_idx(n_examples, opt.batch_size,
                                      shuffle=True)
        for _, index in batches:
            batch_labels = np.array([labels[t] for t in index])
            batch_labels = batch_labels.reshape(
                (len(batch_labels), opt.category))
            batch_1, mask_1 = prepare_data_for_emb(
                [sents_q[t] for t in index], opt)
            batch_2, mask_2 = prepare_data_for_emb(
                [sents_a[t] for t in index], opt)
            batch_accuracy = sess.run(accuracy_,
                                      feed_dict={
                                          x_1_: batch_1,
                                          x_2_: batch_2,
                                          x_mask_1_: mask_1,
                                          x_mask_2_: mask_2,
                                          y_: batch_labels,
                                          keep_prob: 1.0
                                      })
            correct += batch_accuracy * len(index)
        return correct / n_examples

    uidx = 0
    max_val_accuracy = 0.
    max_test_accuracy = 0.

    config = tf.ConfigProto(log_device_placement=False,
                            allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    np.set_printoptions(precision=3)
    np.set_printoptions(threshold=np.inf)
    saver = tf.train.Saver()

    with tf.Session(config=config) as sess:
        train_writer = tf.summary.FileWriter(opt.log_path + '/train',
                                             sess.graph)
        test_writer = tf.summary.FileWriter(opt.log_path + '/test',
                                            sess.graph)
        sess.run(tf.global_variables_initializer())

        if opt.restore:
            # Restore only variables whose name and shape both match the
            # checkpoint; everything else keeps its fresh initialisation.
            try:
                t_vars = tf.trainable_variables()
                save_keys = tensors_key_in_file(opt.save_path)
                ss = set([var.name for var in t_vars]) & set(
                    [s + ":0" for s in save_keys.keys()])
                cc = {var.name: var for var in t_vars}
                ss_right_shape = set([
                    s for s in ss
                    if cc[s].get_shape() == save_keys[s[:-2]]
                ])
                loader = tf.train.Saver(var_list=[
                    var for var in t_vars if var.name in ss_right_shape
                ])
                loader.restore(sess, opt.save_path)
                print("Loading variables from '%s'." % opt.save_path)
                # Report what was really restored, not every name match.
                print("Loaded variables:" + str(ss_right_shape))
            except Exception as err:  # not bare: Ctrl-C must still abort
                print("No saving session, using random initialization")
                print("Restore failed: %r" % (err,))
                sess.run(tf.global_variables_initializer())

        try:
            for epoch in range(opt.max_epochs):
                print("Starting epoch %d" % epoch)
                kf = get_minibatches_idx(len(train_q), opt.batch_size,
                                         shuffle=True)
                for _, train_index in kf:
                    uidx += 1
                    sents_1 = [train_q[t] for t in train_index]
                    sents_2 = [train_a[t] for t in train_index]
                    x_labels = np.array([train_lab[t] for t in train_index])
                    x_labels = x_labels.reshape(
                        (len(x_labels), opt.category))

                    x_batch_1, x_batch_mask_1 = prepare_data_for_emb(
                        sents_1, opt)
                    x_batch_2, x_batch_mask_2 = prepare_data_for_emb(
                        sents_2, opt)

                    sess.run(
                        [train_op_, loss_],
                        feed_dict={
                            x_1_: x_batch_1,
                            x_2_: x_batch_2,
                            x_mask_1_: x_batch_mask_1,
                            x_mask_2_: x_batch_mask_2,
                            y_: x_labels,
                            keep_prob: opt.dropout_ratio
                        })

                    if uidx % opt.valid_freq == 0:
                        train_accuracy = accuracy_on(
                            sess, train_q, train_a, train_lab, train_sample)
                        print("Train accuracy %f " % train_accuracy)

                        val_accuracy = accuracy_on(sess, val_q, val_a,
                                                   val_lab, len(val_q))
                        print("Validation accuracy %f " % val_accuracy)

                        # Touch the test split only on a new validation
                        # best, so the reported test figure belongs to the
                        # model selected on validation data.
                        if val_accuracy > max_val_accuracy:
                            max_val_accuracy = val_accuracy
                            test_accuracy = accuracy_on(
                                sess, test_q, test_a, test_lab, len(test_q))
                            print("Test accuracy %f " % test_accuracy)
                            max_test_accuracy = test_accuracy

                print("Epoch %d: Max Test accuracy %f" %
                      (epoch, max_test_accuracy))

            print("Max Test accuracy %f " % max_test_accuracy)

        except KeyboardInterrupt:
            print('Training interupted')
            print("Max Test accuracy %f " % max_test_accuracy)
def main():
    """Run SNLI training with periodic train/validation/test accuracy.

    The data come from ``./data/snli.p``; the model tensors come from
    ``auto_encoder``.  Training runs for ``opt.max_epochs`` epochs and, every
    ``opt.valid_freq`` updates, prints accuracy on a training sample and on
    the validation split.  When validation accuracy sets a new maximum the
    test split is scored and remembered as the "max test accuracy".

    Changes compared with the previous revision:
      * the pickle file is closed after loading;
      * the per-minibatch debug print of the label shape is gone;
      * the training sample is capped at the training-set size instead of a
        fixed 3070 (which fails on a smaller, e.g. subsampled, set);
      * one helper replaces three duplicated evaluation loops;
      * a failed restore no longer swallows ``KeyboardInterrupt``.
    """
    loadpath = "./data/snli.p"
    with open(loadpath, "rb") as handle:
        x = cPickle.load(handle)

    train, val, test = x[0], x[1], x[2]
    ixtoword = x[5]

    train_q, train_a, train_lab = train[0], train[1], train[2]
    val_q, val_a, val_lab = val[0], val[1], val[2]
    test_q, test_a, test_lab = test[0], test[1], test[2]

    train_lab = np.array(train_lab, dtype='float32')
    val_lab = np.array(val_lab, dtype='float32')
    test_lab = np.array(test_lab, dtype='float32')

    opt = Options()
    opt.n_words = len(ixtoword)

    del x

    print(dict(opt))
    print('Total words: %d' % opt.n_words)

    # part_data: train on a random subset only; opt.portion is the fraction
    # of the training set that is kept (fixed seed for repeatability).
    if opt.part_data:
        np.random.seed(123)
        train_ind = np.random.choice(len(train_q),
                                     int(len(train_q) * opt.portion),
                                     replace=False)
        train_q = [train_q[t] for t in train_ind]
        train_a = [train_a[t] for t in train_ind]
        train_lab = [train_lab[t] for t in train_ind]

    # Size of the slice used for the periodic train-accuracy estimate,
    # bounded by the number of training examples actually available.
    n_train_eval = min(3070, len(train_q))

    # Check that the pre-computed embedding matches the vocabulary size and
    # embedding width before using it.
    try:
        params = np.load('./data/snli_emb.p')
        if params[0].shape == (opt.n_words, opt.embed_size):
            print('Use saved embedding.')
            opt.W_emb = np.array(params[0], dtype='float32')
        else:
            print('Emb Dimension mismatch: param_g.npz:' +
                  str(params[0].shape) + ' opt: ' + str(
                      (opt.n_words, opt.embed_size)))
            opt.fix_emb = False
    except IOError:
        print('No embedding file found.')
        opt.fix_emb = False

    with tf.device('/gpu:1'):
        # Each example is a sentence pair, so the input and mask
        # placeholders are declared in pairs.
        x_1_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.maxlen])
        x_2_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.maxlen])
        x_mask_1_ = tf.placeholder(tf.float32,
                                   shape=[opt.batch_size, opt.maxlen])
        x_mask_2_ = tf.placeholder(tf.float32,
                                   shape=[opt.batch_size, opt.maxlen])
        y_ = tf.placeholder(tf.float32, shape=[opt.batch_size, opt.category])
        keep_prob = tf.placeholder(tf.float32)
        # auto_encoder wraps the model definition and returns the tensors
        # that are later passed to sess.run.
        accuracy_, loss_, train_op_, W_emb_ = auto_encoder(
            x_1_, x_2_, x_mask_1_, x_mask_2_, y_, keep_prob, opt)
        merged = tf.summary.merge_all()

    def split_accuracy(sess, first, second, labels, count):
        """Return accuracy on ``count`` examples of a sentence-pair split.

        Each batch accuracy is weighted by the batch length; the sum is
        divided by ``count``.  ``keep_prob`` is fed as 1.0.  A non-positive
        ``count`` gives 0.0.
        """
        if count <= 0:
            return 0.0
        hits = 0.0
        for _, idx in get_minibatches_idx(count, opt.batch_size,
                                          shuffle=True):
            y_batch = np.array([labels[t] for t in idx])
            # NOTE(review): this reshape only succeeds if every label is
            # already a row of opt.category values - confirm against the
            # data file.
            y_batch = y_batch.reshape((len(y_batch), opt.category))
            xb_1, xm_1 = prepare_data_for_emb([first[t] for t in idx], opt)
            xb_2, xm_2 = prepare_data_for_emb([second[t] for t in idx], opt)
            acc = sess.run(accuracy_,
                           feed_dict={
                               x_1_: xb_1,
                               x_2_: xb_2,
                               x_mask_1_: xm_1,
                               x_mask_2_: xm_2,
                               y_: y_batch,
                               keep_prob: 1.0
                           })
            hits += acc * len(idx)
        return hits / count

    uidx = 0
    max_val_accuracy = 0.
    max_test_accuracy = 0.

    config = tf.ConfigProto(log_device_placement=False,
                            allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    np.set_printoptions(precision=3)
    np.set_printoptions(threshold=np.inf)
    saver = tf.train.Saver()

    with tf.Session(config=config) as sess:
        train_writer = tf.summary.FileWriter(opt.log_path + '/train',
                                             sess.graph)
        test_writer = tf.summary.FileWriter(opt.log_path + '/test',
                                            sess.graph)
        sess.run(tf.global_variables_initializer())

        if opt.restore:
            # Start from saved parameters: restore every trainable variable
            # whose name and shape match the checkpoint.
            try:
                t_vars = tf.trainable_variables()
                save_keys = tensors_key_in_file(opt.save_path)
                ss = set([var.name for var in t_vars]) & set(
                    [s + ":0" for s in save_keys.keys()])
                cc = {var.name: var for var in t_vars}
                ss_right_shape = set([
                    s for s in ss
                    if cc[s].get_shape() == save_keys[s[:-2]]
                ])
                loader = tf.train.Saver(var_list=[
                    var for var in t_vars if var.name in ss_right_shape
                ])
                loader.restore(sess, opt.save_path)
                print("Loading variables from '%s'." % opt.save_path)
                # Report what was really restored, not every name match.
                print("Loaded variables:" + str(ss_right_shape))
            except Exception as err:  # not bare: Ctrl-C must still abort
                print("No saving session, using random initialization")
                print("Restore failed: %r" % (err,))
                sess.run(tf.global_variables_initializer())

        try:
            for epoch in range(opt.max_epochs):
                print("Starting epoch %d" % epoch)
                # Draw shuffled minibatch index lists for this epoch.
                kf = get_minibatches_idx(len(train_q), opt.batch_size,
                                         shuffle=True)
                for _, train_index in kf:
                    uidx += 1
                    # Look the sampled indices up in the full training set.
                    sents_1 = [train_q[t] for t in train_index]
                    sents_2 = [train_a[t] for t in train_index]
                    x_labels = np.array([train_lab[t] for t in train_index])
                    x_labels = x_labels.reshape(
                        (len(x_labels), opt.category))

                    # NOTE(review): prepare_data_for_emb presumably pads the
                    # sentences to opt.maxlen and builds the matching masks -
                    # confirm in its definition.
                    x_batch_1, x_batch_mask_1 = prepare_data_for_emb(
                        sents_1, opt)
                    x_batch_2, x_batch_mask_2 = prepare_data_for_emb(
                        sents_2, opt)

                    sess.run(
                        [train_op_, loss_],
                        feed_dict={
                            x_1_: x_batch_1,
                            x_2_: x_batch_2,
                            x_mask_1_: x_batch_mask_1,
                            x_mask_2_: x_batch_mask_2,
                            y_: x_labels,
                            keep_prob: opt.dropout_ratio
                        })

                    # Every valid_freq updates: measure accuracy on the
                    # training sample and the validation split, and refresh
                    # the recorded test accuracy on a new validation best.
                    if uidx % opt.valid_freq == 0:
                        train_accuracy = split_accuracy(
                            sess, train_q, train_a, train_lab, n_train_eval)
                        print("Train accuracy %f " % train_accuracy)

                        val_accuracy = split_accuracy(
                            sess, val_q, val_a, val_lab, len(val_q))
                        print("Validation accuracy %f " % val_accuracy)

                        if val_accuracy > max_val_accuracy:
                            max_val_accuracy = val_accuracy
                            test_accuracy = split_accuracy(
                                sess, test_q, test_a, test_lab, len(test_q))
                            print("Test accuracy %f " % test_accuracy)
                            max_test_accuracy = test_accuracy

                print("Epoch %d: Max Test accuracy %f" %
                      (epoch, max_test_accuracy))

            print("Max Test accuracy %f " % max_test_accuracy)

        except KeyboardInterrupt:
            print('Training interupted')
            print("Max Test accuracy %f " % max_test_accuracy)
def main():
    """Train the G-weighted text classifier and log results to CSV.

    Picks data paths and class names from ``opt.dataset``, loads the
    sentences, labels, vocabulary and the pre-computed ``G`` weights, builds
    the graph through ``emb_classifier`` and trains for ``opt.max_epochs``
    epochs.  Every ``opt.valid_freq`` updates it measures accuracy on a
    training sample, the validation split and the test split, appending the
    numbers to ``<dataset>_Train_message.csv`` and
    ``<dataset>_Classification_Results.csv``.  A checkpoint is saved after
    each epoch.

    Raises:
        ValueError: if ``opt.dataset`` is not one of the known datasets.

    Changes compared with the previous revision:
      * ``G_train`` is subsampled together with ``train`` / ``train_lab``
        when ``opt.part_data`` is set (it used to stay full-size, so each
        sentence was paired with another sentence's weights);
      * ``test_acc`` is initialised, so the final summary and the interrupt
        handler cannot hit an unbound name;
      * an unknown dataset name is reported explicitly;
      * pickle files are closed after loading;
      * the training sample is capped at the training-set size instead of a
        fixed 500;
      * the duplicated CSV and evaluation code lives in local helpers.
    """
    opt = Options()

    # dataset name -> (data pickle, embedding pickle, G pickle, class names)
    datasets = {
        'yahoo': (
            "/home/dell/PycharmProjects/NLP/Idea-1/New_leam_dataset/yahoo.p",
            "/home/dell/PycharmProjects/NLP/Idea-1/New_leam_dataset/yahoo_glove.p",
            '/home/dell/PycharmProjects/NLP/Idea-1/Results/yahoo/08/cnn/yahoo_G.pickle',
            ['Society Culture', 'Science Mathematics', 'Health',
             'Education Reference', 'Computers Internet', 'Sports',
             'Business Finance', 'Entertainment Music',
             'Family Relationships', 'Politics Government'],
        ),
        'agnews': (
            "/home/dell/PycharmProjects/NLP/Idea-1/New_leam_dataset/ag_news.pickle",
            "/home/dell/PycharmProjects/NLP/Idea-1/New_leam_dataset/ag_news_glove.pickle",
            '/home/dell/PycharmProjects/NLP/Idea-1/Results/ag_news/08/cnn/ag_news_G.pickle',
            ['World', 'Sports', 'Business', 'Science'],
        ),
        'dbpedia': (
            "/home/dell/PycharmProjects/NLP/Idea-1/New_leam_dataset/dbpedia.pickle",
            "/home/dell/PycharmProjects/NLP/Idea-1/New_leam_dataset/dbpedia_glove.pickle",
            '/home/dell/PycharmProjects/NLP/Idea-1/Results/dbpedia/08/cnn/dbpedia_G.pickle',
            ['Company', 'Educational Institution', 'Artist', 'Athlete',
             'Office Holder', 'Mean Of Transportation', 'Building',
             'Natural Place', 'Village', 'Animal', 'Plant', 'Album', 'Film',
             'Written Work'],
        ),
        'yelp_full': (
            "/home/dell/PycharmProjects/NLP/Idea-1/New_leam_dataset/yelp_full.pickle",
            "/home/dell/PycharmProjects/NLP/Idea-1/New_leam_dataset/yelp_full_glove.pickle",
            '/home/dell/PycharmProjects/NLP/Idea-1/Results/yelp_full/08/cnn/yelp_full_G.pickle',
            ['worst', 'bad', 'middle', 'good', 'best'],
        ),
        'yelp': (
            "/home/dell/PycharmProjects/NLP/Idea-1/New_leam_dataset/yelp.pickle",
            "/home/dell/PycharmProjects/NLP/Idea-1/New_leam_dataset/yelp_glove.pickle",
            '/home/dell/PycharmProjects/NLP/Idea-1/Results/yelp/08/cnn/yelp_G.pickle',
            ['bad', 'good'],
        ),
    }
    if opt.dataset not in datasets:
        raise ValueError("Unknown dataset %r; expected one of %s" %
                         (opt.dataset, sorted(datasets)))
    loadpath, embpath, load_G_path, class_name = datasets[opt.dataset]
    opt.class_name = class_name
    opt.num_class = len(class_name)

    with open(loadpath, "rb") as f:
        x = pickle.load(f)
    # Sentences are stored as word-id sequences (already tokenised, not yet
    # padded to a common length); labels are stored one-hot.
    train, val, test = x[0], x[1], x[2]
    train_lab, val_lab, test_lab = x[3], x[4], x[5]
    wordtoix, ixtoword = x[6], x[7]
    # Pre-computed G weights, one entry per sentence of each split.
    with open(load_G_path, "rb") as f:
        G_train, G_val, G_test = pickle.load(f)
    del x
    print("load data finished")

    train_lab = np.array(train_lab, dtype='float32')
    val_lab = np.array(val_lab, dtype='float32')
    test_lab = np.array(test_lab, dtype='float32')
    opt.n_words = len(ixtoword)

    if opt.part_data:
        train_ind = np.random.choice(len(train),
                                     int(len(train) * opt.portion),
                                     replace=False)
        train = [train[t] for t in train_ind]
        train_lab = [train_lab[t] for t in train_ind]
        # Keep the G weights aligned with the subsampled sentences.
        G_train = [G_train[t] for t in train_ind]

    # Number of training examples used for the periodic train-accuracy
    # estimate; never larger than the (possibly subsampled) training set.
    train_sample = min(500, len(train))

    os.environ['CUDA_VISIBLE_DEVICES'] = str(opt.GPUID)

    print(dict(opt))
    print('Total words: %d' % opt.n_words)

    try:
        with open(embpath, 'rb') as f:
            opt.W_emb = np.array(pickle.load(f), dtype='float32')
        opt.W_class_emb = load_class_embedding(wordtoix, opt)
    except IOError:
        print('No embedding file found.')
        opt.fix_emb = False

    with tf.device('/gpu:1'):
        x_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.maxlen],
                            name='x_')
        x_mask_ = tf.placeholder(tf.float32,
                                 shape=[opt.batch_size, opt.maxlen],
                                 name='x_mask_')
        keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        y_ = tf.placeholder(tf.float32,
                            shape=[opt.batch_size, opt.num_class],
                            name='y_')
        class_penalty_ = tf.placeholder(tf.float32, shape=())
        G_our = tf.placeholder(
            tf.float32,
            shape=[opt.batch_size, opt.maxlen, opt.num_class],
            name='G_our')
        seq_len = tf.placeholder(tf.int32, shape=[opt.batch_size],
                                 name='sque_sentence_num')
        accuracy_, loss_, train_op, W_norm_, global_step = emb_classifier(
            x_, x_mask_, y_, keep_prob, opt, class_penalty_, G_our, seq_len)

    def accuracy_on(sess, sents, g_weights, labels, n_examples):
        """Return accuracy over the batches drawn for ``n_examples`` items.

        Batch accuracies are weighted by batch length and divided by
        ``n_examples``.  Dropout and the class penalty are switched off.
        Returns 0.0 when ``n_examples`` is not positive.
        """
        if n_examples <= 0:
            return 0.0
        correct = 0.0
        batches = get_minibatches_idx(n_examples, opt.batch_size,
                                      shuffle=True)
        for _, index in batches:
            batch_labels = np.array([labels[t] for t in index])
            batch_labels = batch_labels.reshape(
                (len(batch_labels), opt.num_class))
            x_batch, x_batch_mask, g_batch, len_batch = prepare_data_for_emb(
                [sents[t] for t in index],
                [g_weights[t] for t in index], opt)
            batch_accuracy = sess.run(accuracy_,
                                      feed_dict={
                                          x_: x_batch,
                                          x_mask_: x_batch_mask,
                                          y_: batch_labels,
                                          keep_prob: 1.0,
                                          class_penalty_: 0.0,
                                          G_our: g_batch,
                                          seq_len: len_batch
                                      })
            correct += batch_accuracy * len(index)
        return correct / n_examples

    def append_csv_rows(path, rows, header=None):
        """Append ``rows`` to ``path``; write ``header`` first on creation."""
        write_header = header is not None and not os.path.exists(path)
        with open(path, 'a', newline='') as out:
            writer = csv.writer(out, dialect='excel')
            if write_header:
                writer.writerow(header)
            for row in rows:
                writer.writerow(row)

    train_log = opt.dataset + '_Train_message.csv'
    results_log = opt.dataset + '_Classification_Results.csv'

    def write_summary():
        """Append the best-accuracy summary rows to the results CSV."""
        append_csv_rows(results_log, [
            ['Max Test accuracy:', max_test_accuracy, 'val accuracy',
             val_acc],
            ['Max val accuracy:', max_val_accuracy, 'test accuracy',
             test_acc],
        ])

    uidx = 0
    max_val_accuracy = 0.
    max_test_accuracy = 0.
    val_acc = 0.   # validation accuracy at the best test accuracy
    test_acc = 0.  # test accuracy at the best validation accuracy

    config = tf.ConfigProto(log_device_placement=False,
                            allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    np.set_printoptions(precision=3)
    np.set_printoptions(threshold=np.inf)
    saver = tf.train.Saver()

    with tf.Session(config=config) as sess:
        train_writer = tf.summary.FileWriter(opt.log_path + '/train',
                                             sess.graph)
        test_writer = tf.summary.FileWriter(opt.log_path + '/test',
                                            sess.graph)
        sess.run(tf.global_variables_initializer())

        if opt.restore:
            # Restore only variables whose name and shape both match the
            # checkpoint; everything else keeps its fresh initialisation.
            try:
                t_vars = tf.trainable_variables()
                save_keys = tensors_key_in_file(opt.save_path)
                ss = set([var.name for var in t_vars]) & set(
                    [s + ":0" for s in save_keys.keys()])
                cc = {var.name: var for var in t_vars}
                ss_right_shape = set([
                    s for s in ss
                    if cc[s].get_shape() == save_keys[s[:-2]]
                ])
                loader = tf.train.Saver(var_list=[
                    var for var in t_vars if var.name in ss_right_shape
                ])
                loader.restore(sess, opt.save_path)
                print("Loading variables from '%s'." % opt.save_path)
                # Report what was really restored, not every name match.
                print("Loaded variables:" + str(ss_right_shape))
            except Exception as err:  # not bare: Ctrl-C must still abort
                print("No saving session, using random initialization")
                print("Restore failed: %r" % (err,))
                sess.run(tf.global_variables_initializer())

        try:
            for epoch in range(opt.max_epochs):
                print("Starting epoch %d" % epoch)
                kf = get_minibatches_idx(len(train), opt.batch_size,
                                         shuffle=True)
                for _, train_index in kf:
                    uidx += 1
                    sents = [train[t] for t in train_index]
                    G1 = [G_train[t] for t in train_index]
                    x_labels = np.array([train_lab[t] for t in train_index])
                    x_labels = x_labels.reshape(
                        (len(x_labels), opt.num_class))

                    (x_batch, x_batch_mask, G_batch,
                     seq_len_batch) = prepare_data_for_emb(sents, G1, opt)
                    _, loss, _ = sess.run(
                        [train_op, loss_, global_step],
                        feed_dict={
                            x_: x_batch,
                            x_mask_: x_batch_mask,
                            y_: x_labels,
                            keep_prob: opt.dropout,
                            class_penalty_: opt.class_penalty,
                            G_our: G_batch,
                            seq_len: seq_len_batch
                        })

                    if uidx % opt.valid_freq == 0:
                        # Cheap training-accuracy estimate on a sample.
                        train_accuracy = accuracy_on(
                            sess, train, G_train, train_lab, train_sample)
                        print("Iteration %d: Training loss %f " %
                              (uidx, loss))
                        print("Train accuracy %f " % train_accuracy)
                        append_csv_rows(
                            train_log,
                            [[epoch, loss, train_accuracy]],
                            header=["epoch", "Training loss",
                                    "Train accuracy"])

                        val_accuracy = accuracy_on(sess, val, G_val, val_lab,
                                                   len(val))
                        print("Validation accuracy %f " % val_accuracy)

                        test_accuracy = accuracy_on(sess, test, G_test,
                                                    test_lab, len(test))
                        print("Test accuracy %f " % test_accuracy)

                        # Track both "best test" and "best validation",
                        # each with the other split's accuracy at that
                        # moment.
                        if test_accuracy > max_test_accuracy:
                            max_test_accuracy = test_accuracy
                            val_acc = val_accuracy
                        if val_accuracy > max_val_accuracy:
                            max_val_accuracy = val_accuracy
                            test_acc = test_accuracy

                        append_csv_rows(
                            results_log,
                            [[epoch, val_accuracy, test_accuracy]],
                            header=["epoch", "val_accuracy",
                                    "test_accuracy"])

                print("Epoch %d: Max Test accuracy %f" %
                      (epoch, max_test_accuracy))
                saver.save(sess, opt.save_path, global_step=epoch)

            print("Max Test accuracy %f , val accuracy %f " %
                  (max_test_accuracy, val_acc))
            print("Max val accuracy %f , test accuracy %f" %
                  (max_val_accuracy, test_acc))
            write_summary()

        except KeyboardInterrupt:
            print('Training interupted')
            print("Max Test accuracy %f " % max_test_accuracy)
            write_summary()
def main():
    """Train and evaluate the TensorFlow embedding classifier.

    Loads the pickled dataset from ``./yahoo4char.p`` and the embedding
    matrix from ``./yahoo_glove.p``, builds the graph with
    ``emb_classifier`` and trains for ``opt.max_epochs`` epochs.

    Every ``opt.valid_freq`` updates the function prints the accuracy on a
    training sample (at most 500 examples) and on the validation split.  The
    test split is only scored when the validation accuracy reaches a new
    maximum, so the reported "Max Test accuracy" is the test accuracy at the
    best validation point.  A checkpoint is written to ``opt.save_path``
    after every epoch.  Ctrl-C stops training and prints the last recorded
    test accuracy.
    """
    loadpath = "./yahoo4char.p"
    embpath = "./yahoo_glove.p"
    # Context managers close the file handles that a bare
    # pickle.load(open(...)) would leave open.
    with open(loadpath, "rb") as data_file:
        x = pickle.load(data_file)
    train, val, test = x[0], x[1], x[2]
    train_lab, val_lab, test_lab = x[3], x[4], x[5]
    wordtoix, ixtoword = x[6], x[7]
    del x
    print("load data finished")

    train_lab = np.array(train_lab, dtype='float32')
    val_lab = np.array(val_lab, dtype='float32')
    test_lab = np.array(test_lab, dtype='float32')

    opt = Options()
    opt.num_class = 10
    opt.class_name = ['Society Culture',
                      'Science Mathematics',
                      'Health',
                      'Education Reference',
                      'Computers Internet',
                      'Sports',
                      'Business Finance',
                      'Entertainment Music',
                      'Family Relationships',
                      'Politics Government']
    opt.n_words = len(ixtoword)
    # os.environ['CUDA_VISIBLE_DEVICES'] = str(opt.GPUID)
    with open(embpath, 'rb') as emb_file:
        opt.W_emb = np.array(pickle.load(emb_file)[0], dtype='float32')
    opt.W_class_emb = load_class_embedding(wordtoix, opt)

    with tf.device('/gpu:0'):
        x_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.maxlen], name='x_')
        x_mask_ = tf.placeholder(tf.float32, shape=[opt.batch_size, opt.maxlen], name='x_mask_')
        keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        y_ = tf.placeholder(tf.float32, shape=[opt.batch_size, opt.num_class], name='y_')
        class_penalty_ = tf.placeholder(tf.float32, shape=())
        accuracy_, loss_, train_op, W_norm_, global_step = emb_classifier(
            x_, x_mask_, y_, keep_prob, opt, class_penalty_)

    uidx = 0
    max_val_accuracy = 0.
    max_test_accuracy = 0.

    config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True, )
    # config.gpu_options.allow_growth = True
    np.set_printoptions(precision=3)
    np.set_printoptions(threshold=np.inf)
    saver = tf.train.Saver()

    # Never sample more training examples than exist; indexing 500 items of
    # a smaller training set would raise IndexError.
    train_sample_size = min(500, len(train))

    def _one_hot(raw_labels):
        # Labels are shifted down by one before encoding.  The class count is
        # passed explicitly (positionally, so the call does not depend on the
        # keyword name) because otherwise the width is inferred from the
        # largest label in the batch and can come out narrower than the
        # [batch_size, num_class] placeholder.
        # NOTE(review): assumes to_categorical follows the Keras signature
        # (y, <number of classes>) -- confirm against the file's imports.
        return to_categorical(np.array(raw_labels) - 1, opt.num_class)

    with tf.Session(config=config) as sess:
        # The writers are only instantiated (which records sess.graph under
        # opt.log_path); no summaries are added through them in this function.
        train_writer = tf.summary.FileWriter(opt.log_path + '/train', sess.graph)
        test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph)
        sess.run(tf.global_variables_initializer())

        def _evaluate(sents_all, labels_all, n_examples):
            # Accuracy over minibatches indexed within range(n_examples),
            # with dropout disabled (keep_prob 1.0) and no class penalty.
            correct = 0.0
            for _, batch_idx in get_minibatches_idx(n_examples, opt.batch_size, shuffle=True):
                batch_sents = [sents_all[t] for t in batch_idx]
                batch_labels = _one_hot([labels_all[t] for t in batch_idx])
                batch_x, batch_mask = prepare_data_for_emb(batch_sents, opt)
                batch_accuracy = sess.run(
                    accuracy_,
                    feed_dict={x_: batch_x, x_mask_: batch_mask, y_: batch_labels,
                               keep_prob: 1.0, class_penalty_: 0.0})
                correct += batch_accuracy * len(batch_idx)
            return correct / n_examples

        # The interrupt handler keeps 'Training interupted' from being
        # printed after a run that finished normally.
        try:
            for epoch in range(opt.max_epochs):
                print("Starting epoch %d" % epoch)
                kf = get_minibatches_idx(len(train), opt.batch_size, shuffle=True)
                for _, train_index in kf:
                    uidx += 1
                    sents = [train[t] for t in train_index]
                    x_labels = _one_hot([train_lab[t] for t in train_index])
                    x_batch, x_batch_mask = prepare_data_for_emb(sents, opt)
                    _, loss, step, = sess.run(
                        [train_op, loss_, global_step],
                        feed_dict={x_: x_batch, x_mask_: x_batch_mask, y_: x_labels,
                                   keep_prob: opt.dropout,
                                   class_penalty_: opt.class_penalty})

                    if uidx % opt.valid_freq == 0:
                        # Estimate the training accuracy on a small sample.
                        train_accuracy = _evaluate(train, train_lab, train_sample_size)
                        print("Iteration %d: Training loss %f " % (uidx, loss))
                        print("Train accuracy %f " % train_accuracy)

                        val_accuracy = _evaluate(val, val_lab, len(val))
                        print("Validation accuracy %f " % val_accuracy)

                        if val_accuracy > max_val_accuracy:
                            max_val_accuracy = val_accuracy
                            # Score the test split only at a new validation
                            # best.
                            test_accuracy = _evaluate(test, test_lab, len(test))
                            print("Test accuracy %f " % test_accuracy)
                            max_test_accuracy = test_accuracy

                print("Epoch %d: Max Test accuracy %f" % (epoch, max_test_accuracy))
                saver.save(sess, opt.save_path, global_step=epoch)
            print("Max Test accuracy %f " % max_test_accuracy)
        except KeyboardInterrupt:
            print('Training interupted')
            print("Max Test accuracy %f " % max_test_accuracy)
def training(args):
    """Train the PyTorch LEAM model with periodic evaluation and checkpoints.

    Args:
        args: Options object.  This function reads ``filename``, ``lr``,
            ``part_data``, ``portion``, ``restore``, ``save_path``,
            ``n_epochs``, ``batch_size``, ``num_class``, ``class_penalty``
            and ``valid_freq``; ``args`` is also passed on to ``LEAM``,
            ``prepare_data_for_emb`` and ``visualize``.

    The pickled dataset holds the train/val/test sentences at indices 0-2
    and their labels at indices 3-5.  Every ``args.valid_freq`` optimisation
    steps the model is scored on a training sample (at most 1000 examples)
    and on the validation split; the test split is only scored when the
    validation accuracy reaches a new maximum.  After every epoch the curves
    are passed to ``visualize`` and a checkpoint is written to
    ``args.save_path``.
    """
    # NOTE(review): unpickling can execute arbitrary code; args.filename must
    # only ever point at trusted files.
    with open(args.filename, 'rb') as data_file:
        x = pickle.load(data_file, encoding='latin1')
    train, val, test = x[0], x[1], x[2]
    train_lab, val_lab, test_lab = x[3], x[4], x[5]
    del x
    print("load data finished")

    train_lab = np.array(train_lab, dtype='float32')
    val_lab = np.array(val_lab, dtype='float32')
    test_lab = np.array(test_lab, dtype='float32')

    model = LEAM(args)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    if args.part_data:
        # np.random.seed(123)
        train_ind = np.random.choice(len(train), int(len(train) * args.portion), replace=False)
        train = [train[t] for t in train_ind]
        train_lab = [train_lab[t] for t in train_ind]

    # Fresh-run defaults.  They are set unconditionally so that a failed
    # restore falls back to training from scratch instead of hitting a
    # NameError further down.
    epoch_start = 1
    max_val_accuracy = 0.
    max_test_accuracy = 0.
    losses = []
    val_accuracies = []
    train_accuracies = []

    if args.restore:
        try:
            checkpoint = torch.load(args.save_path)
            model.load_state_dict(checkpoint['model_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            finished_epoch = checkpoint['epoch']
        except Exception as err:  # best effort: any load failure means "start fresh"
            print("No saving session")
            print("Restore from %s failed: %r" % (args.save_path, err))
        else:
            # The checkpoint is written after its epoch completed, so resume
            # with the following one.
            epoch_start = finished_epoch + 1
            # Older checkpoints stored only the last 'loss' and have no
            # 'losses' history, hence the tolerant lookups.
            losses = checkpoint.get('losses', [])
            max_val_accuracy = checkpoint.get('max_val_accuracy', 0.)
            max_test_accuracy = checkpoint.get('max_test_accuracy', 0.)
            val_accuracies = checkpoint.get('val_accuracies', [])
            train_accuracies = checkpoint.get('train_accuracies', [])
            model.train()

    def _label_tensor(rows):
        # Stack per-example label rows into a (batch, num_class) float tensor.
        labels = torch.FloatTensor(np.array(rows, dtype='float32'))
        return labels.reshape((len(labels), args.num_class))

    def _count_correct(sents_all, labels_all, n_examples):
        # Number of correct argmax predictions over minibatches indexed
        # within range(n_examples).
        correct = 0
        for _, batch_idx in get_minibatches_idx(n_examples, args.batch_size, shuffle=True):
            batch_sents = [sents_all[t] for t in batch_idx]
            batch_labels = _label_tensor([labels_all[t] for t in batch_idx])
            batch_x, batch_mask = prepare_data_for_emb(batch_sents, args)
            batch_logits, _, _ = model(batch_x, batch_mask)
            predicted = torch.max(batch_logits, 1)[1]
            expected = torch.max(batch_labels, 1)[1]
            correct += (predicted == expected).sum().item()
        return correct

    # Cap the sample so a small (or sub-sampled) training set is not indexed
    # out of range.
    n_train_sample = min(1000, len(train))
    # Targets for the class-embedding loss term: class i should be classified
    # as i.  Loop-invariant, so built once.
    class_y = torch.LongTensor(np.arange(args.num_class))

    steps = 0
    for epoch in range(epoch_start, args.n_epochs + 1):
        print("Starting epoch %d" % epoch)
        kf = get_minibatches_idx(len(train), args.batch_size, shuffle=True)
        for _, train_index in kf:
            steps += 1
            sents = [train[t] for t in train_index]
            x_labels = _label_tensor([train_lab[t] for t in train_index])
            x_batch, x_batch_mask = prepare_data_for_emb(sents, args)

            optimizer.zero_grad()
            logits, logits_class, _ = model(x_batch, x_batch_mask)
            # cross_entropy needs class indices, so take the argmax of the
            # label rows.
            class_x = torch.max(x_labels, 1)[1]
            loss = torch.mean(F.cross_entropy(logits, class_x)) + \
                args.class_penalty * torch.mean(F.cross_entropy(logits_class, class_y))
            loss.backward()
            optimizer.step()

            if steps % args.valid_freq == 0:
                # Evaluate in inference mode and without autograd
                # bookkeeping, then switch back to training mode.
                model.eval()
                with torch.no_grad():
                    # Estimate the training accuracy on a 1000-example sample.
                    train_accuracy = (
                        _count_correct(train, train_lab, n_train_sample) / n_train_sample)
                    print("Iteration %d: Training loss %f " % (steps, loss))
                    print("Train accuracy %f " % train_accuracy)
                    # Detach so the stored history does not keep references
                    # to the autograd graph of every recorded step.
                    losses.append(loss.detach())
                    train_accuracies.append(train_accuracy)

                    val_accuracy = _count_correct(val, val_lab, len(val)) / len(val)
                    print("Validation accuracy %f " % val_accuracy)
                    val_accuracies.append(val_accuracy)

                    if val_accuracy > max_val_accuracy:
                        max_val_accuracy = val_accuracy
                        # Score the test split only at a new validation best.
                        test_accuracy = _count_correct(test, test_lab, len(test)) / len(test)
                        print("Test accuracy %f " % test_accuracy)
                        max_test_accuracy = test_accuracy
                model.train()

        visualize(args, losses, train_accuracies, val_accuracies)
        print("Epoch %d: Max Test accuracy %f" % (epoch, max_test_accuracy))
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            # 'loss' is kept for readers of older checkpoints; 'losses' is
            # the history that the restore path reads back.
            'loss': loss.detach(),
            'losses': losses,
            'max_val_accuracy': max_val_accuracy,
            'max_test_accuracy': max_test_accuracy,
            'val_accuracies': val_accuracies,
            'train_accuracies': train_accuracies,
        }, args.save_path)
    print("Max Test accuracy %f " % max_test_accuracy)