            train_step(batch_x, batch_y)
            step = tf.train.global_step(sess, global_step)

            # Evaluate on the test set every 200 training steps.
            if step % 200 == 0:
                test_acc = test_accuracy(test_x, test_y)
                print("test_accuracy = {0}\n".format(test_acc))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--pre_trained", type=str, default="none", help="none | auto_encoder | language_model")
    parser.add_argument("--summary_dir", type=str, default="classifier", help="summary dir.")
    args = parser.parse_args()

    if not os.path.exists("dbpedia_csv"):
        print("Downloading dbpedia dataset...")
        download_dbpedia()

    print("\nBuilding dictionary..")
    word_dict = build_word_dict()

    print("Preprocessing dataset..")
    train_x, train_y = build_word_dataset("train", word_dict, MAX_DOCUMENT_LEN)
    test_x, test_y = build_word_dataset("test", word_dict, MAX_DOCUMENT_LEN)

    train(train_x, train_y, test_x, test_y, len(word_dict), args)
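# Hedged usage note: assuming this main block lives in a script such as train_classifier.py (the file
# name is an assumption, not confirmed by this excerpt), it would be launched with the flags defined
# above, e.g.:
#
#   python train_classifier.py --pre_trained language_model --summary_dir classifier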
import pickle

import numpy as np
import tensorflow as tf  # needed for tf.reset_default_graph() below; missing from the original imports
from sklearn.metrics.pairwise import cosine_similarity

from pre_train import train
from data_utils import build_word_dict, build_word_dataset
from query import QuestionEmbedding

MAX_DOCUMENT_LEN = 50

file_dir = "data/wikismall-complex.txt"
data = open(file_dir, encoding='utf-8', errors='ignore').read().split('\n')

# Train on all of the data and convert it to the new embedded representation.
print("\nBuilding dictionary..")
word_dict = build_word_dict()

print("Preprocessing dataset..")
train_x, train_y = build_word_dataset(word_dict, MAX_DOCUMENT_LEN)
embedded_data = train(train_x, train_y, word_dict)

# Test: convert a question to its embedded representation based on the previously trained model weights.
print(" Question: ")
tf.reset_default_graph()
query1 = "His Seven Stars Symphony features movements inspired by Douglas Fairbanks , Lilian Harvey ," \
         " Greta Garbo , Clara Bow , Marlene Dietrich , Emil Jannings and Charlie Chaplin in some of " \
         "their most famous film roles ."

with open("word_dict.pickle", "rb") as f:
    word_dict = pickle.load(f)

question_embedding = QuestionEmbedding(query1, word_dict)
embed_question = question_embedding.query()


def sum_embedded_words(encode_question):
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, default="language_model", help="auto_encoder | language_model")
    # parser.add_argument("--model_name", type=str, default="model", help="the folder name of the model")
    parser.add_argument("--dict_size", type=int, default=20000, help="the max size of word dictionary")
    parser.add_argument("--data_folder", type=str, default="ACL", help="ACL | Markov | huffman_tree | two_tree")
    parser.add_argument("--data_type", type=str, default="news", help="movie | news | tweet")
    parser.add_argument("--unlabeled_data_num", type=int, default=50000, help="how many unlabeled data samples to use")
    parser.add_argument("--batch_size", type=int, default=128, help="batch size")
    parser.add_argument("--lr", type=float, default=0.001, help="learning rate")
    parser.add_argument("--num_epochs", type=int, default=10, help="epoch num")
    parser.add_argument("--max_document_len", type=int, default=30, help="max length of sentence")
    args = parser.parse_args()

    dataset_dir = os.path.join("dataset", args.data_folder, args.data_type)
    unlabeled_text_dirs = [os.path.join(dataset_dir, args.data_type + '.txt')]
    model_dir = os.path.join(args.model, args.data_folder, args.data_type, str(args.unlabeled_data_num))

    unlabeled_csv_file = 'unlabeled_' + str(args.unlabeled_data_num) + '.csv'
    unlabeled_csv_path = os.path.join(model_dir, unlabeled_csv_file)
    if not os.path.exists(unlabeled_csv_path):
        write_csv_file(unlabeled_text_dirs, [-1], model_dir, unlabeled_csv_file, args.unlabeled_data_num)

    print("\nBuilding dictionary..")
    word_dict = build_word_dict(model_dir, args.dict_size, unlabeled_csv_path)

    print("Preprocessing dataset..")
    train_x, train_y = build_word_dataset(unlabeled_csv_path, None, "train", word_dict, args.max_document_len)

    logout_config(args, model_dir, len(word_dict))
    train(train_x, train_y, word_dict, args, model_dir)
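# Hedged usage note: assuming this unsupervised pre-training entry point is saved as, e.g.,
# pre_train.py (a hypothetical file name, not confirmed by this excerpt), a run with the defaults
# defined above would look like:
#
#   python pre_train.py --model language_model --data_folder ACL --data_type news \
#       --unlabeled_data_num 50000 --max_document_len 30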
    args.model_dir = model_dir
    write_csv_files(train_text_dirs, test_text_dirs, args.labels, args.labels, path, 'train.csv', 'test.csv',
                    args.labeled_data_num, args.test_data_num)
    train_path = os.path.join(path, 'train.csv')
    test_path = os.path.join(path, 'test.csv')

    print("\nBuilding dictionary..")
    if args.pre_trained == 'none':
        unlabeled_csv_file = 'unlabeled_150000.csv'
        unlabeled_csv_path = os.path.join(model_dir, unlabeled_csv_file)
        if not os.path.exists(unlabeled_csv_path):
            write_csv_file([os.path.join(dataset_dir, args.data_type + '.txt')], [-1], model_dir,
                           unlabeled_csv_file, 150000)
        print("\nBuilding dictionary..")
        word_dict = build_word_dict(model_dir, 20000, unlabeled_csv_path)
        print("Preprocessing dataset..")
        # word_dict = build_word_dict(model_dir, None, train_path)
    else:
        word_dict = build_word_dict(model_dir, None)
        print("Preprocessing dataset..")

    # Map each label to a consecutive integer index.
    label_map = dict()
    k = 0
    for label in args.labels:
        label_map[label] = k
        k = k + 1

    train_x, train_y = build_word_dataset(train_path, test_path, "train", word_dict, args.max_document_len,
                                          label_map, up_sample=args.up_sample)
    test_x, test_y = build_word_dataset(train_path, test_path, "test", word_dict, args.max_document_len, label_map)

    logout_config(args, train_y, test_y)
    train(train_x, train_y, test_x, test_y, len(word_dict), args)
flags.DEFINE_integer("batch_size_val", 1000, "the size of one validation batch data")
flags.DEFINE_integer("num_epochs", 10, "the number of epochs")
FLAGS = flags.FLAGS

# Vocabulary
tokens = build_tokens()  # len(tokens) = 30003

# Model graph
# tf.reset_default_graph()
model = TextCNN(tokens, FLAGS.sentence_len, FLAGS.embed_size, FLAGS.num_classes, FLAGS.l2_lambda,
                FLAGS.learning_rate, FLAGS.kernels_size, FLAGS.filter_nums, FLAGS.static, FLAGS.keep_prob)

# Prepare the dataset using the vocabulary, then split off a validation set
X, y = build_word_dataset("train", tokens, FLAGS.sentence_len)  # [560000, 50]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

saver = tf.train.Saver()
merged = tf.summary.merge_all()

with tf.Session() as sess:
    train_writer = tf.summary.FileWriter("./temp/v2/graph/train", graph=sess.graph)
    val_writer = tf.summary.FileWriter("./temp/v2/graph/val")
    sess.run(tf.global_variables_initializer())

    train_data = batch_iter(X_train, y_train, FLAGS.batch_size, FLAGS.num_epochs)  # (448000/60)=7466
    val_data = batch_iter(X_val, y_val, FLAGS.batch_size_val, FLAGS.num_epochs)  # (112000/10000)*200=11.2*200=2240
    # One validation accuracy can be computed every 2240 iterations.
    best_val_acc = 0
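    # Sketch (assumption, not this repository's actual loop): best_val_acc is typically used to keep
    # only the best checkpoint, along the lines of
    #
    #   for x_batch, y_batch in train_data:
    #       # ... run a training step and periodically evaluate on a batch from val_data ...
    #       if val_acc > best_val_acc:
    #           best_val_acc = val_acc
    #           saver.save(sess, checkpoint_path)  # checkpoint_path is a hypothetical placeholder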