def dump_word_dict():
    # Build the vocabulary from the training data, then extend it with the
    # validation data, offsetting new indices past the reserved special tokens.
    word_dict = build_word_dict(train_data_path)
    start_index = len(word_dict) + 4 + 1
    word_dict.update(build_word_dict(valid_data_path, start_index))
    word_dict["<pad>"] = 0
    word_dict["<bos>"] = 1
    word_dict["<eos>"] = 2
    word_dict["<unk>"] = 3
    print('dumping word_dict...')
    with open(word_dict_path, 'w') as f:
        json.dump(word_dict, f)
    print('word_dict dumped')
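# A minimal sketch of a build_word_dict compatible with the calls above, assuming
# it tokenizes the file on whitespace and assigns consecutive indices starting at
# start_index so the reserved ids 0-3 (<pad>, <bos>, <eos>, <unk>) stay free.
# The default start_index, the whitespace tokenizer, and the absence of a
# frequency cut-off are assumptions, not the project's actual implementation.
def build_word_dict_sketch(data_path, start_index=5):
    word_dict = {}
    with open(data_path, 'r', encoding='utf8') as f:
        for line in f:
            for word in line.strip().split():
                if word not in word_dict:
                    word_dict[word] = start_index + len(word_dict)
    return word_dict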
            train_step(batch_x, batch_y)
            step = tf.train.global_step(sess, global_step)

            if step % 200 == 0:
                test_acc = test_accuracy(test_x, test_y)
                print("test_accuracy = {0}\n".format(test_acc))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--pre_trained", type=str, default="none",
                        help="none | auto_encoder | language_model")
    parser.add_argument("--summary_dir", type=str, default="classifier",
                        help="summary dir.")
    args = parser.parse_args()

    if not os.path.exists("dbpedia_csv"):
        print("Downloading dbpedia dataset...")
        download_dbpedia()

    print("\nBuilding dictionary..")
    word_dict = build_word_dict()
    print("Preprocessing dataset..")
    train_x, train_y = build_word_dataset("train", word_dict, MAX_DOCUMENT_LEN)
    test_x, test_y = build_word_dataset("test", word_dict, MAX_DOCUMENT_LEN)
    train(train_x, train_y, test_x, test_y, len(word_dict), args)
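# A hedged sketch of the test_accuracy helper referenced in the loop above. It
# assumes the graph exposes placeholders model.x, model.y, model.keep_prob and an
# accuracy tensor model.accuracy, and that evaluation runs over fixed-size
# slices; these names and the explicit sess/model parameters are assumptions for
# illustration only.
def test_accuracy_sketch(sess, model, test_x, test_y, batch_size=256):
    accuracies = []
    for i in range(0, len(test_x), batch_size):
        feed_dict = {
            model.x: test_x[i:i + batch_size],
            model.y: test_y[i:i + batch_size],
            model.keep_prob: 1.0,  # no dropout at evaluation time
        }
        accuracies.append(sess.run(model.accuracy, feed_dict=feed_dict))
    return float(sum(accuracies)) / len(accuracies)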
    args.summary_dir = path
    args.model_dir = model_dir
    write_csv_files(train_text_dirs, test_text_dirs, args.labels, args.labels,
                    path, 'train.csv', 'test.csv',
                    args.labeled_data_num, args.test_data_num)
    train_path = os.path.join(path, 'train.csv')
    test_path = os.path.join(path, 'test.csv')

    print("\nBuilding dictionary..")
    if args.pre_trained == 'none':
        # Without a pre-trained model, build the vocabulary from a large
        # unlabeled corpus.
        unlabeled_csv_file = 'unlabeled_150000.csv'
        unlabeled_csv_path = os.path.join(model_dir, unlabeled_csv_file)
        if not os.path.exists(unlabeled_csv_path):
            write_csv_file([os.path.join(dataset_dir, args.data_type + '.txt')],
                           [-1], model_dir, unlabeled_csv_file, 150000)
        word_dict = build_word_dict(model_dir, 20000, unlabeled_csv_path)
        # word_dict = build_word_dict(model_dir, None, train_path)
    else:
        # Reuse the dictionary saved during pre-training.
        word_dict = build_word_dict(model_dir, None)

    print("Preprocessing dataset..")
    label_map = {label: k for k, label in enumerate(args.labels)}
    train_x, train_y = build_word_dataset(train_path, test_path, "train", word_dict,
                                          args.max_document_len, label_map,
                                          up_sample=args.up_sample)
    test_x, test_y = build_word_dataset(train_path, test_path, "test", word_dict,
                                        args.max_document_len, label_map)
    logout_config(args, train_y, test_y)
    train(train_x, train_y, test_x, test_y, len(word_dict), args)
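# A minimal sketch of write_csv_file consistent with how it is called above: each
# input text file is paired with one label (-1 marks unlabeled data) and at most
# sample_num lines per file are written as (label, text) rows. The column order,
# the csv module usage, and the lack of shuffling are assumptions.
import csv
import os

def write_csv_file_sketch(text_dirs, labels, out_dir, out_file, sample_num=None):
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, out_file)
    with open(out_path, 'w', encoding='utf8', newline='') as f_out:
        writer = csv.writer(f_out)
        for text_dir, label in zip(text_dirs, labels):
            with open(text_dir, 'r', encoding='utf8') as f_in:
                for i, line in enumerate(f_in):
                    if sample_num is not None and i >= sample_num:
                        break
                    writer.writerow([label, line.strip()])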
    if not os.path.exists(train_text_dir):
        with open(train_text_dir, 'w', encoding='utf8') as f_train:
            f_train.writelines(all_lines[:train_sample])
    if not os.path.exists(test_text_dir):
        with open(test_text_dir, 'w', encoding='utf8') as f_test:
            f_test.writelines(all_lines[-test_sample:])

    model_dir = get_train_path(args)
    args.model_dir = model_dir
    write_csv_files(train_text_dirs, test_text_dirs, args.labels, args.labels,
                    model_dir, 'train.csv', 'test.csv')
                    # args.labeled_data_num, args.test_data_num)
    train_path = os.path.join(model_dir, 'train.csv')
    test_path = os.path.join(model_dir, 'test.csv')

    print("\nBuilding dictionary..")
    word_dict = build_word_dict(dataset_dir)
    embed_dict = build_embedding(word_dict, dataset_dir)
    print("Preprocessing dataset..")
    label_map = {label: k for k, label in enumerate(args.labels)}
    train_x, train_y, valid_x, valid_y = build_word_dataset(
        train_path, test_path, "train", word_dict, args.max_document_len,
        label_map, up_sample=args.up_sample)
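# A hedged sketch of build_embedding, assuming pre-trained word vectors live in a
# GloVe-style text file (token followed by float components) under dataset_dir
# and are kept only for words present in word_dict. The file name
# 'embedding.txt' and the 300-dimension default are assumptions.
import os
import numpy as np

def build_embedding_sketch(word_dict, dataset_dir, embed_file='embedding.txt', dim=300):
    embed_dict = {}
    with open(os.path.join(dataset_dir, embed_file), 'r', encoding='utf8') as f:
        for line in f:
            parts = line.rstrip().split()
            if len(parts) != dim + 1:
                continue  # skip malformed lines
            word = parts[0]
            if word in word_dict:
                embed_dict[word] = np.asarray(parts[1:], dtype=np.float32)
    return embed_dict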
        str(args.num_hidden), str(args.hidden_layers)]))
    if not os.path.exists(path):
        os.makedirs(path)
    args.summary_dir = path
    args.model_dir = model_dir
    write_csv_files(train_text_dirs, test_text_dirs, args.labels, args.labels,
                    path, 'train.csv', 'test.csv',
                    args.labeled_data_num, args.test_data_num)
    train_path = os.path.join(path, 'train.csv')
    test_path = os.path.join(path, 'test.csv')

    print("\nBuilding dictionary..")
    if args.pre_trained == 'none':
        word_dict = build_word_dict(model_dir, None, train_path)
    else:
        word_dict = build_word_dict(model_dir, None)

    print("Preprocessing dataset..")
    label_map = {label: k for k, label in enumerate(args.labels)}
    train_x, train_y = build_word_dataset(train_path, test_path, "train", word_dict,
                                          args.max_document_len, label_map,
                                          up_sample=args.up_sample)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, default="language_model",
                        help="auto_encoder | language_model")
    # parser.add_argument("--model_name", type=str, default="model",
    #                     help="the folder name of the model")
    parser.add_argument("--dict_size", type=int, default=20000,
                        help="max size of the word dictionary")
    parser.add_argument("--data_folder", type=str, default="ACL",
                        help="ACL | Markov | huffman_tree | two_tree")
    parser.add_argument("--data_type", type=str, default="news",
                        help="movie | news | tweet")
    parser.add_argument("--unlabeled_data_num", type=int, default=50000,
                        help="how many unlabeled data samples to use")
    parser.add_argument("--batch_size", type=int, default=128, help="batch size")
    parser.add_argument("--lr", type=float, default=0.001, help="learning rate")
    parser.add_argument("--num_epochs", type=int, default=10, help="number of epochs")
    parser.add_argument("--max_document_len", type=int, default=30,
                        help="max sentence length")
    args = parser.parse_args()

    dataset_dir = os.path.join("dataset", args.data_folder, args.data_type)
    unlabeled_text_dirs = [os.path.join(dataset_dir, args.data_type + '.txt')]
    model_dir = os.path.join(args.model, args.data_folder, args.data_type,
                             str(args.unlabeled_data_num))

    unlabeled_csv_file = 'unlabeled_' + str(args.unlabeled_data_num) + '.csv'
    unlabeled_csv_path = os.path.join(model_dir, unlabeled_csv_file)
    if not os.path.exists(unlabeled_csv_path):
        write_csv_file(unlabeled_text_dirs, [-1], model_dir, unlabeled_csv_file,
                       args.unlabeled_data_num)

    print("\nBuilding dictionary..")
    word_dict = build_word_dict(model_dir, args.dict_size, unlabeled_csv_path)
    print("Preprocessing dataset..")
    train_x, train_y = build_word_dataset(unlabeled_csv_path, None, "train",
                                          word_dict, args.max_document_len)
    logout_config(args, model_dir, len(word_dict))
    train(train_x, train_y, word_dict, args, model_dir)
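# A hedged sketch of logout_config matching the (args, model_dir, vocab_size)
# call above, assuming it only records the run configuration and vocabulary size
# to a plain-text file inside model_dir. The file name 'config.log' and the
# "key = value" format are assumptions.
import os

def logout_config_sketch(args, model_dir, vocab_size):
    os.makedirs(model_dir, exist_ok=True)
    log_path = os.path.join(model_dir, 'config.log')
    with open(log_path, 'w', encoding='utf8') as f:
        for key, value in sorted(vars(args).items()):
            f.write('{} = {}\n'.format(key, value))
        f.write('vocab_size = {}\n'.format(vocab_size))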