def run_func():
    config = Config()

    # ========= Load Dataset =========
    # You can change this code to load dataset in your own way
    vocab, rev_vocab = initialize_vocab(config.vocab_path)

    dev_path = "data/squad/fuse.json"
    dev_dirname = os.path.dirname(os.path.abspath(dev_path))
    dev_filename = os.path.basename(dev_path)
    context_data, question_data, question_uuid_data = prepare_dev(
        dev_dirname, dev_filename, vocab)

    ques_len = len(question_data)
    answers = [[0, 0] for _ in xrange(ques_len)]
    dataset = [question_data, context_data, answers]

    embed_path = config.embed_path
    embeddings = get_trimmed_glove_vectors(embed_path)

    encoder = Encoder(config.hidden_state_size)
    decoder = Decoder(config.hidden_state_size)
    qa = QASystem(encoder, decoder, embeddings, config)

    with tf.Session() as sess:
        qa.initialize_model(sess, config.train_dir)
        answers, _ = generate_answers(sess, qa, dataset, question_uuid_data,
                                      rev_vocab)

        # write predictions to a json file
        with io.open('temp/fuse-answer.json', 'w', encoding='utf-8') as f:
            f.write(unicode(json.dumps(answers, ensure_ascii=False)))

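# Note: `initialize_vocab` is used by every snippet here but its body is not
# shown. A minimal sketch consistent with how it is called (vocab maps
# word -> id, rev_vocab maps id -> word); the actual implementation may differ.
def initialize_vocab(vocab_path):
    # One token per line; the line number is the token id.
    if not os.path.exists(vocab_path):
        raise ValueError("Vocabulary file %s not found." % vocab_path)
    with open(vocab_path) as f:
        rev_vocab = [line.rstrip('\n') for line in f]
    vocab = dict((word, idx) for idx, word in enumerate(rev_vocab))
    return vocab, rev_vocab
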
def run_func2(dataset, config):
    vocab, rev_vocab = initialize_vocab(config.vocab_path)

    q, c, a = zip(*[[_q, _c, _a] for (_q, _c, _a) in dataset])
    dataset = [q, c, a]

    embed_path = config.embed_path
    embeddings = get_trimmed_glove_vectors(embed_path)

    encoder = Encoder(config.hidden_state_size)
    decoder = Decoder(config.hidden_state_size)
    qa = QASystem(encoder, decoder, embeddings, config)

    question_uuid_data = [i for i in xrange(len(a))]

    with tf.Session() as sess:
        qa.initialize_model(sess, config.train_dir)
        answers, answers_canonical = generate_answers(sess, qa, dataset,
                                                      question_uuid_data,
                                                      rev_vocab)

        # write answers to a text file, one per line
        with io.open('dev-prediction.txt', 'w', encoding='utf-8') as f:
            for i in xrange(len(a)):
                curr_ans = unicode(answers[i], "utf-8")
                f.write("%s\n" % (curr_ans))

def run_func():
    config = Config()

    vocab, rev_vocab = initialize_vocab(config.vocab_path)

    dev_path = "download/squad/test.json"
    dev_dirname = os.path.dirname(os.path.abspath(dev_path))
    dev_filename = os.path.basename(dev_path)
    context_data, question_data, question_uuid_data = prepare_dev(
        dev_dirname, dev_filename, vocab)

    ques_len = len(question_data)
    answers = [[0, 0] for _ in xrange(ques_len)]
    dataset = [question_data, context_data, answers]

    embed_path = config.embed_path
    embeddings = get_trimmed_glove_vectors(embed_path)

    encoder = Encoder(config.hidden_state_size)
    decoder = Decoder(config.hidden_state_size)
    qa = QASystem(encoder, decoder, embeddings, config)

    data = "Id,Answer\n"
    with tf.Session() as sess:
        qa.initialize_model(sess, config.train_dir)
        answers, _ = generate_answers(sess, qa, dataset, question_uuid_data,
                                      rev_vocab)
        # answers maps question uuid -> answer string
        for uuid in answers:
            ans = answers[uuid]
            data += uuid + "," + normalize_answer(ans).replace(" s ", "s ") + "\n"

    with open('submission.csv', 'wb') as f:
        f.write(data)

def main(_):
    # Do what you need to load datasets from FLAGS.data_dir
    datasetTrain = initialize_datasets(FLAGS.data_dir, 'train')
    datasetVal = initialize_datasets(FLAGS.data_dir, 'val')
    #datasetTrain = datasetTrain[0:100]

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)

    qa = QASystem(encoder, decoder, embed_path, rev_vocab, FLAGS)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, datasetTrain, save_train_dir)

        qa.evaluate_answer(sess, datasetVal, sample=1000, log=True)

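# `initialize_model(sess, model, train_dir)` is called throughout but not
# defined here. A minimal sketch of the usual restore-or-initialize pattern,
# assuming the model exposes a `saver` attribute; the real helper may differ.
def initialize_model(session, model, train_dir):
    ckpt = tf.train.get_checkpoint_state(train_dir)
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        logging.info("Reading model parameters from %s",
                     ckpt.model_checkpoint_path)
        model.saver.restore(session, ckpt.model_checkpoint_path)
    else:
        logging.info("Created model with fresh parameters.")
        session.run(tf.global_variables_initializer())
    return model
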
def main(_):
    # Do what you need to load datasets from FLAGS.data_dir
    # load all at once; maybe better to try batch by batch
    question_path = "./data/squad/train.ids.question"
    context_path = "./data/squad/train.ids.context"
    answer_path = "./data/squad/train.span"
    val_q = "./data/squad/val.ids.question"
    val_c = "./data/squad/val.ids.context"
    val_a = "./data/squad/val.span"

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")

    # embeddings is a matrix of shape [vocab_size, embedding_size]
    embeddings = np.load(embed_path)['glove'].astype(np.float32)

    val_data = load_and_pad_val_data(val_q, val_c, val_a)

    # vocab is the mapping from word -> token id
    # rev_vocab is the reverse mapping, from id -> word
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # someone posted that the max length of question is 766
    info = (question_path, context_path, answer_path, FLAGS.batch_size,
            FLAGS.max_length, FLAGS.output_size)
    '''
    batch_gen = batch_generator(question_path, context_path, answer_path,
                                FLAGS.batch_size, FLAGS.max_length,
                                FLAGS.output_size)
    i = 0
    while True:
        batch_gen.next()
        i += 1
        logging.info(i)
    '''

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)

    qa = QASystem(encoder, decoder, FLAGS, embeddings)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)
        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, batch_generator, info, save_train_dir, val_data,
                 rev_vocab)

def main(FLAGS): print(80 * "=") print("INITIALIZING") print(80 * "=") # Do what you need to load datasets from FLAGS.data_dir #parser, embeddings, train_examples, dev_set, test_set = load_and_preprocess_data(debug) if not os.path.exists('./data/weights/'): os.makedirs('./data/weights/') embed_path = FLAGS.embed_path or pjoin( "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size)) vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat") vocab, rev_vocab = initialize_vocab(vocab_path) print("Loading Embedding Matrix") embeddings = np.load(embed_path)['glove'] encoder = Encoder(size=FLAGS.output_size, vocab_dim=FLAGS.embedding_size) decoder = Decoder(output_size=FLAGS.output_size) qa = QASystem(encoder, decoder, FLAGS, embeddings) #qa = QASystem(encoder, FLAGS, embeddings) if not os.path.exists(FLAGS.log_dir): os.makedirs(FLAGS.log_dir) file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt")) logging.getLogger().addHandler(file_handler) print(vars(FLAGS)) with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout: json.dump(FLAGS.__flags, fout) with tf.Session() as sess: load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir or FLAGS.train_dir) print("Building Network ... ") initialize_model(sess, qa, load_train_dir) save_train_dir = get_normalized_train_dir(FLAGS.train_dir) print("Load Training Data") dataset = initialize_datasets(FLAGS.data_dir, dataset='train', debugMode=True) # encoder.encode_question( # dataset['Questions'], question['Questions_masks']) print(80 * "=") print("Training") print(80 * "=") qa.train(sess, dataset, save_train_dir) print("Finished Training") print("Load Validation Data") dataset = initialize_datasets(FLAGS.data_dir, dataset='val', debugMode=True) print(80 * "=") print("Evaluation") print(80 * "=") qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)
def main(_):
    FLAGS.config = int(sys.argv[1])
    load_config(current_config=FLAGS.config)

    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.vocab_dim))

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize the
    # file paths saved in the checkpoint. This allows the model to be reloaded even
    # if the location of the checkpoint files has moved, allowing usage with CodaLab.
    # This must be done on both train.py and qa_answer.py in order to work.
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    if os.path.exists(global_train_dir):
        os.unlink(global_train_dir)
    os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # ========= Load Dataset =========
    # You can change this code to load dataset in your own way
    dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    dev_filename = os.path.basename(FLAGS.dev_path)
    context_data, question_data, question_uuid_data = prepare_dev(
        dev_dirname, dev_filename, vocab)
    dataset = (context_data, question_data, question_uuid_data)

    # ========= Model-specific =========
    # You must change the following code to adjust to your model
    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)
    qa = QASystem(encoder, decoder, train_dir)

    with tf.Session() as sess:
        initialize_model(sess, qa, train_dir)
        answers = generate_answers(sess, qa, dataset, rev_vocab)

        # write predictions to a json file
        with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
            f.write(unicode(json.dumps(answers, ensure_ascii=False)))

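# `get_normalized_train_dir` is used by most of the training snippets. Its
# intent is spelled out in the symlink comment above: checkpoints are always
# saved and loaded through the /tmp/cs224n-squad-train symlink so they survive
# being moved. A sketch consistent with that logic:
def get_normalized_train_dir(train_dir):
    global_train_dir = '/tmp/cs224n-squad-train'
    if os.path.exists(global_train_dir):
        os.unlink(global_train_dir)
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)
    os.symlink(os.path.abspath(train_dir), global_train_dir)
    return global_train_dir
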
def main(_):
    # Do what you need to load datasets from FLAGS.data_dir
    #dataset = load_data(FLAGS.data_dir)  # None
    dataset = {}
    num_train = load_data_dq(dataset, 'train', FLAGS.data_dir)
    num_val = load_data_dq(dataset, 'val', FLAGS.data_dir)
    load_data_sa(dataset, 'train', FLAGS.data_dir, num_train)
    load_data_sa(dataset, 'val', FLAGS.data_dir, num_val)
    trim(dataset['train'])
    trim(dataset['val'])

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)

    # Session moved upfront to set the ifgpu flag before QASystem
    with tf.Session() as sess:
        local_device_protos = device_lib.list_local_devices()  # 38559755
        for x in local_device_protos:
            if x.device_type == 'GPU':
                FLAGS.ifgpu = True
                break

        qa = QASystem(encoder, decoder, embed_path, rev_vocab)

        if not os.path.exists(FLAGS.log_dir):
            os.makedirs(FLAGS.log_dir)
        file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
        logging.getLogger().addHandler(file_handler)
        #print(vars(FLAGS))
        with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
            json.dump(FLAGS.__flags, fout)

        # tfdbg
        #sess = tfdbg.LocalCLIDebugWrapperSession(sess)
        #sess.add_tensor_filter("has_inf_or_nan", tfdbg.has_inf_or_nan)

        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)
        #qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)

        sess.close()  # tfdbg

def main(_):
    dataset = {}
    question_max_len = 40
    context_max_len = 600

    # Preprocess and collect small test data
    test_q_path = pjoin(FLAGS.data_dir, "test.ids.question")
    test_q_data, test_q_seq_len = pad_sentences(test_q_path, question_max_len)
    assert not any(
        test_q_seq_len > test_q_data.shape[1]
    ), 'Some questions have length greater than max question length'

    test_c_path = pjoin(FLAGS.data_dir, "test.ids.context")
    test_c_data, test_c_seq_len = pad_sentences(test_c_path, context_max_len)
    assert not any(
        test_c_seq_len > test_c_data.shape[1]
    ), 'Some contexts have length greater than max context length'

    test_s_path = pjoin(FLAGS.data_dir, "test.span")
    test_s_e_id = get_answer_span(test_s_path, context_max_len)

    dataset['test'] = [
        test_q_data, test_q_seq_len, test_c_data, test_c_seq_len, test_s_e_id
    ]

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(state_size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(state_size=FLAGS.state_size)

    qa = QASystem(encoder, decoder, question_max_len, context_max_len,
                  embed_path, FLAGS.learning_rate, FLAGS.batch_size,
                  FLAGS.dropout, FLAGS.optimizer,
                  FLAGS.max_gradient_norm)  # try without dropout

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)
        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset['test'], save_train_dir, small_data_test=True)

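# `pad_sentences` and `get_answer_span` are project-specific helpers whose
# bodies are not shown. Plausible sketches based only on how they are used
# above (a padded id matrix plus per-example lengths; one start/end pair per
# line, clamped to the padded length); names and details are assumptions.
def pad_sentences(path, max_len, pad_id=0):
    sentences, seq_len = [], []
    with open(path) as f:
        for line in f:
            ids = [int(tok) for tok in line.split()][:max_len]
            seq_len.append(len(ids))
            sentences.append(ids + [pad_id] * (max_len - len(ids)))
    return np.array(sentences), np.array(seq_len)

def get_answer_span(path, context_max_len):
    spans = []
    with open(path) as f:
        for line in f:
            start, end = [int(tok) for tok in line.split()]
            # clamp to the padded paragraph length (an assumption)
            spans.append([min(start, context_max_len - 1),
                          min(end, context_max_len - 1)])
    return np.array(spans)
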
def main(_):
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)

    FLAGS.embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # ========= Load Dataset =========
    # You can change this code to load dataset in your own way
    dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    dev_filename = os.path.basename(FLAGS.dev_path)
    context_data, question_data, question_uuid_data = prepare_dev(
        dev_dirname, dev_filename, vocab)
    dataset = {
        "val_context": context_data,
        "val_questions": question_data,
        "val_question_uuids": question_uuid_data
    }

    # ========= Model-specific =========
    # You must change the following code to adjust to your model
    encoder = Encoder(size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size,
                      FLAGS=FLAGS)
    decoder = Decoder(FLAGS=FLAGS)
    qa = QASystem(encoder, decoder, FLAGS)

    with tf.Session() as sess:
        #train_dir = get_normalized_train_dir(FLAGS.train_dir)
        train_dir = FLAGS.train_dir
        print("train_dir: ", train_dir)
        initialize_model(sess, qa, train_dir)

        print("Generating Answers")
        answers = generate_answers(sess, qa, dataset, rev_vocab)

        # write predictions to a json file
        print("Writing to json file")
        with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
            f.write(unicode(json.dumps(answers, ensure_ascii=False)))

def main(_):
    # TODO
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = dict()
    for dataset_type in ['train', 'val']:
        with open(os.path.join(FLAGS.data_dir,
                               "%s.ids.context" % dataset_type)) as f:
            data_context = [
                map(int, line.split()) for line in f.read().splitlines()
            ]
        with open(os.path.join(FLAGS.data_dir,
                               "%s.ids.question" % dataset_type)) as f:
            data_question = [
                map(int, line.split()) for line in f.read().splitlines()
            ]
        with open(os.path.join(FLAGS.data_dir,
                               "%s.span" % dataset_type)) as f:
            data_span = [
                map(int, line.split()) for line in f.read().splitlines()
            ]
        dataset[dataset_type] = (data_context, data_question, data_span)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # `config` is not defined in this function; it presumably refers to a
    # module-level configuration object.
    encoder = Encoder(size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size,
                      config=config)
    decoder = Decoder(output_size=FLAGS.output_size, config=config)

    qa = QASystem(encoder, decoder, config=config)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)
        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)

        qa.evaluate_answer(sess, dataset, FLAGS.evaluate, log=True)

def main(_):
    data_dir = cfg.DATA_DIR
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab)

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    tf.reset_default_graph()

    encoder = Encoder(size=2 * cfg.lstm_num_hidden)
    decoder = Decoder(output_size=2 * cfg.lstm_num_hidden)
    qa = QASystem(encoder, decoder, FLAGS.embed)

    with tf.Session(config=config) as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        load_train_dir = get_normalized_train_dir(FLAGS.ckpt)
        initialize_model(sess, qa, load_train_dir)

        print('*********************************************************************')
        print("Welcome! You can use this to explore the behavior of the model.")
        print('*********************************************************************')

        while True:
            print('-------------------')
            print('Input the context: ')
            print('-------------------')
            sentence = raw_input()

            print('-------------------')
            print('Input the question: ')
            print('-------------------')
            query = raw_input()

            raw_context = nltk.word_tokenize(sentence)
            context = sentence_to_token_ids(sentence, vocab,
                                            tokenizer=nltk.word_tokenize)
            question = sentence_to_token_ids(query, vocab,
                                             tokenizer=nltk.word_tokenize)

            context_in = mask_input(context, cfg.context_max_len)
            question_in = mask_input(question, cfg.question_max_len)

            start, end = qa.answer(sess, [context_in], [question_in])
            answer = ' '.join(raw_context[start[0]:end[0] + 1])

            print('==========================================')
            print('ANSWER: {}'.format(answer))
            print('==========================================')

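# `sentence_to_token_ids` and `mask_input` above are not shown. Minimal
# sketches consistent with their usage (tokenize and map to ids with an
# unknown-token fallback; pad or truncate to max_len and record which
# positions are real tokens). The unk id of 2 mirrors the `else 2` fallback
# used in a later snippet; all names and defaults here are assumptions.
def sentence_to_token_ids(sentence, vocab, tokenizer=None, unk_id=2):
    words = tokenizer(sentence) if tokenizer else sentence.split()
    return [vocab.get(w, unk_id) for w in words]

def mask_input(token_ids, max_len, pad_id=0):
    ids = token_ids[:max_len]
    mask = [True] * len(ids) + [False] * (max_len - len(ids))
    return ids + [pad_id] * (max_len - len(ids)), mask
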
def main(_):
    # Do what you need to load datasets from FLAGS.data_dir
    # use .readlines() to load file ourselves
    # use python generator
    question_path = pjoin(FLAGS.data_dir, "data_train/train_tokenH.txt")
    paragraph_path = pjoin(FLAGS.data_dir, "data_train/train_tokenP.txt")
    answer_path = pjoin(FLAGS.data_dir, "data_train/train_index.txt")
    val_question_path = pjoin(FLAGS.data_dir, "data_dev/dev_tokenH.txt")
    val_paragraph_path = pjoin(FLAGS.data_dir, "data_dev/dev_tokenP.txt")
    val_answer_path = pjoin(FLAGS.data_dir, "data_dev/dev_index.txt")

    # for testing
    # dataset = [(1,1,1), (1,1,1)]
    dataset = load_dataset(question_path, paragraph_path, answer_path,
                           FLAGS.batch_size)
    val_dataset = load_dataset(val_question_path, val_paragraph_path,
                               val_answer_path, FLAGS.batch_size)
    #generate_histograms(dataset)
    #generate_histograms(val_dataset)

    # loads embedding
    FLAGS.embed_path = FLAGS.embed_path or pjoin("data", "sgns.merge.word.npz")
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.txt")
    vocab, rev_vocab = initialize_vocab(vocab_path)  # one is a list, one is a dict

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(size=FLAGS.state_size, output_size=FLAGS.output_size)

    qa = QASystem(encoder, decoder, FLAGS)

    # log file
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # start training
    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)
        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, val_dataset, save_train_dir, rev_vocab)

def main(_):
    # TODO: maybe pass a loaded dataset abstraction instead of file paths?
    default_hparams = create_hparams(FLAGS)

    context_file_path = FLAGS.data_dir + '/train.ids.context'
    question_file_path = FLAGS.data_dir + '/train.ids.question'
    span_file_path = FLAGS.data_dir + '/train.span'
    dataset = (context_file_path, question_file_path, span_file_path)

    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)
    default_hparams.add_hparam('vocab_size', len(vocab))

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(state_size=FLAGS.state_size)

    qa = QASystem(encoder, decoder, default_hparams)

    # Setup embeddings
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    np_embeddings = np.float32(np.load(embed_path)['glove'])

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    session_config = tf.ConfigProto(log_device_placement=False,
                                    allow_soft_placement=True)
    session_config.gpu_options.allow_growth = True

    with tf.Session(config=session_config) as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)
        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir, np_embeddings)

        qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)

def run_func():
    config = Config()
    train = squad_dataset(config.question_train, config.context_train,
                          config.answer_train)
    dev = squad_dataset(config.question_dev, config.context_dev,
                        config.answer_dev)
    # print(config.question_train)

    embed_path = config.embed_path
    vocab_path = config.vocab_path
    # print(config.embed_path, config.vocab_path)
    vocab, rev_vocab = initialize_vocab(vocab_path)
    embeddings = get_trimmed_glove_vectors(embed_path)

    encoder = Encoder(config.hidden_size)
    decoder = Decoder(config.hidden_size)

    qa = QASystem(encoder, decoder, embeddings, config)

    with tf.Session() as sess:
        # ====== Load a pretrained model if it exists or create a new one if no pretrained available ======
        qa.initialize_model(sess, config.train_dir)

        # train process
        # qa.train(sess, [train, dev], config.train_dir)
        # em = qa.evaluate_model(sess, dev)

        # run process
        while True:
            question = input('please input question: ')
            if question == 'exit':
                break
            raw_context = input('please input context: ')
            if raw_context == 'exit':
                break

            question = [
                vocab[x] if x in vocab.keys() else 2 for x in question.split()
            ]
            context = [
                vocab[x] if x in vocab.keys() else 2
                for x in raw_context.split()
            ]
            test = [[question], [context], [[1, 2]]]

            a_s, a_e = qa.answer(sess, test)
            if a_e == a_s:
                print("answer: ", raw_context.split()[a_s[0]])
            else:
                print("answer: ",
                      ' '.join(raw_context.split()[a_s[0]:a_e[0] + 1]))

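# `get_trimmed_glove_vectors` (also used in the first three snippets) is not
# shown. A minimal sketch, assuming the trimmed embeddings were saved with
# np.savez under a "glove" key, as in the snippets that call np.load directly;
# the actual key and implementation may differ.
def get_trimmed_glove_vectors(filename):
    with np.load(filename) as data:
        return data["glove"]
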
def main(_):
    # Do what you need to load datasets from FLAGS.data_dir
    datasetTrain = initialize_datasets(FLAGS.data_dir, 'train.',
                                       debugMode=False)
    datasetVal = initialize_datasets(FLAGS.data_dir, 'val.', debugMode=False)
    datasetTrain.extend(datasetVal)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)

    # This is taking a long time
    tic = datetime.now()
    qa = QASystem(encoder, decoder, embed_path, FLAGS, rev_vocab)
    print('Time to setup the model: ', datetime.now() - tic)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    #saver = tf.train.Saver()

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        # Get directory to save model
        #save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        results_path = "results/{:%Y%m%d_%H%M%S}/".format(datetime.now())
        save_train_dir = results_path + "model.weights/"
        if not os.path.exists(save_train_dir):
            os.makedirs(save_train_dir)

        qa.train(sess, datasetTrain, save_train_dir)  #, saver)

        qa.evaluate_answer(sess, datasetVal, rev_vocab, sample=1000, log=True)

def main(_):
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = load_dataset()

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(state_size=FLAGS.state_size,
                      embedding_size=FLAGS.embedding_size,
                      output_size=FLAGS.output_size)
    decoder = Decoder(state_size=FLAGS.state_size,
                      output_size=FLAGS.output_size)

    qa_args = {
        "embed_path": embed_path,
        "embedding_size": FLAGS.embedding_size,
        "output_size": FLAGS.output_size,
        "optimizer": FLAGS.optimizer,
        "learning_rate": FLAGS.learning_rate,
        "epochs": FLAGS.epochs,
        "batch_size": FLAGS.batch_size,
        "max_gradient_norm": FLAGS.max_gradient_norm,
        "dropout_keep_prob": 1.0 - FLAGS.dropout,
        "train_dir": FLAGS.train_dir,
        "state_size": FLAGS.state_size
    }
    qa = QASystem(encoder, decoder, **qa_args)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)
        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)

def main(_): # Do what you need to load datasets from FLAGS.data_dir dataset = {"train": load_data(FLAGS.data_dir, mode="train"), \ "val": load_data(FLAGS.data_dir, mode="val")} embed_path = FLAGS.embed_path or pjoin( "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size)) vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat") vocab, rev_vocab = initialize_vocab(vocab_path) encoder = Encoder(state_size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size) matcher = Matcher(perspective_dim=25, input_size=FLAGS.state_size) # add flag decoder = Decoder(output_size=FLAGS.output_size, state_size=FLAGS.state_size, n_perspective_dim=50 * 2) # add flag qa = QASystem(encoder, matcher, decoder, \ vocab=vocab, vocab_dim=FLAGS.embedding_size, rev_vocab=rev_vocab, embed_path=embed_path) if not os.path.exists(FLAGS.log_dir): os.makedirs(FLAGS.log_dir) file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt")) logging.getLogger().addHandler(file_handler) print(vars(FLAGS)) with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout: json.dump(FLAGS.__flags, fout) with tf.Session() as sess: load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir or FLAGS.train_dir) initialize_model(sess, qa, load_train_dir) save_train_dir = get_normalized_train_dir(FLAGS.train_dir) tf.global_variables_initializer().run() graph_writer = tf.summary.FileWriter("qa-graph") graph_writer.add_graph(sess.graph) qa.train(sess, dataset, save_train_dir) qa.evaluate_answer(sess, dataset, 500, log=True)
def main(_):
    # Do what you need to load datasets from FLAGS.data_dir
    training_question_data_path = pjoin(FLAGS.data_dir, 'train.question')
    dataset = load_dataset(FLAGS.data_dir)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)
    pretrained_embeddings = np.load(embed_path)['glove']

    encoder = Encoder(size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size,
                      pretrained_embeddings=pretrained_embeddings,
                      max_question_length=FLAGS.max_question_length,
                      max_context_length=FLAGS.max_context_length)
    decoder = Decoder(output_size=FLAGS.output_size,
                      size=FLAGS.state_size,
                      max_context_length=FLAGS.max_context_length)

    qa = QASystem(encoder, decoder)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)
        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)

        qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)

def main(_):
    dataset = [
        load_dataset([
            "embedding/train.ids.question", "embedding/train.ids.context",
            "data/train.span"
        ]),
        load_dataset([
            "embedding/val.ids.question", "embedding/val.ids.context",
            "data/val.span"
        ])
    ]

    embed_path = pjoin("embedding",
                       "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    tf.reset_default_graph()
    encoder = Encoder(FLAGS.num_hidden_unit, tf.contrib.rnn.GRUCell)
    decoder = Decoder(FLAGS.num_hidden_unit, tf.contrib.rnn.BasicLSTMCell)
    embedding = np.load(embed_path)["glove"]
    qa = QASystem(encoder, decoder, embedding, FLAGS.keep_prob)

    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        initialize_model(sess, qa, FLAGS.train_dir)
        qa.train(sess, dataset, FLAGS.epochs, FLAGS.batch_size,
                 FLAGS.train_dir)

def main(_): # Do what you need to load datasets from FLAGS.data_dir dataset = None embed_path = FLAGS.embed_path or pjoin("data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size)) vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat") vocab, rev_vocab = initialize_vocab(vocab_path) with np.load(embed_path) as data: glove_embeddings = np.asfarray(data["glove"], dtype=np.float32) dataset = load_and_preprocess_data() # print(train_data) encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size, config=FLAGS) decoder = Decoder(output_size=FLAGS.output_size, config=FLAGS) qa = QASystem(encoder, decoder, embeddings=glove_embeddings, config=FLAGS, vocab=(vocab, rev_vocab)) if not os.path.exists(FLAGS.log_dir): os.makedirs(FLAGS.log_dir) file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt")) logging.getLogger().addHandler(file_handler) print(vars(FLAGS)) with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout: json.dump(FLAGS.__flags, fout) with tf.Session() as sess: load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir or FLAGS.train_dir) initialize_model(sess, qa, load_train_dir) save_train_dir = get_normalized_train_dir(FLAGS.train_dir) qa.train(sess, dataset, save_train_dir) qa.evaluate_answer(sess, FLAGS.evaluate, log=True)
def main(_):
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = None

    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.size)
    decoder = Decoder(output_size=FLAGS.output_size)

    qa = QASystem(encoder, decoder)

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize the
    # file paths saved in the checkpoint. This allows the model to be reloaded even
    # if the location of the checkpoint files has moved, allowing usage with CodaLab.
    # This must be done on both train.py and qa_answer.py in order to work.
    if os.path.exists(global_train_dir):
        os.unlink(global_train_dir)
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        initialize_model(sess, qa, train_dir)
        qa.train(sess, dataset)

        qa.evaluate_answer(sess, dataset, vocab, FLAGS.evaluate, log=True)

def run_func():
    config = Config()
    train = squad_dataset(config.question_train, config.context_train,
                          config.answer_train)
    dev = squad_dataset(config.question_dev, config.context_dev,
                        config.answer_dev)

    embed_path = config.embed_path
    vocab_path = config.vocab_path
    vocab, rev_vocab = initialize_vocab(vocab_path)
    embeddings = get_trimmed_glove_vectors(embed_path)

    encoder = Encoder(config.hidden_state_size)
    decoder = Decoder(config.hidden_state_size)

    qa = QASystem(encoder, decoder, embeddings, config)

    with tf.Session() as sess:
        # ====== Load a pretrained model if it exists or create a new one if no pretrained available ======
        qa.initialize_model(sess, config.train_dir)
        qa.train(sess, [train, dev], config.train_dir)

def main(_):
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = get_dataset(FLAGS.data_dir, FLAGS.max_question_size,
                          FLAGS.max_paragraph_size)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    FLAGS.embed_path = embed_path
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size,
                      FLAGS=FLAGS)
    decoder = Decoder(FLAGS=FLAGS)

    qa = QASystem(encoder, decoder, FLAGS)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        #load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir or FLAGS.train_dir)
        # Change these back for final submission
        load_train_dir = FLAGS.load_train_dir or FLAGS.train_dir
        print("load_train_dir: ", load_train_dir)
        initialize_model(sess, qa, load_train_dir)

        #save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        # Change back for final submission
        save_train_dir = FLAGS.train_dir
        print("save_train_dir: ", save_train_dir)
        qa.train(sess, dataset, save_train_dir, rev_vocab)

def main(_):
    FLAGS.config = int(sys.argv[1])
    load_config(current_config=FLAGS.config)

    # Do what you need to load datasets from FLAGS.data_dir
    dataset = load_data(FLAGS.data_dir)  # ((question, context), answer)
    train_data = preprocess_dataset(dataset['train'], FLAGS.output_size,
                                    FLAGS.question_size)
    val_data = preprocess_dataset(dataset['val'], FLAGS.output_size,
                                  FLAGS.question_size)
    # print(dataset)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)

    qa = QASystem(encoder, decoder)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(
        pjoin(FLAGS.log_dir, "log" + '_config_' + str(FLAGS.config) + ".txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir,
                           "flags" + '_config_' + str(FLAGS.config) + ".json"),
              'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)
        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, train_data, val_data, save_train_dir)

        qa.evaluate_answer(sess, train_data, val_data, FLAGS.evaluate,
                           log=True)

def main(_):
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = []
    dataset.append(pjoin(FLAGS.data_dir, "train.ids.question"))
    dataset.append(pjoin(FLAGS.data_dir, "train.ids.context"))
    dataset.append(pjoin(FLAGS.data_dir, "train.span"))
    dataset.append(pjoin(FLAGS.data_dir, "val.ids.question"))
    dataset.append(pjoin(FLAGS.data_dir, "val.ids.context"))
    dataset.append(pjoin(FLAGS.data_dir, "val.span"))

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.paragraph_output_size)

    qa = QASystem(encoder, decoder)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)
        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, save_train_dir)

def main(_):
    config = Config()
    # TODO: load dataset - look at how it is structured and change model.py accordingly
    dataset = None

    embed_path = config.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(config.embed_size))
    embeddingz = np.load(embed_path)
    embeddings = embeddingz['glove']
    embeddingz.close()

    vocab_len = embeddings.shape[0]

    train = load_squad(config.data_dir, "train", vocab_len, config.data_dir,
                       max_samples=config.max_train_samples)
    val = load_squad(config.data_dir, "val", vocab_len, config.data_dir,
                     max_samples=config.max_val_samples)
    print('train size: ', len(train), ' val size: ', len(val))

    vocab_path = config.vocab_path or pjoin(config.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)
    # print_samples(train, 1, rev_vocab)  # train is triplets of (context, question, answer)

    encoder = Encoder(state_size=config.hidden_size,
                      embedding_size=config.embed_size)
    decoder = Decoder(state_size=config.hidden_size,
                      embedding_size=config.embed_size)

    qa = QASystem(encoder, decoder)

    with tf.Session() as sess:
        # put "" here if you want to build a new model
        load_train_dir = (config.load_train_dir or config.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = config.train_dir
        ds_train = qa.pad_sequences(train)
        ret_q, ret_p, ret_labels = ds_train
        qa.train(sess, ds_train, save_train_dir)

        ds_val = qa.pad_sequences(val)
        print('train error')
        qa.evaluate_answer(sess, ds_train, log=True)
        print('val error')
        qa.evaluate_answer(sess, ds_val, log=True)

def main(_):
    # Do what you need to load datasets from FLAGS.data_dir
    dataset = {}
    question_max_len = 40
    context_max_len = 600

    # Preprocess and collect train data
    train_q_path = pjoin(FLAGS.data_dir, "train.ids.question")
    train_q_data, train_q_seq_len = pad_sentences(train_q_path,
                                                  question_max_len)
    assert not any(
        train_q_seq_len > train_q_data.shape[1]
    ), 'Some questions have length greater than max question length'

    train_c_path = pjoin(FLAGS.data_dir, "train.ids.context")
    train_c_data, train_c_seq_len = pad_sentences(train_c_path,
                                                  context_max_len)
    assert not any(
        train_c_seq_len > train_c_data.shape[1]
    ), 'Some contexts have length greater than max context length'

    train_s_path = pjoin(FLAGS.data_dir, "train.span")
    train_s_e_id = get_answer_span(train_s_path, context_max_len)

    dataset['train'] = [
        train_q_data, train_q_seq_len, train_c_data, train_c_seq_len,
        train_s_e_id
    ]

    # Preprocess and collect validation data
    val_q_path = pjoin(FLAGS.data_dir, "val.ids.question")
    val_q_data, val_q_seq_len = pad_sentences(val_q_path, question_max_len)
    val_c_path = pjoin(FLAGS.data_dir, "val.ids.context")
    val_c_data, val_c_seq_len = pad_sentences(val_c_path, context_max_len)
    val_s_path = pjoin(FLAGS.data_dir, "val.span")
    val_s_e_id = get_answer_span(val_s_path, context_max_len)

    dataset['val'] = [
        val_q_data, val_q_seq_len, val_c_data, val_c_seq_len, val_s_e_id
    ]

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = Encoder(state_size=FLAGS.state_size,
                      vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(state_size=FLAGS.state_size)

    qa = QASystem(encoder, decoder, question_max_len, context_max_len,
                  embed_path, FLAGS.learning_rate, FLAGS.batch_size,
                  FLAGS.dropout, FLAGS.optimizer,
                  FLAGS.max_gradient_norm)  # try without dropout

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)
        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)

        for i in range(FLAGS.epochs):
            qa.train(sess, dataset['train'], save_train_dir)
            # print('Finish training epoch {}'.format(i))
            qa.evaluate_answer(sess, dataset['val'])  # vocab, FLAGS.evaluate

def main(_):
    # Read the training paragraphs, questions and answer spans, pad each
    # paragraph and question to a fixed length, and keep the original lengths.
    # The same is then done for the validation set.
    paraFileName = pjoin(FLAGS.data_dir, "train") + ".ids.context"
    paraFile = open(paraFileName, "r")
    paras = [[int(a) for a in b.strip().split(" ") if a != '']
             for b in paraFile.read().strip().split("\n")]
    paras, paraLens = zip(*[padList(para, FLAGS.paraLen) for para in paras])

    questionFileName = pjoin(FLAGS.data_dir, "train") + ".ids.question"
    questionFile = open(questionFileName, "r")
    questions = [[int(a) for a in b.strip().split(" ") if a != '']
                 for b in questionFile.read().strip().split("\n")]
    questions, questionLens = zip(
        *[padList(question, FLAGS.qLen) for question in questions])

    ansFileName = pjoin(FLAGS.data_dir, "train") + ".span"
    ansFile = open(ansFileName, "r")
    ans = [[int(a) for a in b.strip().split(" ") if a != '']
           for b in ansFile.read().strip().split("\n")]

    paraFileNameVal = pjoin(FLAGS.data_dir, "val") + ".ids.context"
    paraFileVal = open(paraFileNameVal, "r")
    parasVal = [[int(a) for a in b.strip().split(" ") if a != '']
                for b in paraFileVal.read().strip().split("\n")]
    parasVal, paraLensVal = zip(
        *[padList(para, FLAGS.paraLen) for para in parasVal])

    questionFileNameVal = pjoin(FLAGS.data_dir, "val") + ".ids.question"
    questionFileVal = open(questionFileNameVal, "r")
    questionsVal = [[int(a) for a in b.strip().split(" ") if a != '']
                    for b in questionFileVal.read().strip().split("\n")]
    questionsVal, questionLensVal = zip(
        *[padList(question, FLAGS.qLen) for question in questionsVal])

    ansFileNameVal = pjoin(FLAGS.data_dir, "val") + ".span"
    ansFileVal = open(ansFileNameVal, "r")
    ansVal = [[int(a) for a in b.strip().split(" ") if a != '']
              for b in ansFileVal.read().strip().split("\n")]

    # Drop examples whose answer span falls outside the padded paragraph.
    train_remove = []
    val_remove = []
    for i in range(len(ans)):
        if ans[i][0] >= FLAGS.paraLen or ans[i][1] >= FLAGS.paraLen:
            train_remove.append(i)
    for i in range(len(ansVal)):
        # note: the original checked ans[i][1] here, which looks like a typo
        if ansVal[i][0] >= FLAGS.paraLen or ansVal[i][1] >= FLAGS.paraLen:
            val_remove.append(i)
    train_remove = list(reversed(train_remove))
    val_remove = list(reversed(val_remove))

    paras = list(paras)
    questions = list(questions)
    paraLens = list(paraLens)
    questionLens = list(questionLens)
    parasVal = list(parasVal)
    questionsVal = list(questionsVal)
    paraLensVal = list(paraLensVal)
    questionLensVal = list(questionLensVal)

    for ind in train_remove:
        paras.pop(ind)
        questions.pop(ind)
        paraLens.pop(ind)
        questionLens.pop(ind)
        ans.pop(ind)
    for ind in val_remove:
        parasVal.pop(ind)
        questionsVal.pop(ind)
        paraLensVal.pop(ind)
        questionLensVal.pop(ind)
        ansVal.pop(ind)

    paras = tuple(paras)
    questions = tuple(questions)
    paraLens = tuple(paraLens)
    questionLens = tuple(questionLens)
    parasVal = tuple(parasVal)
    questionsVal = tuple(questionsVal)
    paraLensVal = tuple(paraLensVal)
    questionLensVal = tuple(questionLensVal)

    # valDataset holds the padded paragraphs and questions, the start/end
    # answer indices, and the true paragraph/question lengths for the
    # validation set.
    valDataset = (parasVal, questionsVal, ansVal, paraLensVal,
                  questionLensVal)
    # dataset has the same format for the training set, with valDataset
    # appended as the last element.
    dataset = (paras, questions, ans, paraLens, questionLens, valDataset)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    encoder = CoattentionEncoder(size=FLAGS.state_size,
                                 vocab_dim=FLAGS.embedding_size)
    #encoder = Encoder(size=FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(output_size=FLAGS.output_size)
    #decoder = ChunkDecoder(output_size=FLAGS.output_size)

    embeddings = np.load(embed_path)['glove']
    #print(embeddings.keys())
    qa = QASystem(encoder, decoder, embeddings, vocab, rev_vocab)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir
                                                  or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir, saver)
        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)

        sample_rates = [1, 2, 5, 10]
        for rate in sample_rates:
            print("rate: ", rate)
            f1_l = []
            em_l = []
            for i in range(10):
                f1, em = qa.evaluate_answer(sess, valDataset,
                                            sample=len(valDataset[0]),
                                            sample_rate=rate)
                f1_l.append(f1)
                em_l.append(em)
                #print("f1: ", f1, "; em: ", em)
            print(rate)
            print(f1_l)
            print(em_l)

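# `padList` is another unshown helper. From its use above
# (`padded, length = padList(seq, max_len)`), a plausible sketch that pads or
# truncates a token id list to a fixed length and also returns the capped
# original length; the name of `pad_id` and its default are assumptions.
def padList(lst, max_len, pad_id=0):
    length = min(len(lst), max_len)
    padded = lst[:max_len] + [pad_id] * (max_len - length)
    return padded, length
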
def main(_):
    '''Check Config.py to set up the paths of the models to be ensembled.'''
    data_dir = cfg.DATA_DIR
    set_names = cfg.set_names
    suffixes = cfg.suffixes
    dataset = mask_dataset(data_dir, set_names, suffixes)
    raw_answers = read_raw_answers(data_dir)

    vocab_path = pjoin(data_dir, cfg.vocab_file)
    vocab, rev_vocab = initialize_vocab(vocab_path)

    if not os.path.exists(cfg.log_dir):
        os.makedirs(cfg.log_dir)
    if not os.path.exists(cfg.cache_dir):
        os.makedirs(cfg.cache_dir)
    if not os.path.exists(cfg.fig_dir):
        os.makedirs(cfg.fig_dir)

    c_time = time.strftime('%Y%m%d_%H%M', time.localtime())
    file_handler = logging.FileHandler(
        pjoin(cfg.log_dir, 'ensemble_log' + c_time + '.txt'))
    logging.getLogger().addHandler(file_handler)

    model_pathes = cfg.model_pathes
    num_m = len(model_pathes)

    train_s = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    train_e = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    val_s = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    val_e = np.zeros((cfg.num_eval, num_m), dtype=np.int32)

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    for i in xrange(num_m):
        tf.reset_default_graph()
        with tf.Session(config=config) as sess:
            encoder = Encoder(size=2 * cfg.lstm_num_hidden)
            decoder = Decoder(output_size=2 * cfg.lstm_num_hidden)
            qa = QASystem(encoder, decoder)

            init = tf.global_variables_initializer()
            sess.run(init)

            load_train_dir = get_normalized_train_dir(model_pathes[i])
            initialize_model(sess, qa, load_train_dir)

            # collect the predicted start/end indices of this member model
            ts, te, vs, ve = qa.evaluate_answer(sess, dataset, raw_answers,
                                                rev_vocab,
                                                log=True,
                                                ensemble=True,
                                                training=True,
                                                sample=cfg.num_eval)
            train_s[:, i] = ts
            train_e[:, i] = te
            val_s[:, i] = vs
            val_e[:, i] = ve

            if i == num_m - 1:
                # np.save('cache/ensemble.npy', [train_s, train_e, val_s, val_e])
                # aggregate the per-model predictions before the final evaluation
                train_s = bin_count(train_s)
                train_e = bin_count(train_e)
                val_s = bin_count(val_s)
                val_e = bin_count(val_e)
                qa.evaluate_answer(sess, dataset, raw_answers, rev_vocab,
                                   log=True,
                                   training=True,
                                   sendin=(train_s, train_e, val_s, val_e),
                                   sample=cfg.num_eval)