def create_vocabulary(self):
    base_path = self.base_path
    tokens_list = []
    train_dataset = self.read_raw_lines('%s/train' % base_path)
    validation_dataset = self.read_raw_lines('%s/validation' % base_path)
    test_dataset = self.read_raw_lines('%s/test' % base_path)
    for dataset in [train_dataset, validation_dataset, test_dataset]:
        inputs = dataset['inputs']
        outputs = dataset['outputs']
        for inp in inputs:
            tokens = tokenizer.split_sentence(
                inp, tokenizer_type=constants.TOKENIZER_NLTK)
            tokens_list.extend(tokens)
        for out in outputs:
            tokens = tokenizer.split_sentence(
                out, tokenizer_type=constants.TOKENIZER_NLTK)
            tokens_list.extend(tokens)
    vocab = utils.unique_vals(tokens_list, min_num_occurences=0)
    utils.save_lines(vocab, '%s/vocab.txt' % base_path)
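# A minimal sketch of what the two `utils` helpers used above are assumed
# to do; the real implementations live in the repo's helpers module and may
# differ (the `min_num_occurences` spelling follows the call site above).
from collections import Counter

def unique_vals(tokens_list, min_num_occurences=0):
    # Keep each distinct token whose frequency meets the threshold.
    counts = Counter(tokens_list)
    return [token for token, count in counts.items()
            if count >= min_num_occurences]

def save_lines(lines, path):
    # Write one entry per line of the output file.
    with open(path, 'w') as f:
        for line in lines:
            f.write('%s\n' % line)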
def create_dataset(save_dir, data_path, shared_path):
    print("Loading data from path %s" % data_path)
    data = json.load(open(data_path))
    print("Done loading data")
    shared_data = json.load(open(shared_path))
    print("Done loading shared data from path %s" % shared_path)

    def count_sums(up_to_idx):
        # Total number of paragraphs in articles 0..up_to_idx - 1.
        total_len = 0
        for i in range(0, up_to_idx):
            total_len += len(shared_data['x'][i])
        return total_len

    idxs = []
    xs = []
    answer_starts = []
    answer_ends = []
    indices = []
    questions = []
    for i in range(len(shared_data['x'])):
        print("On %s of %s" % (i, len(shared_data['x'])))
        for j in range(len(shared_data['x'][i])):
            cur_tokens = shared_data['x'][i][j][0]
            cur_text = " ".join(cur_tokens)
            # Extract candidate answer spans from the paragraph text.
            cur_ans_starts, cur_ans_ends = spacy_tokenizer.extract_phrases(
                cur_text, 2)
            answer_starts.extend([str(ans) for ans in cur_ans_starts])
            answer_ends.extend([str(ans) for ans in cur_ans_ends])
            idxs.extend(range(len(idxs), len(idxs) + len(cur_ans_starts)))
            # No gold questions exist for this unsupervised data; write a
            # placeholder token instead.
            questions.extend(["<NONE>"] * len(cur_ans_starts))
            # Every span from this paragraph points at the same input line.
            indices.extend([str(len(xs))] * len(cur_ans_starts))
            xs.append('\t'.join(cur_tokens))
    idxs = [str(idx) for idx in idxs]
    utils.save_lines(idxs, '%s/ids.txt' % save_dir)
    utils.save_lines(questions, '%s/outputs.txt' % save_dir)
    utils.save_lines(answer_starts, '%s/answer_starts.txt' % save_dir)
    utils.save_lines(answer_ends, '%s/answer_ends.txt' % save_dir)
    utils.save_lines(xs, '%s/inputs.txt' % save_dir)
    utils.save_lines(indices, '%s/indices.txt' % save_dir)
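# Hypothetical invocation of create_dataset; the paths below are placeholders,
# and shared_path is assumed to hold BiDAF-style preprocessed JSON in which
# shared_data['x'][article][paragraph][0] is a token list, as indexed above.
create_dataset(
    save_dir='datasets/newsqa_unsupervised',
    data_path='datasets/newsqa/data_train.json',
    shared_path='datasets/newsqa/shared_train.json')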
def save_predictions(self, epoch_num, predictions):
    save_dir = self.config['save_directory']
    save_path = '%s/%s' % (save_dir, 'predictions_%s' % epoch_num)
    utils.save_lines(predictions, save_path)
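# Hypothetical usage from inside the trainer: writes one prediction per line
# to '<save_directory>/predictions_4'.
# self.save_predictions(epoch_num=4, predictions=['span one', 'span two'])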
# NOTE: assumed module paths for the first three imports, mirroring
# models.language_wrapper; adjust to wherever these classes actually live.
from models.language_model_loader import LanguageModelLoaderTruncate
from models.language_trainer import LanguageTrainer
from models.language_wrapper import LanguageWrapper
from helpers import constants, torch_utils, utils
from torch.autograd import Variable

dataset_path = 'datasets/newsqa'
load_path = 'logs/squad_saved_data_truncated/model_2.pyt7'

language_model_loader = LanguageModelLoaderTruncate(
    dataset_path, tokenizer_type=constants.TOKENIZER_TAB)
language_model = torch_utils.load_model(load_path).cuda()
language_model.config['save_directory'] = 'logs/newsqa_saved_data'
language_wrapper = LanguageWrapper(language_model,
                                   language_model_loader.get_vocab())
language_trainer = LanguageTrainer(language_model.config, language_wrapper,
                                   language_model_loader)

# test_predictions = language_trainer.predict(dataset_type=constants.DATASET_TEST,
#                                             epoch_num=10, max_length=20)
# dev_predictions = language_trainer.predict(dataset_type=constants.DATASET_VALIDATION,
#                                            epoch_num=10, max_length=10)
train_predictions = language_trainer.predict(
    dataset_type=constants.DATASET_TRAIN, epoch_num=10, max_length=10)
utils.save_lines(
    train_predictions,
    'logs/newsqa_saved_data/dummy5_train_predictions_epoch_6.txt')
# The dev/test saves are disabled to match the commented-out predict calls
# above; running them as written would raise a NameError.
# utils.save_lines(
#     dev_predictions,
#     'logs/newsqa_saved_data/dummy5_validation_predictions_epoch_6.txt')
# utils.save_lines(
#     test_predictions,
#     'logs/newsqa_saved_data/dummy5_test_predictions_epoch_6.txt')
from models.language_wrapper import LanguageWrapper
# NOTE: assumed module paths for the loader and trainer imports, mirroring
# models.language_wrapper; adjust to wherever these classes actually live.
from models.language_model_loader import LanguageModelLoader
from models.language_trainer import LanguageTrainer
from helpers import constants
import torch
from helpers import torch_utils, utils
from torch.autograd import Variable

dataset_path = 'datasets/newsqa_unsupervised_large_verb_filtered'
load_path = 'logs/squad_saved_data/model_8.pyt7'  # CHANGE THIS TO WHATEVER YOU WANT

language_model_loader = LanguageModelLoader(
    dataset_path, tokenizer_type=constants.TOKENIZER_TAB)
language_model = torch_utils.load_model(load_path).cuda()
language_model.config['save_directory'] = 'logs/newsqa_unsupervised_saved_data'
language_wrapper = LanguageWrapper(language_model,
                                   language_model_loader.get_vocab())
language_trainer = LanguageTrainer(language_model.config, language_wrapper,
                                   language_model_loader)

# test_predictions = language_trainer.predict(dataset_type=constants.DATASET_TEST,
#                                             epoch_num=10, max_length=20, beam_size=5)
# dev_predictions = language_trainer.predict(dataset_type=constants.DATASET_VALIDATION,
#                                            epoch_num=10, max_length=10, beam_size=5)
train_predictions = language_trainer.predict(
    dataset_type=constants.DATASET_TRAIN, epoch_num=10, max_length=10)
utils.save_lines(train_predictions,
                 'logs/newsqa_saved_data/train_predictions_epoch_6.txt')
# utils.save_lines(dev_predictions, 'logs/newsqa_saved_data/validation_predictions_epoch_6.txt')
# utils.save_lines(test_predictions, 'logs/newsqa_saved_data/test_predictions_epoch_6.txt')
    input_max_length=2100)
config = utils.load_json(config_path)
config['batch_size'] = 25
config['input_max_length'] = data_loader.input_max_length
model = IOBModel(config, embeddings=embeddings)
model.restore(params_path)

data_loader.reset_indices()
total_predictions = []
num_steps = 0
while True:
    batch = data_loader.get_batch(constants.DATASET_TRAIN,
                                  config['batch_size'])
    num_steps += config['batch_size']
    print(num_steps)
    if batch is None:
        break
    predictions = model.predict(batch)
    texts = data_loader.label_vocab.tokens_list(predictions)
    for i in range(0, len(texts)):
        cur_input_length = batch['input_lengths'][i]
        cur_text = texts[i]
        # Drop padding positions before joining the predicted labels.
        text_str = " ".join(cur_text[0:cur_input_length])
        total_predictions.append(text_str)
utils.save_lines(total_predictions, predictions_save_path)
loss, predictions = model.forward(batch)
batch = data_loader.get_batch(constants.DATASET_TRAIN, config['batch_size'])
print(predictions)
print(loss)

# Every third epoch, run the model over the test split and save predictions.
if i % 3 == 0:
    data_loader.reset_indices()
    total_predictions = []
    while True:
        batch = data_loader.get_batch(constants.DATASET_TEST,
                                      config['batch_size'])
        if batch is None:
            break
        predictions = model.predict(batch)
        texts = data_loader.label_vocab.tokens_list(predictions)
        # Use j for the inner index so the epoch counter i is not shadowed.
        for j in range(0, len(texts)):
            cur_input_length = batch['input_lengths'][j]
            cur_text = texts[j]
            text_str = " ".join(cur_text[0:cur_input_length])
            total_predictions.append(text_str)
    utils.save_lines(total_predictions,
                     '%s/predictions_test_%s.txt' % (config['save_path'], i))
    data_loader.mix_indices()
    batch = data_loader.get_batch(constants.DATASET_TRAIN,
                                  config['batch_size'])
import numpy as np

def save_batch_results(batch_list, save_dir='results', f1_thres=0.1):
    # Collect examples where the best achievable F1 beats the predicted F1
    # by more than f1_thres, then dump them alongside summary statistics.
    q = []
    answerss = []
    x = []
    predicted_answers = []
    best_answers = []
    best_f1_scores = []
    predicted_f1_scores = []
    total_best_f1_scores = []
    total_predicted_f1_scores = []
    for batch in batch_list:
        total_best_f1_scores.extend(batch['best_f1_scores'])
        total_predicted_f1_scores.extend(batch['predicted_f1_scores'])
        for i in range(0, len(batch['answerss'])):
            if batch['best_f1_scores'][i] - batch['predicted_f1_scores'][i] > f1_thres:
                x.append((' '.join(batch['x'][i][0])).rstrip('\r\n'))
                predicted_answers.append(batch['predicted_answers'][i])
                best_answers.append(batch['best_answers'][i])
                best_f1_scores.append(str(batch['best_f1_scores'][i]))
                predicted_f1_scores.append(str(batch['predicted_f1_scores'][i]))
                answerss.append(list(map(lambda ans: ans.rstrip('\r\n'),
                                         batch['answerss'][i])))
                q.append(batch['q'][i])
    avg_best_f1_scores = np.mean(total_best_f1_scores)
    avg_predicted_f1_scores = np.mean(total_predicted_f1_scores)
    score_msg = ["PREDICTED %s BEST %s NUM_SAMPLES %s NUM_TOTAL %s" %
                 (avg_predicted_f1_scores, avg_best_f1_scores,
                  len(best_answers), len(total_best_f1_scores))]
    utils.save_lines(score_msg, "%s/f1_score_comparison.txt" % save_dir)
    utils.save_lines(best_f1_scores, "%s/best_f1_scores.txt" % save_dir)
    utils.save_lines(predicted_f1_scores,
                     "%s/predicted_f1_scores.txt" % save_dir)
    utils.save_lines(best_answers, "%s/best_answers.txt" % save_dir)
    utils.save_lines(predicted_answers, "%s/predicted_answers.txt" % save_dir)
    utils.save_lines(x, "%s/paragraphs.txt" % save_dir)
    utils.save_tabbed_lines(answerss, "%s/gold_answers.txt" % save_dir)
    utils.save_tabbed_lines(q, "%s/questions.txt" % save_dir)
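# Hypothetical single-batch call illustrating the dict layout that
# save_batch_results expects; all values below are made-up toy data.
toy_batch = {
    'x': [[['The', 'cat', 'sat', 'on', 'the', 'mat', '.']]],  # tokenized paragraph
    'q': ['Where did the cat sit ?'],                          # question text
    'answerss': [['on the mat']],                              # gold answers
    'predicted_answers': ['the mat'],
    'best_answers': ['on the mat'],
    'best_f1_scores': [1.0],
    'predicted_f1_scores': [0.8],
}
# 1.0 - 0.8 > 0.1, so this example passes the threshold and is written
# out under results/ along with the aggregate F1 comparison.
save_batch_results([toy_batch], save_dir='results', f1_thres=0.1)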