import os
import pickle

# custom imports
from preprocessing.preprocess import answer_span_to_indices
from preprocessing.dataset import Dataset
from network.config import CONFIG
from network.build_model import get_batch
from evaluation_metrics import get_f1_from_tokens, get_exact_match_from_tokens

# Suppress tensorflow verboseness
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

print("Starting testing on dev file...")

D = Dataset(CONFIG.EMBEDDING_FILE)
index2embedding = D.index2embedding
padded_data, (max_length_question, max_length_context) = D.load_questions('data/dev.json')
print("Loaded data")

# Split the padded data by the first word of each question
# (e.g. "what", "who", "when"), so results can be reported per question type
split_data_pre = dict()
for qas in padded_data:
    first_word = D.index2word[qas["question"][0]].lower()
    if first_word not in split_data_pre:
        split_data_pre[first_word] = []
    split_data_pre[first_word].append(qas)

# Keep only the question types with more examples than one batch
split_data = dict()
print("First word frequency:")
for key in split_data_pre.keys():
    if len(split_data_pre[key]) > CONFIG.BATCH_SIZE:
        split_data[key] = split_data_pre[key]
        print(key, len(split_data_pre[key]))
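# The loop above only buckets the dev questions by type; scoring each bucket
# would follow. A minimal sketch of a span-overlap F1, written as a
# self-contained stand-in because the exact signature of the repo's
# get_f1_from_tokens is not shown in this fragment; pred_start/pred_end are
# assumed to come from a restored model's predictions:
def span_f1(true_start, true_end, pred_start, pred_end):
    """Token-level F1 between a gold answer span and a predicted span."""
    gold = set(range(true_start, true_end + 1))
    pred = set(range(pred_start, pred_end + 1))
    overlap = len(gold & pred)
    if overlap == 0:
        return 0.0
    precision = overlap / len(pred)
    recall = overlap / len(gold)
    return 2 * precision * recall / (precision + recall)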
import os
import sys
import pickle
from functools import reduce

import numpy as np
import tensorflow as tf

# custom imports
import __init__  # repo-level __init__.py, assumed to define root_path (used below)
from network.config import CONFIG
from network.build_model import build_model_v2, get_feed_dict, get_batch
from evaluation_metrics import get_f1_from_tokens, get_exact_match_from_tokens
from preprocessing.dataset import Dataset

D = Dataset(CONFIG.EMBEDDING_FILE)
index2embedding = D.index2embedding
padded_data, (max_length_question, max_length_context) = D.load_questions(CONFIG.QUESTION_FILE_V2)
print("Loaded data")

tf.reset_default_graph()
embedding = tf.placeholder(
    shape=[len(index2embedding), CONFIG.EMBEDDING_DIMENSION],
    dtype=tf.float32,
    name='embedding_ph')
train_op, loss, s, e = build_model_v2(embedding)

# Start from a blank per-batch loss csv file
root_path = __init__.root_path
results_path = root_path + '/resultsv2'
model_path = root_path + '/modelv2'
open(results_path + '/training_loss_per_batch.csv', 'w').close()
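# The csv blanked above is appended to once per batch during training.
# A minimal sketch of that logging step (log_batch_loss is a hypothetical
# helper, not part of the repo; loss_value is assumed to be the float
# returned by sess.run on the loss tensor):
def log_batch_loss(csv_path, epoch, batch_index, loss_value):
    """Append one 'epoch,batch,loss' row to the per-batch loss csv."""
    with open(csv_path, 'a') as f:
        f.write("{},{},{}\n".format(epoch, batch_index, loss_value))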
import os

import numpy as np

# custom imports
from preprocessing.dataset import Dataset
from network.config import CONFIG
from network.build_model import get_batch
from evaluation_metrics import get_f1_from_tokens, get_exact_match_from_tokens

# Suppress tensorflow verboseness
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

D = Dataset(CONFIG.EMBEDDING_FILE)
index2embedding = D.index2embedding

#padded_data_squad1, (max_length_question, max_length_context) = D.load_questions('data/train.json')
#padded_data_validation = padded_data_squad1[int(CONFIG.TRAIN_PERCENTAGE * len(padded_data_squad1)):]
#untrained_contexts = [x["context"] for x in padded_data_validation]
#print("Loaded data from squad one")

padded_data_squad2, (max_length_question_squad2, max_length_context_squad2) = D.load_questions('data/train-v2.0.json')
print("padded_data_squad2.len = ", len(padded_data_squad2))
print("Max length from squad 2 q and c: ", max_length_question_squad2, max_length_context_squad2)
print("Loaded data from squad two")

'''
padded_data_untrained = [x for x in padded_data_squad2 if x["context"] in untrained_contexts]
unanswerable_data = [x for x in padded_data_untrained if x["answer_start"] == -1]
answerable_data = [x for x in padded_data_untrained if x["answer_start"] >= 0]
print("Number of unanswerable questions: ", len(unanswerable_data))
print("Number of answerable questions: ", len(answerable_data))
padded_data = np.array(padded_data_untrained)
'''

# Keep only the tail of the dataset that was held out from training
padded_data = np.array(padded_data_squad2)
padded_data = padded_data[int(CONFIG.TRAIN_PERCENTAGE * len(padded_data_squad2)):]
print(padded_data.shape)
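# A minimal sketch of the train/held-out split the slicing above implies,
# assuming CONFIG.TRAIN_PERCENTAGE is a fraction in [0, 1]
# (split_train_holdout is a hypothetical helper, not part of the repo):
def split_train_holdout(data, train_percentage):
    """Return (train, holdout); everything past the split point is held out."""
    cut = int(train_percentage * len(data))
    return data[:cut], data[cut:]

# Equivalent to the slicing above:
# _, padded_data = split_train_holdout(padded_data_squad2, CONFIG.TRAIN_PERCENTAGE)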