Example #1
import os
import pickle
from preprocessing.preprocess import answer_span_to_indices

# custom imports
from preprocessing.dataset import Dataset
from network.config import CONFIG
from network.build_model import get_batch
from evaluation_metrics import get_f1_from_tokens, get_exact_match_from_tokens
# Suppress tensorflow verboseness
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

print("Starting testing on dev file...")
D = Dataset(CONFIG.EMBEDDING_FILE)
index2embedding = D.index2embedding
padded_data, (max_length_question,
              max_length_context) = D.load_questions('data/dev.json')
print("Loaded data")

# group the padded data by the first word of each question
split_data_pre = dict()
for qas in padded_data:
    first_word = D.index2word[qas["question"][0]].lower()
    if first_word not in split_data_pre:
        split_data_pre[first_word] = []
    split_data_pre[first_word].append(qas)

# extract the groups that contain more than one batch of examples
split_data = dict()
print("First word frequency:")
for key in split_data_pre.keys():
    if len(split_data_pre[key]) > CONFIG.BATCH_SIZE:
        # the snippet is truncated here; keeping the group and printing its
        # frequency is an assumed minimal completion
        split_data[key] = split_data_pre[key]
        print(key, ":", len(split_data_pre[key]))
Example #2
import sys
import numpy as np
import tensorflow as tf
import pickle
from functools import reduce
import os
import __init__  # assumed: project-local module exposing root_path (used below)
# custom imports
from network.config import CONFIG
from network.build_model import build_model_v2, get_feed_dict, get_batch
from evaluation_metrics import get_f1_from_tokens, get_exact_match_from_tokens
from preprocessing.dataset import Dataset

D = Dataset(CONFIG.EMBEDDING_FILE)
index2embedding = D.index2embedding
padded_data, (max_length_question,
              max_length_context) = D.load_questions(CONFIG.QUESTION_FILE_V2)
print("Loaded data")

tf.reset_default_graph()
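# define the embedding matrix as a placeholder so the (large) pretrained
# matrix can be fed in at session run time instead of baked into the graph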
embedding = tf.placeholder(
    shape=[len(index2embedding), CONFIG.EMBEDDING_DIMENSION],
    dtype=tf.float32,
    name='embedding_ph')
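# build_model_v2 is assumed to return the training op, the loss tensor, and
# the predicted answer-span start (s) and end (e) positions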
train_op, loss, s, e = build_model_v2(embedding)

# blank the per-batch training-loss CSV before logging
root_path = __init__.root_path
results_path = root_path + '/resultsv2'
model_path = root_path + '/modelv2'
open(results_path + '/training_loss_per_batch.csv', 'w').close()
Example #3
import os
import numpy as np

from preprocessing.dataset import Dataset
from network.config import CONFIG
from network.build_model import get_batch
from evaluation_metrics import get_f1_from_tokens, get_exact_match_from_tokens

# Suppress tensorflow verboseness
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

D = Dataset(CONFIG.EMBEDDING_FILE)
index2embedding = D.index2embedding
#padded_data_squad1, (max_length_question, max_length_context) = D.load_questions('data/train.json')
#padded_data_validation = padded_data_squad1[(int) (CONFIG.TRAIN_PERCENTAGE*len(padded_data_squad1)):]
#untrained_contexts = [x["context"] for x in padded_data_validation]
#print("Loaded data from squad one")

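# load SQuAD 2.0 training data; unanswerable questions are marked with
# answer_start == -1 (see the disabled filtering block below)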
padded_data_squad2, (max_length_question_squad2,
                     max_length_context_squad2) = D.load_questions('data/train-v2.0.json')
print("padded_data_squad2.len =", len(padded_data_squad2))
print("Max length from SQuAD 2 q and c:", max_length_question_squad2, max_length_context_squad2)
print("Loaded data from SQuAD 2")
'''
padded_data_untrained = [x for x in padded_data_squad2 if x["context"] in untrained_contexts]
unanswerable_data = [x for x in padded_data_untrained if x["answer_start"]==-1]
answerable_data = [x for x in padded_data_untrained if x["answer_start"]>=0]
print("Number of unanswerable questions: ",len(unanswerable_data))
print("Number of answerable questions: ", len(answerable_data))

padded_data = np.array(padded_data_untrained)
'''
padded_data = np.array(padded_data_squad2)
# keep the held-out tail after the training split point
padded_data = padded_data[int(CONFIG.TRAIN_PERCENTAGE * len(padded_data_squad2)):]
print(padded_data.shape)