def process_dev_json_to_files():
    # dev_path example: data/squad/dev-v1.1.json
    download_prefix = os.path.dirname(os.path.abspath(FLAGS.dev_path))  # data/squad/
    dev_filename = os.path.basename(FLAGS.dev_path)  # "dev-v1.1.json"

    # relative paths to save the data
    print("Downloading datasets into {}".format(download_prefix))
    print("Preprocessing datasets into {}".format(FLAGS.data_dir))

    if not os.path.exists(download_prefix):
        os.makedirs(download_prefix)
    if not os.path.exists(FLAGS.data_dir):
        os.makedirs(FLAGS.data_dir)

    maybe_download(squad_base_url, dev_filename, download_prefix, None)

    # Read data from the dev JSON file
    dev_data = data_from_json(os.path.join(download_prefix, dev_filename))

    # Write data out to the FLAGS.data_dir location
    dev_num_questions, dev_num_answers = read_write_dataset(dev_data, 'dev', FLAGS.data_dir)
    dev_path = os.path.join(FLAGS.data_dir, "dev")

    # Generate token ids for the context and question files
    x_dev_dis_path = dev_path + ".ids.context"
    y_dev_ids_path = dev_path + ".ids.question"
    qa_data.data_to_token_ids(dev_path + ".context", x_dev_dis_path, FLAGS.vocab_path)
    qa_data.data_to_token_ids(dev_path + ".question", y_dev_ids_path, FLAGS.vocab_path)
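# The function above reads module-level FLAGS (dev_path, data_dir, vocab_path) defined
# elsewhere in the script. A minimal sketch, assuming a TF1-style flags setup; the flag
# names come from the function body, but the default values here are illustrative
# assumptions, not taken from the original:
import tensorflow as tf
tf.app.flags.DEFINE_string("dev_path", "data/squad/dev-v1.1.json", "Path to the dev JSON file.")
tf.app.flags.DEFINE_string("data_dir", "data/squad", "Directory for preprocessed output files.")
tf.app.flags.DEFINE_string("vocab_path", "data/squad/vocab.dat", "Path to the vocabulary file.")
FLAGS = tf.app.flags.FLAGS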
def prepare_dev(prefix, dev_filename, vocab):
    # Don't check file size, since we could be using other datasets
    dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)

    dev_data = data_from_json(os.path.join(prefix, dev_filename))
    context_data, question_data, question_uuid_data, context_text = read_dataset(dev_data, 'dev', vocab)

    return context_data, question_data, question_uuid_data, context_text
def prepare_dev(prefix, dev_filename, vocab):
    # Don't check file size, since we could be using other datasets
    dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)

    dev_data = data_from_json(os.path.join(prefix, dev_filename))
    context_data, question_data, question_uuid_data = read_dataset(dev_data, 'dev', vocab)

    return context_data, question_data, question_uuid_data
def prepare_dev(prefix, dev_filename, vocab, download=False):
    print("Downloading {}".format(dev_filename))
    # Don't check file size, since we could be using other datasets
    if download:
        dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)
        dev_data = data_from_json(os.path.join(prefix, dev_filename))
    else:
        # Read a local JSON file directly without downloading
        dev_data = data_from_json(dev_filename)
    context_data, question_data, question_uuid_data = read_dataset(dev_data, 'dev', vocab)

    return context_data, question_data, question_uuid_data
def prepare_dev(prefix, dev_filename, vocab):
    dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)
    dev_data = data_from_json(os.path.join(prefix, dev_filename))
    context_data, question_data, question_uuid_data = read_dataset(dev_data, 'dev', vocab)

    # Convert the whitespace-joined id strings into nested sequences of ints.
    # Note: under Python 3, map() returns lazy iterators rather than lists.
    def normalize(dat):
        return map(lambda tok: map(int, tok.split()), dat)

    context_data = normalize(context_data)
    question_data = normalize(question_data)

    return context_data, question_data, question_uuid_data
def prepare_dev(prefix, dev_filename, vocab):
    # Don't check file size, since we could be using other datasets
    dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)

    dev_data = data_from_json(os.path.join(prefix, dev_filename))

    # remove answer
    # if FLAGS.eval_on_train:
    #     context_tokens_data, context_data, question_tokens_data, question_data, \
    #         question_uuid_data, s_labels, e_labels, true_answers = read_dataset(dev_data, 'train', vocab)
    #     return context_tokens_data, context_data, question_tokens_data, question_data, \
    #         question_uuid_data, s_labels, e_labels, true_answers

    context_tokens_data, context_data, question_tokens_data, question_data, question_uuid_data = read_dataset(dev_data, 'dev', vocab)

    return context_tokens_data, context_data, question_tokens_data, question_data, question_uuid_data
def prepare_dev(prefix, dev_filename, vocab):
    # Don't check file size, since we could be using other datasets
    dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)
    dev_data = data_from_json(os.path.join(prefix, dev_filename))
    context_data, question_data, question_uuid_data = read_dataset(dev_data, 'dev', vocab)

    def normalize(dat):
        return list(map(lambda tok: list(map(int, tok.split())), dat))

    context_data = normalize(context_data)
    question_data = normalize(question_data)

    return context_data, question_data, question_uuid_data
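# Quick illustration (toy input, not from the original script) of what normalize does in
# the variant above: each whitespace-joined id string becomes a list of ints, and the
# list(...) wrappers keep the result indexable and reusable under Python 3, where a bare
# map() would return a one-shot iterator.
sample = ["12 7 0", "3 3 9"]
assert list(map(lambda tok: list(map(int, tok.split())), sample)) == [[12, 7, 0], [3, 3, 9]]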
def expand_vocab(prefix, dev_filename, vocab, embd, raw_glove, raw_glove_vocab):
    # Don't check file size, since we could be using other datasets
    dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)

    dev_data = data_from_json(os.path.join(prefix, dev_filename))
    # context_data, question_data, question_uuid_data = read_dataset(dev_data, 'dev', vocab)

    dataset = dev_data
    context_data = []
    query_data = []
    question_uuid_data = []
    tier = 'dev'
    new_vocab = {}
    found = 0
    notfound = 0

    # Count every dev-set token that is missing from the training vocabulary
    for articles_id in tqdm(range(len(dataset['data'])), desc="Preprocessing {}".format(tier)):
        article_paragraphs = dataset['data'][articles_id]['paragraphs']
        for pid in range(len(article_paragraphs)):
            context = article_paragraphs[pid]['context']
            # The following replacements are suggested in the paper
            # BidAF (Seo et al., 2016)
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            context_tokens = tokenize(context)

            qas = article_paragraphs[pid]['qas']
            for qid in range(len(qas)):
                question = qas[qid]['question']
                question_tokens = tokenize(question)
                question_uuid = qas[qid]['id']

                for w in context_tokens:
                    if w not in vocab:
                        new_vocab[w] = new_vocab.get(w, 0) + 1
                        notfound += 1
                    else:
                        found += 1
                for w in question_tokens:
                    if w not in vocab:
                        new_vocab[w] = new_vocab.get(w, 0) + 1
                        notfound += 1
                    else:
                        found += 1

    print('found/not found: {}/{}, {}% not found'.format(
        found, notfound, 100 * notfound / float(found + notfound)))
    print('New vocabulary:', len(new_vocab))

    # Append the unseen words to the vocabulary and rebuild the reverse lookup
    vocab_list = list(vocab.items())
    vn = len(vocab_list)
    for i, word in enumerate(new_vocab):
        vocab_list.append((word, vn + i))
    vocab = dict(vocab_list)
    rev_vocab = dict([(x, y) for (y, x) in vocab_list])

    # Expand the embedding matrix: keep the existing rows, initialize new rows randomly,
    # then overwrite them with raw GloVe vectors when a (case-variant) match exists
    _, dim = embd.shape
    new_glove = np.random.randn(len(vocab), dim)
    new_glove[:vn, :] = embd

    found = 0
    for i in range(vn, vn + len(new_vocab)):
        word = vocab_list[i][0]
        if word in raw_glove_vocab:
            found += 1
            idx = raw_glove_vocab[word]
            new_glove[i, :] = raw_glove[idx, :]
        if word.capitalize() in raw_glove_vocab:
            found += 1
            idx = raw_glove_vocab[word.capitalize()]
            new_glove[i, :] = raw_glove[idx, :]
        if word.upper() in raw_glove_vocab:
            found += 1
            idx = raw_glove_vocab[word.upper()]
            new_glove[i, :] = raw_glove[idx, :]

    print("{} unseen words found embeddings".format(found))

    return vocab, rev_vocab, new_glove
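# Self-contained sketch of the embedding-expansion step performed at the end of
# expand_vocab, on toy data. All names and values here are illustrative assumptions,
# not objects from the original script: new rows start as random vectors and are
# overwritten with a raw GloVe vector whenever the word is found in the GloVe lookup.
import numpy as np

base_vocab = {"the": 0, "cat": 1}                 # word -> id for the existing vocabulary
base_embd = np.zeros((2, 4))                      # pretrained vectors aligned with base_vocab
glove_vocab = {"dog": 0}                          # full GloVe lookup: word -> row index
glove = np.ones((1, 4))                           # full GloVe matrix

new_words = ["dog", "zzyzx"]                      # OOV words collected from the dev set
expanded = np.random.randn(len(base_vocab) + len(new_words), 4)
expanded[:len(base_vocab), :] = base_embd         # keep the existing rows unchanged
for i, w in enumerate(new_words, start=len(base_vocab)):
    if w in glove_vocab:                          # copy a real vector when GloVe has one,
        expanded[i, :] = glove[glove_vocab[w], :] # otherwise the random init remains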