def read_dataset(dataset, tier, vocab):
    """Reads the dataset and extracts the context, question and question-uuid data.
    Returns the space-joined token-id strings for the contexts and questions,
    plus the question uuids."""
    context_data = []
    query_data = []
    question_uuid_data = []

    for articles_id in tqdm(range(len(dataset['data'])), desc="Preprocessing {}".format(tier)):
        article_paragraphs = dataset['data'][articles_id]['paragraphs']
        for pid in range(len(article_paragraphs)):
            context = article_paragraphs[pid]['context']
            # The following replacements are suggested in the paper
            # BidAF (Seo et al., 2016)
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            context_tokens = tokenize(context)

            qas = article_paragraphs[pid]['qas']
            for qid in range(len(qas)):
                question = qas[qid]['question']
                question_tokens = tokenize(question)
                question_uuid = qas[qid]['id']

                context_ids = [str(vocab.get(w, qa_data.UNK_ID)) for w in context_tokens]
                question_ids = [str(vocab.get(w, qa_data.UNK_ID)) for w in question_tokens]

                context_data.append(' '.join(context_ids))
                query_data.append(' '.join(question_ids))
                question_uuid_data.append(question_uuid)

    return context_data, query_data, question_uuid_data
def preprocess_dataset(dataset):
    qn_uuid_data = []
    context_token_data = []
    qn_token_data = []

    for articles_id in tqdm(range(len(dataset['data'])), desc="Preprocessing data"):
        article_paragraphs = dataset['data'][articles_id]['paragraphs']
        for pid in range(len(article_paragraphs)):
            context = unicode(article_paragraphs[pid]['context'])  # string
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            context_tokens = tokenize(context)
            context = context.lower()

            qas = article_paragraphs[pid]['qas']
            for qn in qas:
                question = unicode(qn['question'])
                question_tokens = tokenize(question)
                question_uuid = qn['id']

                qn_uuid_data.append(question_uuid)
                context_token_data.append(context_tokens)
                qn_token_data.append(question_tokens)

    return qn_uuid_data, context_token_data, qn_token_data
def preprocess_dataset(dataset): """ Note: this is similar to squad_preprocess.preprocess_and_write, but: (1) We only extract the context and question information from the JSON file. We don't extract answer information. This makes this function much simpler than squad_preprocess.preprocess_and_write, because we don't have to convert the character spans to word spans. This also means that we don't have to discard any examples due to tokenization problems. Input: dataset: data read from SQuAD JSON file Returns: qn_uuid_data, context_token_data, qn_token_data: lists of uuids, tokenized context and tokenized questions """ qn_uuid_data = [] context_token_data = [] qn_token_data = [] for articles_id in tqdm(list(range(len(dataset['data']))), desc="Preprocessing data"): article_paragraphs = dataset['data'][articles_id]['paragraphs'] for pid in range(len(article_paragraphs)): context = str(article_paragraphs[pid]['context']) # string # The following replacements are suggested in the paper # BidAF (Seo et al., 2016) context = context.replace("''", '" ') context = context.replace("``", '" ') context_tokens = tokenize(context) # list of strings (lowercase) context = context.lower() qas = article_paragraphs[pid]['qas'] # list of questions # for each question for qn in qas: # read the question text and tokenize question = str(qn['question']) # string question_tokens = tokenize(question) # list of strings # also get the question_uuid question_uuid = qn['id'] # Append to data lists qn_uuid_data.append(question_uuid) context_token_data.append(context_tokens) qn_token_data.append(question_tokens) return qn_uuid_data, context_token_data, qn_token_data
def preprocess_dataset(dataset): """ Note: this is similar to squad_preprocess.preprocess_and_write, but: (1) We only extract the context and question information from the JSON file. We don't extract answer information. This makes this function much simpler than squad_preprocess.preprocess_and_write, because we don't have to convert the character spans to word spans. This also means that we don't have to discard any examples due to tokenization problems. Input: dataset: data read from SQuAD JSON file Returns: qn_uuid_data, context_token_data, qn_token_data: lists of uuids, tokenized context and tokenized questions """ qn_uuid_data = [] context_token_data = [] qn_token_data = [] for articles_id in tqdm(range(len(dataset['data'])), desc="Preprocessing data"): article_paragraphs = dataset['data'][articles_id]['paragraphs'] for pid in range(len(article_paragraphs)): context = unicode(article_paragraphs[pid]['context']) # string # The following replacements are suggested in the paper # BidAF (Seo et al., 2016) context = context.replace("''", '" ') context = context.replace("``", '" ') context_tokens = tokenize(context) # list of strings (lowercase) context = context.lower() qas = article_paragraphs[pid]['qas'] # list of questions # for each question for qn in qas: # read the question text and tokenize question = unicode(qn['question']) # string question_tokens = tokenize(question) # list of strings # also get the question_uuid question_uuid = qn['id'] # Append to data lists qn_uuid_data.append(question_uuid) context_token_data.append(context_tokens) qn_token_data.append(question_tokens) return qn_uuid_data, context_token_data, qn_token_data
def read_dev_dataset(dev_dataset, tier, vocab):
    """Reads the dev dataset json file and extracts the input data
    (context and question vectors) and question uuid data.
    """
    dev_question_data = []
    dev_context_data = []
    dev_question_uuid_data = []

    for articles_id in tqdm(range(len(dev_dataset['data'])), desc="Preprocessing {}".format(tier)):
        article_paragraphs = dev_dataset['data'][articles_id]['paragraphs']
        for pid in range(len(article_paragraphs)):
            context = article_paragraphs[pid]['context']
            # The following replacements are suggested in the paper
            # BidAF (Seo et al., 2016)
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            context_tokens = tokenize(context)

            qas = article_paragraphs[pid]['qas']
            for qid in range(len(qas)):
                question = qas[qid]['question']
                question_tokens = tokenize(question)
                question_uuid = qas[qid]['id']

                context_ids = [int(vocab.get(w, UNK_ID)) for w in context_tokens]
                question_ids = [int(vocab.get(w, UNK_ID)) for w in question_tokens]

                # Pad with zeros (or truncate) to the fixed sequence lengths given by FLAGS.
                dev_question_datum = np.array([question_ids[i] if i < len(question_ids)
                                               else 0 for i in xrange(FLAGS.question_seq_length)])
                dev_question_data.append(dev_question_datum)

                dev_context_datum = np.array([context_ids[i] if i < len(context_ids)
                                              else 0 for i in xrange(FLAGS.context_seq_length)])
                dev_context_data.append(dev_context_datum)

                dev_question_uuid_data.append(question_uuid)

    dev_question_data = np.array(dev_question_data)
    dev_context_data = np.array(dev_context_data)
    dev_question_uuid_data = np.array(dev_question_uuid_data)

    return dev_question_data, dev_context_data, dev_question_uuid_data
def get_raw_tokens(dataset, tier, vocab, rev_vocab, embeddings):
    vocab2 = {}
    context_maps = []

    for articles_id in tqdm(range(len(dataset['data'])), desc="Preprocessing {}".format(tier)):
        article_paragraphs = dataset['data'][articles_id]['paragraphs']
        for pid in range(len(article_paragraphs)):
            context = article_paragraphs[pid]['context']
            # The following replacements are suggested in the paper
            # BidAF (Seo et al., 2016)
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            context_tokens, _, _ = tokenize(context, tokenizer=FLAGS.tokenizer)
            for token in context_tokens:
                vocab2[token] = 1

            context_map = {'context_tokens': context_tokens, 'question_maps': []}

            qas = article_paragraphs[pid]['qas']
            for qid in range(len(qas)):
                question = qas[qid]['question']
                question_tokens, _, _ = tokenize(question, tokenizer=FLAGS.tokenizer)
                question_uuid = qas[qid]['id']
                for token in question_tokens:
                    vocab2[token] = 1

                question_map = {'question_tokens': question_tokens,
                                'question_uuid': question_uuid}
                context_map['question_maps'].append(question_map)

            context_maps.append(context_map)

    if FLAGS.word_lookup:
        missing_words = find_missing_words(vocab2, vocab)
        vocab, rev_vocab, embeddings, _ = adu.enhance_vocabulary(
            vocab, rev_vocab, embeddings, missing_words)

    return context_maps, vocab, rev_vocab, embeddings
def get_question_context_data(question_string, context_json_file):
    context_string = data_from_json(context_json_file)['context']
    context = str(context_string)  # string

    # The following replacements are suggested in the paper
    # BidAF (Seo et al., 2016)
    context = context.replace("''", '" ')
    context = context.replace("``", '" ')

    context_tokens = tokenize(context)  # list of strings (lowercase)
    context = context.lower()

    question = str(question_string)  # string
    question_tokens = tokenize(question)  # list of strings

    # also get the question_uuid (there is no real id here, so the token count is used as a stand-in)
    question_uuid = len(question_tokens)

    return [question_uuid], [context_tokens], [question_tokens]
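A hedged usage sketch for get_question_context_data: it expects a JSON file with a top-level "context" field and returns single-element lists so one ad-hoc question can flow through the same batching code as a full dataset. The file name and contents below are hypothetical.

# context.json (hypothetical): {"context": "The quick brown fox jumps over the lazy dog."}
uuids, context_tokens, question_tokens = get_question_context_data(
    "What does the fox jump over?", "context.json")
print(uuids, question_tokens[0])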
def evaluate(self, thresh=0.05):
    dataset = squad.Squad(train=True)
    prediction = []
    for index, [context, qas] in enumerate(dataset):
        if index % 100 == 0:
            print(index)
        contexts = []
        for sentence in sent_tokenizer.tokenize(context):
            sentence = tokenize(sentence)
            sentence = [word if in_vocab(self.vocab, word) else UNK for word in sentence]
            contexts.append(" ".join(sentence))
        context_vec = self.vectorizer.transform(contexts)
        for qa in qas:
            question, answer, answer_start, is_impossible = qa
            answer_end = answer_start + len(answer)
            question = [word if in_vocab(self.vocab, word) else UNK
                        for word in tokenize(question)]
            question = " ".join(question)
            question_vec = self.vectorizer.transform([question])
            scores = [cosine_similarity(question_vec, vec).flatten() for vec in context_vec]
            scores = np.asarray(scores).flatten()
            ranks = np.argsort(scores)[::-1]
            if scores[ranks[0]] > thresh:
                prediction.append(is_correct(contexts, ranks[0], answer_start, answer_end))
    accuracy = sum(prediction) / len(prediction)
    print(accuracy)
def compute_vectors():
    """ Computes tfidf vectors for the dataset and pickles the vectorizer """
    files = ["answer", "question", "context"]
    types = ["dev", "train"]
    files = ["{}.{}".format(t, f) for f in files for t in types]
    files = [os.path.join("data", file) for file in files]

    text = []
    for file in files:
        text.append(open(file, mode='r', encoding='utf8').read())
    text = " ".join(text)

    sentences = text.splitlines()
    words = tokenize(text)
    vocab = Counter(words).most_common(10000)
    vocab = [word for word, count in vocab]
    vocab.append(UNK)
    vocab = sorted(vocab)
    print("Vocab size : ", len(vocab))
    with open(vocab_file, mode='w', encoding='utf8') as file:
        file.write("\n".join(vocab))

    # Add more text to aid tf-idf computation
    print("Processing sentences")
    base_text = open("baseline/base", encoding='utf8', mode='r').readlines()
    for sentence in base_text:
        sentence = tokenize(sentence)
        sentence = [word if in_vocab(vocab, word) else UNK for word in sentence]
        sentences.append(" ".join(sentence))

    print("Fitting vectorizer")
    vectorizer = TfidfVectorizer().fit(sentences)
    pickle.dump(vectorizer, open(vector_file, mode='wb'))
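A hedged sketch of how the vectorizer pickled by compute_vectors might be reloaded and applied later; it assumes vector_file is the same module-level path used above, and the query string is only an example.

import pickle

with open(vector_file, mode='rb') as fh:   # same path compute_vectors() wrote to
    vectorizer = pickle.load(fh)
query_vec = vectorizer.transform(["which team won the super bowl ?"])
print(query_vec.shape)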
def do_shell(model, dev, input_model=None):
    """ Interactive shell

    Type a question, write next for the next paragraph or enter a blank
    for another human's question.

    Args:
        model: QA model that has an instance variable 'answer' that returns answer span
               and takes placeholders question, question_length, paragraph, paragraph_length
        dev: Development set
    """
    # what is is_training if import_meta_graph
    checkpoint_dir = os.path.join(FLAGS.train_dir, FLAGS.model_name)
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)
    # TODO no logs
    saver = tf.train.Saver()

    with tf.Session() as session:
        if False:  # load_meta
            last_meta = next(reversed([f for f in os.listdir(checkpoint_dir) if '.meta' in f]))
            saver = tf.train.import_meta_graph(os.path.join(last_meta))
        saver.restore(session, tf.train.latest_checkpoint(checkpoint_dir))
        print('HINT: Input as question "next" for next paragraph')

        while True:
            original_question, paragraphs, question_lengths, paragraph_lengths, answers = dev.get_batch(1)
            for i in itertools.count():
                paragraph = reverse_indices(paragraphs[0], rev_vocab)
                if not i:
                    print('\n')
                    print(paragraph, end='\n\n')

                question_input = input('QUESTION: ')
                if question_input == 'next':
                    break
                elif question_input:
                    question = [vocab.get(word, UNK_ID) for word in tokenize(question_input)]
                    question, question_length = pad_sequence(question, FLAGS.max_question_length)
                    questions, question_lengths = [question], [question_length]
                else:
                    question_words = reverse_indices(original_question[0], rev_vocab)
                    questions = original_question
                    print(question_words)

                if input_model:  # feed into siamese model instead
                    question = feed_dict_inputs[0]
                    question = input_model.run(question)

                feed_dict = model.fill_feed_dict(questions, paragraphs,
                                                 question_lengths, paragraph_lengths)
                if False:  # load_meta
                    start, end = session.run(['prediction/answer_start:0',
                                              'prediction/answer_end:0'], feed_dict)
                    start, end = start[0], end[0]
                else:
                    start, end = session.run(model.answer, feed_dict)
                    start, end = start[0], end[0]

                answer_idxs = paragraphs[0][start:end + 1]
                answer_words = ''.join(reverse_indices(answer_idxs, rev_vocab))
                print(f'COMPUTER: {answer_words}')

                if not question_input:
                    start, end = answers[0]
                    correct_answer_idxs = paragraphs[0][start:end + 1]
                    correct_answer = ''.join(reverse_indices(correct_answer_idxs, rev_vocab))
                    print(f'HUMAN: {correct_answer}')
                print()
def read_dataset(dataset, tier, vocab):
    """Reads the dataset and extracts context/question tokens, their vocab ids and the
    question uuids, optionally replacing unknown ids with random ids. Also prints
    statistics on context length and the unknown-word rate."""
    context_word_cnt = 0
    context_ukn_word_cnt = 0
    context_tokens_data = []
    context_data = []
    question_tokens_data = []
    query_data = []
    question_uuid_data = []
    rand_max = len(vocab.values())
    context_lengths = []
    if FLAGS.eval_on_train:
        s_labels = []
        e_labels = []
        # true_answers = []

    for articles_id in tqdm(range(len(dataset['data'])), desc="Preprocessing {}".format(tier)):
        article_paragraphs = dataset['data'][articles_id]['paragraphs']
        for pid in range(len(article_paragraphs)):
            context = article_paragraphs[pid]['context']
            # The following replacements are suggested in the paper
            # BidAF (Seo et al., 2016)
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            context_tokens = tokenize(context)

            qas = article_paragraphs[pid]['qas']
            for qid in range(len(qas)):
                context_lengths.append(len(context_tokens))
                question = qas[qid]['question']
                question_tokens = tokenize(question)
                question_uuid = qas[qid]['id']

                context_ids = [vocab.get(w, qa_data.UNK_ID) for w in context_tokens]
                question_ids = [vocab.get(w, qa_data.UNK_ID) for w in question_tokens]

                context_word_cnt += len(context_ids)
                for i in xrange(len(context_ids)):
                    if context_ids[i] == qa_data.UNK_ID:
                        if FLAGS.rand_unknown:
                            context_ids[i] = random.randint(0, rand_max - 1)
                        context_ukn_word_cnt += 1
                        # print(context_tokens[i])
                if FLAGS.rand_unknown:
                    for i in xrange(len(question_ids)):
                        if int(question_ids[i]) == qa_data.UNK_ID:
                            question_ids[i] = str(random.randint(0, rand_max - 1))

                context_data.append(context_ids)
                query_data.append(question_ids)
                question_uuid_data.append(question_uuid)
                context_tokens_data.append(context_tokens)
                question_tokens_data.append(question_tokens)
                # if FLAGS.eval_on_train:
                #     answer = qas[qid]['answers'][0]['text'].split()
                #     # Wrong because qas[qid]['answers'][0]['answer_start'] is a character offset, not a token index
                #     s_labels.append(qas[qid]['answers'][0]['answer_start'])
                #     e_labels.append(qas[qid]['answers'][0]['answer_start'] + len(answer) - 1)
                #     # remove answer
                #     true_answers.append(answer)

    # print(sorted(context_lengths))
    context_lengths_over = [context_length > 300 for context_length in context_lengths]
    print('+' * 100)
    print('Percentage of questions with context over context_max_length is: ' +
          str(sum(context_lengths_over) / len(context_lengths)))
    print('Percentage of unknown is ' + str(context_ukn_word_cnt / context_word_cnt))

    # remove answer
    # if FLAGS.eval_on_train:
    #     return context_tokens_data, context_data, question_tokens_data, query_data, question_uuid_data, s_labels, e_labels, true_answers
    return context_tokens_data, context_data, question_tokens_data, query_data, question_uuid_data
def read_dataset(dataset, tier, vocab):
    """Reads the dataset and extracts padded context and question id sequences,
    question uuids, and the corresponding length and boolean masks."""
    context_data = []
    query_data = []
    question_uuid_data = []
    context_mask = []
    query_mask = []
    mask = []

    for articles_id in tqdm(range(len(dataset['data'])), desc="Preprocessing {}".format(tier)):
        article_paragraphs = dataset['data'][articles_id]['paragraphs']
        for pid in range(len(article_paragraphs)):
            context = article_paragraphs[pid]['context']
            # The following replacements are suggested in the paper
            # BidAF (Seo et al., 2016)
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            context_tokens = tokenize(context)

            # note this function is added by ourselves
            if len(context_tokens) > FLAGS.output_size:
                context_tokens = context_tokens[:FLAGS.output_size]
            vec = []
            vec.extend([True] * len(context_tokens))
            if len(context_tokens) < FLAGS.output_size:
                vec.extend([False] * (FLAGS.output_size - len(context_tokens)))

            qas = article_paragraphs[pid]['qas']
            for qid in range(len(qas)):
                question = qas[qid]['question']
                question_tokens = tokenize(question)
                # note this part
                if len(question_tokens) > FLAGS.max_length:
                    question_tokens = question_tokens[:FLAGS.max_length]
                query_mask.append(len(question_tokens))
                question_uuid = qas[qid]['id']

                context_ids = [vocab.get(w, qa_data.UNK_ID) for w in context_tokens]
                context_mask.append(len(context_ids))
                if len(context_ids) < FLAGS.output_size:
                    context_ids.extend([0] * (FLAGS.output_size - len(context_ids)))

                question_ids = [vocab.get(w, qa_data.UNK_ID) for w in question_tokens]
                if len(question_ids) < FLAGS.max_length:
                    question_ids.extend([0] * (FLAGS.max_length - len(question_ids)))

                # context_data.append(' '.join(context_ids))
                # query_data.append(' '.join(question_ids))
                context_data.append(context_ids)
                query_data.append(question_ids)
                question_uuid_data.append(question_uuid)
                mask.append(vec)

    return context_data, query_data, question_uuid_data, context_mask, query_mask, mask
def expand_vocab(prefix, dev_filename, vocab, embd, raw_glove, raw_glove_vocab):
    # Don't check file size, since we could be using other datasets
    dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)

    dev_data = data_from_json(os.path.join(prefix, dev_filename))
    # context_data, question_data, question_uuid_data = read_dataset(dev_data, 'dev', vocab)
    dataset = dev_data
    context_data = []
    query_data = []
    question_uuid_data = []
    tier = 'dev'
    new_vocab = {}
    found = 0
    notfound = 0

    for articles_id in tqdm(range(len(dataset['data'])), desc="Preprocessing {}".format(tier)):
        article_paragraphs = dataset['data'][articles_id]['paragraphs']
        for pid in range(len(article_paragraphs)):
            context = article_paragraphs[pid]['context']
            # The following replacements are suggested in the paper
            # BidAF (Seo et al., 2016)
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            context_tokens = tokenize(context)

            qas = article_paragraphs[pid]['qas']
            for qid in range(len(qas)):
                question = qas[qid]['question']
                question_tokens = tokenize(question)
                question_uuid = qas[qid]['id']

                # context_ids = [str(vocab.get(w, qa_data.UNK_ID)) for w in context_tokens]
                # question_ids = [str(vocab.get(w, qa_data.UNK_ID)) for w in question_tokens]
                # print(context_ids)
                for w in context_tokens:
                    if w not in vocab:
                        if w not in new_vocab:
                            new_vocab[w] = 1
                        else:
                            new_vocab[w] += 1
                        notfound += 1
                    else:
                        found += 1
                for w in question_tokens:
                    if w not in vocab:
                        if w not in new_vocab:
                            new_vocab[w] = 1
                        else:
                            new_vocab[w] += 1
                        notfound += 1
                    else:
                        found += 1

    print('found/not found: {}/{}, {}% not found'.format(
        found, notfound, 100 * notfound / float(found + notfound)))
    print('New vocabulary:', len(new_vocab))

    vocab_list = list(vocab.items())
    vn = len(vocab_list)
    # Append the unseen words after the existing vocab, assigning them the next ids.
    for i, word in enumerate(new_vocab):
        vocab_list.append((word, vn + i))
    vocab = dict(vocab_list)
    rev_vocab = dict([(x, y) for (y, x) in vocab_list])

    # context_data.append(' '.join(context_ids))
    # query_data.append(' '.join(question_ids))
    # question_uuid_data.append(question_uuid)
    # return context_data, question_data, question_uuid_data

    _, dim = embd.shape
    new_glove = np.random.randn(len(vocab), dim)
    new_glove[:vn, :] = embd

    found = 0
    for i in range(vn, vn + len(new_vocab)):
        word = vocab_list[i][0]
        if word in raw_glove_vocab:
            found += 1
            idx = raw_glove_vocab[word]
            new_glove[i, :] = raw_glove[idx, :]
        if word.capitalize() in raw_glove_vocab:
            found += 1
            idx = raw_glove_vocab[word.capitalize()]
            new_glove[i, :] = raw_glove[idx, :]
        if word.upper() in raw_glove_vocab:
            found += 1
            idx = raw_glove_vocab[word.upper()]
            new_glove[i, :] = raw_glove[idx, :]
    # from IPython import embed; embed()
    print("{} unseen words found embeddings".format(found))

    return vocab, rev_vocab, new_glove
def read_write_dataset_(dataset, tier, prefix):
    """Reads the dataset, extracts context, question, answer, and answer pointer
    in their own file. Returns the number of questions and answers processed
    for the dataset"""
    qn, an = 0, 0
    skipped = 0

    with open(os.path.join(prefix, tier + '.context'), 'w') as context_file, \
         open(os.path.join(prefix, tier + '.question'), 'w') as question_file:  # ,\
        # open(os.path.join(prefix, tier + '.answer'), 'w') as text_file, \
        # open(os.path.join(prefix, tier + '.span'), 'w') as span_file:
        question_uuid_data = []
        for articles_id in tqdm(range(len(dataset['data'])), desc="Preprocessing {}".format(tier)):
            article_paragraphs = dataset['data'][articles_id]['paragraphs']
            for pid in range(len(article_paragraphs)):
                context = article_paragraphs[pid]['context']
                # The following replacements are suggested in the paper
                # BidAF (Seo et al., 2016)
                context = context.replace("''", '" ')
                context = context.replace("``", '" ')
                context_tokens = tokenize(context)
                # answer_map = token_idx_map(context, context_tokens)

                qas = article_paragraphs[pid]['qas']
                for qid in range(len(qas)):
                    question = qas[qid]['question']
                    question_tokens = tokenize(question)
                    question_uuid = qas[qid]['id']
                    # answers = qas[qid]['answers']
                    qn += 1

                    num_answers = range(1)
                    for ans_id in num_answers:
                        # it contains answer_start, text
                        # text = qas[qid]['answers'][ans_id]['text']
                        # a_s = qas[qid]['answers'][ans_id]['answer_start']
                        # text_tokens = tokenize(text)
                        # answer_start = qas[qid]['answers'][ans_id]['answer_start']
                        # answer_end = answer_start + len(text)
                        # last_word_answer = len(text_tokens[-1])  # add one to get the first char
                        try:
                            # a_start_idx = answer_map[answer_start][1]
                            # a_end_idx = answer_map[answer_end - last_word_answer][1]
                            # remove length restraint since we deal with it later
                            context_file.write(' '.join(context_tokens) + '\n')
                            question_file.write(' '.join(question_tokens) + '\n')
                            # text_file.write(' '.join(text_tokens) + '\n')
                            # span_file.write(' '.join([str(a_start_idx), str(a_end_idx)]) + '\n')
                            question_uuid_data.append(question_uuid)
                        except Exception:  # as e:
                            skipped += 1
                        an += 1

    print("Skipped {} question/answer pairs in {}".format(skipped, tier))
    return qn, an, question_uuid_data
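A hedged end-to-end sketch for read_write_dataset_ (added for illustration): it writes <prefix>/<tier>.context and <prefix>/<tier>.question with one space-joined token sequence per line and returns the question/answer counts along with the uuids. The input and output paths below are examples only, not paths from the original repository.

import json
import os

os.makedirs("data/squad", exist_ok=True)            # example output prefix
with open("download/squad/dev-v1.1.json") as fh:    # example input path
    dev_data = json.load(fh)
qn, an, uuids = read_write_dataset_(dev_data, 'dev', 'data/squad')
print("Processed {} questions and {} answers".format(qn, an))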