import json

import nltk

from text_utils import in_white_list


def load_squad(data, data_path, max_data_count=None, max_context_seq_length=None,
               max_question_seq_length=None, max_target_seq_length=None):
    if data_path is None:
        return
    # default caps on sample count and tokenized sequence lengths
    if max_data_count is None:
        max_data_count = 10000
    if max_context_seq_length is None:
        max_context_seq_length = 300
    if max_question_seq_length is None:
        max_question_seq_length = 60
    if max_target_seq_length is None:
        max_target_seq_length = 50
    with open(data_path) as file:
        json_data = json.load(file)
    for instance in json_data['data']:
        # stop scanning once enough samples have been collected
        # (the original trailing unconditional breaks cut the scan off
        # after the first article regardless of how much data was loaded)
        if len(data) >= max_data_count:
            break
        for paragraph in instance['paragraphs']:
            if len(data) >= max_data_count:
                break
            context = paragraph['context']
            context_wid_list = [w.lower() for w in nltk.word_tokenize(context) if in_white_list(w)]
            # skip paragraphs whose tokenized context is too long
            if len(context_wid_list) > max_context_seq_length:
                continue
            qas = paragraph['qas']
            for qas_instance in qas:
                question = qas_instance['question']
                question_wid_list = [w.lower() for w in nltk.word_tokenize(question) if in_white_list(w)]
                if len(question_wid_list) > max_question_seq_length:
                    continue
                answers = qas_instance['answers']
                for answer in answers:
                    ans = answer['text']
                    answer_wid_list = [w.lower() for w in nltk.word_tokenize(ans) if in_white_list(w)]
                    if len(answer_wid_list) > max_target_seq_length:
                        continue
                    if len(data) < max_data_count:
                        data.append((context, question, ans))
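# Every snippet in this section filters tokens through in_white_list, which is
# not included in this excerpt (one reply method references it as
# text_utils.in_white_list). The stand-in below is an assumption about what
# such a helper might look like, not the project's actual code: it accepts any
# token containing at least one whitelisted character.
import nltk

WHITELIST = 'abcdefghijklmnopqrstuvwxyz0123456789,.?'


def in_white_list(_word):
    # accept the token if any of its characters is in the whitelist
    for char in _word:
        if char in WHITELIST:
            return True
    return False


# quick check (requires nltk's punkt tokenizer data):
print([w.lower() for w in nltk.word_tokenize('Where is the Eiffel Tower?') if in_white_list(w)])
# ['where', 'is', 'the', 'eiffel', 'tower', '?']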
import nltk
import numpy as np
from keras.preprocessing.sequence import pad_sequences

import text_utils


def reply(self, paragraph, question):
    input_paragraph_seq = []
    input_question_seq = []
    input_paragraph_wid_list = []
    input_question_wid_list = []
    input_paragraph_text = paragraph.lower()
    input_question_text = question.lower()
    # encode the paragraph and the question as two separate word-id sequences
    for word in nltk.word_tokenize(input_paragraph_text):
        if not text_utils.in_white_list(word):
            continue
        idx = 1  # default [UNK]
        if word in self.input_paragraph_word2idx:
            idx = self.input_paragraph_word2idx[word]
        input_paragraph_wid_list.append(idx)
    for word in nltk.word_tokenize(input_question_text):
        if not text_utils.in_white_list(word):
            continue
        idx = 1  # default [UNK]
        if word in self.input_question_word2idx:
            idx = self.input_question_word2idx[word]
        input_question_wid_list.append(idx)
    # wrap each id list as a batch of one sample and pad to the encoder lengths
    input_paragraph_seq.append(input_paragraph_wid_list)
    input_question_seq.append(input_question_wid_list)
    input_paragraph_seq = pad_sequences(input_paragraph_seq, self.max_encoder_paragraph_seq_length)
    input_question_seq = pad_sequences(input_question_seq, self.max_encoder_question_seq_length)
    # run the encoder once to obtain the initial decoder states [h, c]
    states_value = self.encoder_model.predict([input_paragraph_seq, input_question_seq])
    # greedy decoding: start from the START token and feed back the one-hot
    # vector of whichever token was sampled at the previous step
    target_seq = np.zeros((1, 1, self.num_decoder_tokens))
    target_seq[0, 0, self.target_word2idx['START']] = 1
    target_text = ''
    target_text_len = 0
    terminated = False
    while not terminated:
        output_tokens, h, c = self.decoder_model.predict([target_seq] + states_value)
        sample_token_idx = np.argmax(output_tokens[0, -1, :])
        sample_word = self.target_idx2word[sample_token_idx]
        target_text_len += 1
        if sample_word != 'START' and sample_word != 'END':
            target_text += ' ' + sample_word
        # stop on the END token or when the maximum answer length is reached
        if sample_word == 'END' or target_text_len >= self.max_decoder_seq_length:
            terminated = True
        target_seq = np.zeros((1, 1, self.num_decoder_tokens))
        target_seq[0, 0, sample_token_idx] = 1
        states_value = [h, c]
    return target_text.strip()
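# The reply method above wraps each word-id list as a batch of one sample and
# pads it to the encoder's fixed length. A tiny, self-contained illustration
# of that padding step (the ids and lengths here are made up):
from keras.preprocessing.sequence import pad_sequences

paragraph_ids = [[4, 9, 17, 2]]  # one tokenized paragraph, already as word ids
question_ids = [[7, 3]]          # one tokenized question
print(pad_sequences(paragraph_ids, maxlen=6))  # [[ 0  0  4  9 17  2]] -- left-padded with PAD=0
print(pad_sequences(question_ids, maxlen=4))   # [[0 0 7 3]]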
import nltk
import numpy as np
from keras.preprocessing.sequence import pad_sequences

from text_utils import in_white_list


def reply(self, paragraph, question):
    input_seq = []
    input_emb = []
    # merge the paragraph and the question into one text, separated by a marker word
    input_text = paragraph.lower() + ' question ' + question.lower()
    for word in nltk.word_tokenize(input_text):
        if not in_white_list(word):
            continue
        # encode each word directly as its GloVe vector
        emb = self.glove_model.encode_word(word)
        input_emb.append(emb)
    input_seq.append(input_emb)
    # pad as float32 so the embedding values survive: the default int32 dtype
    # of pad_sequences would truncate the GloVe vectors to zeros
    input_seq = pad_sequences(input_seq, self.max_encoder_seq_length, dtype='float32')
    states_value = self.encoder_model.predict(input_seq)
    # greedy decoding, identical to the word-id variant above
    target_seq = np.zeros((1, 1, self.num_decoder_tokens))
    target_seq[0, 0, self.target_word2idx['START']] = 1
    target_text = ''
    target_text_len = 0
    terminated = False
    while not terminated:
        output_tokens, h, c = self.decoder_model.predict([target_seq] + states_value)
        sample_token_idx = np.argmax(output_tokens[0, -1, :])
        sample_word = self.target_idx2word[sample_token_idx]
        target_text_len += 1
        if sample_word != 'START' and sample_word != 'END':
            target_text += ' ' + sample_word
        if sample_word == 'END' or target_text_len >= self.max_decoder_seq_length:
            terminated = True
        target_seq = np.zeros((1, 1, self.num_decoder_tokens))
        target_seq[0, 0, sample_token_idx] = 1
        states_value = [h, c]
    return target_text.strip()
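# Each pass through the decoding loop above takes the argmax of the decoder's
# softmax output and maps it back to a word. A toy run of one such step, with
# a made-up 5-token vocabulary and distribution:
import numpy as np

target_idx2word = {0: 'UNK', 1: 'START', 2: 'END', 3: 'paris', 4: 'france'}
output_tokens = np.array([[[0.05, 0.05, 0.1, 0.7, 0.1]]])  # shape (1, 1, num_decoder_tokens)
sample_token_idx = np.argmax(output_tokens[0, -1, :])
print(target_idx2word[int(sample_token_idx)])  # paris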
import nltk
from collections import Counter

from text_utils import in_white_list


def __init__(self, data_set, max_input_vocab_size=None, max_target_vocab_size=None):
    if max_input_vocab_size is None:
        max_input_vocab_size = 5000
    if max_target_vocab_size is None:
        max_target_vocab_size = 5000
    self.data_set = data_set
    self.input_data_samples = []
    self.output_data_samples = []
    self.input_max_seq_length = 0
    self.target_max_seq_length = 0

    input_counter = Counter()
    target_counter = Counter()
    input_data_samples = []
    output_data_samples = []
    for sample in self.data_set.data:
        paragraph, question, answer = sample
        paragraph_word_list = [w.lower() for w in nltk.word_tokenize(paragraph) if in_white_list(w)]
        question_word_list = [w.lower() for w in nltk.word_tokenize(question) if in_white_list(w)]
        answer_word_list = [w.lower() for w in nltk.word_tokenize(answer) if in_white_list(w)]
        # encoder input: paragraph tokens, a 'Q' separator, then question tokens
        input_data = paragraph_word_list + ['Q'] + question_word_list
        # decoder target: the answer wrapped in START/END markers
        output_data = ['START'] + answer_word_list + ['END']
        input_data_samples.append(input_data)
        output_data_samples.append(output_data)
        for w in input_data:
            input_counter[w] += 1
        for w in output_data:
            target_counter[w] += 1
        self.input_max_seq_length = max(self.input_max_seq_length, len(input_data))
        self.target_max_seq_length = max(self.target_max_seq_length, len(output_data))

    # build the vocabularies by frequency: input ids 0 and 1 are reserved for
    # PAD and UNK, target id 0 is reserved for UNK
    self.input_word2idx = dict()
    self.target_word2idx = dict()
    for idx, (word, _count) in enumerate(input_counter.most_common(max_input_vocab_size)):
        self.input_word2idx[word] = idx + 2
    for idx, (word, _count) in enumerate(target_counter.most_common(max_target_vocab_size)):
        self.target_word2idx[word] = idx + 1
    self.input_word2idx['PAD'] = 0
    self.input_word2idx['UNK'] = 1
    self.target_word2idx['UNK'] = 0
    self.input_idx2word = dict([(idx, word) for word, idx in self.input_word2idx.items()])
    self.target_idx2word = dict([(idx, word) for word, idx in self.target_word2idx.items()])
    self.num_input_tokens = len(self.input_idx2word)
    self.num_target_tokens = len(self.target_idx2word)

    # encode every sample with the vocabularies, mapping OOV words to UNK
    input_encoded_data_samples = []
    target_encoded_data_samples = []
    for input_data, output_data in zip(input_data_samples, output_data_samples):
        input_encoded_data = [self.input_word2idx.get(word, 1) for word in input_data]
        target_encoded_data = [self.target_word2idx.get(word, 0) for word in output_data]
        input_encoded_data_samples.append(input_encoded_data)
        target_encoded_data_samples.append(target_encoded_data)
    self.samples = [input_encoded_data_samples, target_encoded_data_samples]
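# The constructor above assigns input ids by descending frequency, reserving
# 0 for PAD and 1 for UNK. A toy run of that id-assignment scheme on a tiny
# corpus (among equal counts, most_common keeps insertion order in CPython):
from collections import Counter

input_counter = Counter('the cat sat on the mat near the cat'.split())
input_word2idx = {'PAD': 0, 'UNK': 1}
for idx, (word, _count) in enumerate(input_counter.most_common(3)):
    input_word2idx[word] = idx + 2
print(input_word2idx)  # e.g. {'PAD': 0, 'UNK': 1, 'the': 2, 'cat': 3, 'sat': 4}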
import nltk
import numpy as np
from collections import Counter

from text_utils import in_white_list


def __init__(self, dataset, word2emb, embed_size, max_target_vocab_size=None):
    if max_target_vocab_size is None:
        max_target_vocab_size = 5000
    self.dataset = dataset
    self.word2emb = word2emb
    self.input_data_samples = []
    self.output_data_samples = []
    self.input_paragraph_max_seq_length = 0
    self.input_question_max_seq_length = 0
    self.target_max_seq_length = 0
    # out-of-vocabulary words are encoded as the zero vector
    unknown_emb = np.zeros(shape=embed_size)

    target_counter = Counter()
    input_data_samples = []
    output_data_samples = []
    for sample in self.dataset.data:
        paragraph, question, answer = sample
        paragraph_word_list = [w.lower() for w in nltk.word_tokenize(paragraph) if in_white_list(w)]
        question_word_list = [w.lower() for w in nltk.word_tokenize(question) if in_white_list(w)]
        answer_word_list = [w.lower() for w in nltk.word_tokenize(answer) if in_white_list(w)]
        # decoder target: the answer wrapped in START/END markers
        output_data = ['START'] + answer_word_list + ['END']
        input_data_samples.append([paragraph_word_list, question_word_list])
        output_data_samples.append(output_data)
        for w in output_data:
            target_counter[w] += 1
        self.input_paragraph_max_seq_length = max(self.input_paragraph_max_seq_length, len(paragraph_word_list))
        self.input_question_max_seq_length = max(self.input_question_max_seq_length, len(question_word_list))
        self.target_max_seq_length = max(self.target_max_seq_length, len(output_data))

    # only the target side needs a word-index vocabulary here, since the
    # inputs are encoded as GloVe vectors; target id 0 is reserved for UNK
    self.target_word2idx = dict()
    for idx, (word, _count) in enumerate(target_counter.most_common(max_target_vocab_size)):
        self.target_word2idx[word] = idx + 1
    self.target_word2idx['UNK'] = 0
    self.target_idx2word = dict([(idx, word) for word, idx in self.target_word2idx.items()])
    self.num_target_tokens = len(self.target_idx2word)

    # encode inputs as GloVe vectors and targets as word ids, with fallbacks
    # to the zero vector and to UNK respectively
    input_encoded_data_samples = []
    target_encoded_data_samples = []
    for input_data, output_data in zip(input_data_samples, output_data_samples):
        input_paragraph_data, input_question_data = input_data
        input_paragraph_encoded_data = [self.word2emb.get(word, unknown_emb) for word in input_paragraph_data]
        input_question_encoded_data = [self.word2emb.get(word, unknown_emb) for word in input_question_data]
        target_encoded_data = [self.target_word2idx.get(word, 0) for word in output_data]
        input_encoded_data_samples.append([input_paragraph_encoded_data, input_question_encoded_data])
        target_encoded_data_samples.append(target_encoded_data)
    self.samples = [input_encoded_data_samples, target_encoded_data_samples]
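# The embedding lookup above falls back to a zero vector for words missing
# from word2emb. A self-contained sketch of that fallback with a made-up
# 3-dimensional embedding table:
import numpy as np

embed_size = 3
word2emb = {'start': np.array([0.1, 0.2, 0.3])}
unknown_emb = np.zeros(shape=embed_size)

encoded = [word2emb.get(w, unknown_emb) for w in ['start', 'xyzzy']]
print(np.stack(encoded))
# [[0.1 0.2 0.3]
#  [0.  0.  0. ]]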