import json

import nltk


def load_squad(data,
               data_path,
               max_data_count=None,
               max_context_seq_length=None,
               max_question_seq_length=None,
               max_target_seq_length=None):
    """Append (context, question, answer) triples parsed from a SQuAD-style
    JSON file to the supplied `data` list, skipping samples whose tokenized
    length exceeds the configured limits. `in_white_list` is a project-specific
    token filter assumed to be in scope."""
    if data_path is None:
        return

    if max_data_count is None:
        max_data_count = 10000
    if max_context_seq_length is None:
        max_context_seq_length = 300
    if max_question_seq_length is None:
        max_question_seq_length = 60
    if max_target_seq_length is None:
        max_target_seq_length = 50

    with open(data_path) as file:
        json_data = json.load(file)

        for instance in json_data['data']:
            for paragraph in instance['paragraphs']:
                context = paragraph['context']
                context_wid_list = [
                    w.lower() for w in nltk.word_tokenize(context)
                    if in_white_list(w)
                ]
                if len(context_wid_list) > max_context_seq_length:
                    continue
                qas = paragraph['qas']
                for qas_instance in qas:
                    question = qas_instance['question']
                    question_wid_list = [
                        w.lower() for w in nltk.word_tokenize(question)
                        if in_white_list(w)
                    ]
                    if len(question_wid_list) > max_question_seq_length:
                        continue
                    answers = qas_instance['answers']
                    for answer in answers:
                        ans = answer['text']
                        answer_wid_list = [
                            w.lower() for w in nltk.word_tokenize(ans)
                            if in_white_list(w)
                        ]
                        if len(answer_wid_list) > max_target_seq_length:
                            continue
                        if len(data) < max_data_count:
                            data.append((context, question, ans))

                if len(data) >= max_data_count:
                    break

            if len(data) >= max_data_count:
                break
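
A minimal usage sketch, not part of the original listing; the SQuAD file path is an assumption and data is just a plain list that load_squad fills in place.

# Hypothetical usage, assuming a locally downloaded SQuAD v1.1 training file.
data = []
load_squad(data,
           data_path='data/SQuAD/train-v1.1.json',
           max_data_count=5000)
print(len(data), 'context/question/answer triples loaded')
print(data[0])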
Example #2
    def reply(self, paragraph, question):
        input_paragraph_seq = []
        input_question_seq = []
        input_paragraph_wid_list = []
        input_question_wid_list = []
        input_paragraph_text = paragraph.lower()
        input_question_text = question.lower()
        for word in nltk.word_tokenize(input_paragraph_text):
            if not text_utils.in_white_list(word):
                continue
            idx = 1  # default [UNK]
            if word in self.input_paragraph_word2idx:
                idx = self.input_paragraph_word2idx[word]
            input_paragraph_wid_list.append(idx)
        for word in nltk.word_tokenize(input_question_text):
            if not text_utils.in_white_list(word):
                continue
            idx = 1  # default [UNK]
            if word in self.input_question_word2idx:
                idx = self.input_question_word2idx[word]
            input_question_wid_list.append(idx)
        input_paragraph_seq.append(input_paragraph_wid_list)
        input_question_seq.append(input_question_wid_list)
        input_paragraph_seq = pad_sequences(
            input_paragraph_seq, self.max_encoder_paragraph_seq_length)
        input_question_seq = pad_sequences(
            input_question_seq, self.max_encoder_question_seq_length)
        # encode the padded paragraph/question pair into the initial decoder states
        states_value = self.encoder_model.predict(
            [input_paragraph_seq, input_question_seq])
        # seed the decoder with a one-hot START token
        target_seq = np.zeros((1, 1, self.num_decoder_tokens))
        target_seq[0, 0, self.target_word2idx['START']] = 1
        target_text = ''
        target_text_len = 0
        terminated = False
        # greedy decoding: feed the predicted word back in until END is emitted
        # or the maximum decoder length is reached
        while not terminated:
            output_tokens, h, c = self.decoder_model.predict([target_seq] +
                                                             states_value)

            sample_token_idx = np.argmax(output_tokens[0, -1, :])
            sample_word = self.target_idx2word[sample_token_idx]
            target_text_len += 1

            if sample_word != 'START' and sample_word != 'END':
                target_text += ' ' + sample_word

            if sample_word == 'END' or target_text_len >= self.max_decoder_seq_length:
                terminated = True

            target_seq = np.zeros((1, 1, self.num_decoder_tokens))
            target_seq[0, 0, sample_token_idx] = 1

            states_value = [h, c]
        return target_text.strip()
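
A hypothetical call sketch; the class that owns reply() and its construction are not shown in this listing, so qa_model below is only a placeholder for an instance with trained encoder/decoder models loaded.

qa_model = ...  # placeholder: an instance of the owning class, with weights loaded
paragraph = 'The Amazon rainforest covers most of the Amazon basin of South America.'
question = 'What does the Amazon rainforest cover?'
print(qa_model.reply(paragraph, question))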
Example #3
    def reply(self, paragraph, question):
        input_seq = []
        input_emb = []
        # paragraph and question are concatenated into a single input sequence,
        # separated by the literal marker word 'question'
        input_text = paragraph.lower() + ' question ' + question.lower()
        for word in nltk.word_tokenize(input_text):
            if not in_white_list(word):
                continue
            emb = self.glove_model.encode_word(word)
            input_emb.append(emb)
        input_seq.append(input_emb)
        input_seq = pad_sequences(input_seq, self.max_encoder_seq_length)
        states_value = self.encoder_model.predict(input_seq)
        target_seq = np.zeros((1, 1, self.num_decoder_tokens))
        target_seq[0, 0, self.target_word2idx['START']] = 1
        target_text = ''
        target_text_len = 0
        terminated = False
        while not terminated:
            output_tokens, h, c = self.decoder_model.predict([target_seq] + states_value)

            sample_token_idx = np.argmax(output_tokens[0, -1, :])
            sample_word = self.target_idx2word[sample_token_idx]
            target_text_len += 1

            if sample_word != 'START' and sample_word != 'END':
                target_text += ' ' + sample_word

            if sample_word == 'END' or target_text_len >= self.max_decoder_seq_length:
                terminated = True

            target_seq = np.zeros((1, 1, self.num_decoder_tokens))
            target_seq[0, 0, sample_token_idx] = 1

            states_value = [h, c]
        return target_text.strip()
Example #4
    def __init__(self,
                 data_set,
                 max_input_vocab_size=None,
                 max_target_vocab_size=None):
        """Build input and target vocabularies from (paragraph, question,
        answer) triples and encode every sample as word-index sequences."""
        if max_target_vocab_size is None:
            max_target_vocab_size = 5000
        if max_input_vocab_size is None:
            max_input_vocab_size = 5000

        self.data_set = data_set
        self.input_data_samples = []
        self.output_data_samples = []

        self.input_max_seq_length = 0
        self.target_max_seq_length = 0

        input_counter = Counter()
        target_counter = Counter()

        input_data_samples = []
        output_data_samples = []

        for sample in self.data_set.data:
            paragraph, question, answer = sample
            paragraph_word_list = [
                w.lower() for w in nltk.word_tokenize(paragraph)
                if in_white_list(w)
            ]
            question_word_list = [
                w.lower() for w in nltk.word_tokenize(question)
                if in_white_list(w)
            ]
            answer_word_list = [
                w.lower() for w in nltk.word_tokenize(answer)
                if in_white_list(w)
            ]

            input_data = paragraph_word_list + ['Q'] + question_word_list
            output_data = ['START'] + answer_word_list + ['END']

            input_data_samples.append(input_data)
            output_data_samples.append(output_data)

            for w in input_data:
                input_counter[w] += 1
            for w in output_data:
                target_counter[w] += 1

            self.input_max_seq_length = max(self.input_max_seq_length,
                                            len(input_data))
            self.target_max_seq_length = max(self.target_max_seq_length,
                                             len(output_data))

        # reserved indices: the input vocabulary uses 0 for PAD and 1 for UNK
        # (known words start at 2); the target vocabulary uses 0 for UNK
        # (known words start at 1)
        self.input_word2idx = dict()
        self.target_word2idx = dict()
        for idx, (word, _count) in enumerate(
                input_counter.most_common(max_input_vocab_size)):
            self.input_word2idx[word] = idx + 2
        for idx, (word, _count) in enumerate(
                target_counter.most_common(max_target_vocab_size)):
            self.target_word2idx[word] = idx + 1

        self.target_word2idx['UNK'] = 0
        self.input_word2idx['PAD'] = 0
        self.input_word2idx['UNK'] = 1

        self.input_idx2word = dict([
            (idx, word) for word, idx in self.input_word2idx.items()
        ])
        self.target_idx2word = dict([
            (idx, word) for word, idx in self.target_word2idx.items()
        ])

        self.num_input_tokens = len(self.input_idx2word)
        self.num_target_tokens = len(self.target_idx2word)

        input_encoded_data_samples = []
        target_encoded_data_samples = []

        for input_data, output_data in zip(input_data_samples,
                                           output_data_samples):
            input_encoded_data = []
            target_encoded_data = []
            for word in input_data:
                if word in self.input_word2idx:
                    input_encoded_data.append(self.input_word2idx[word])
                else:
                    input_encoded_data.append(1)  # 1 == UNK in the input vocabulary
            for word in output_data:
                if word in self.target_word2idx:
                    target_encoded_data.append(self.target_word2idx[word])
                else:
                    target_encoded_data.append(0)  # 0 == UNK in the target vocabulary
            input_encoded_data_samples.append(input_encoded_data)
            target_encoded_data_samples.append(target_encoded_data)

        self.samples = [
            input_encoded_data_samples, target_encoded_data_samples
        ]
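
A hedged sketch of how the encoded samples produced above might be padded for a Keras seq2seq model; the variable name dataset and the use of pad_sequences here are assumptions, not code from the original project.

from keras.preprocessing.sequence import pad_sequences

# 'dataset' stands for an instance of the class defined above (name assumed)
encoder_input, decoder_target = dataset.samples
encoder_input = pad_sequences(encoder_input,
                              maxlen=dataset.input_max_seq_length,
                              padding='post')
decoder_target = pad_sequences(decoder_target,
                               maxlen=dataset.target_max_seq_length,
                               padding='post')
print(encoder_input.shape, decoder_target.shape)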
Example #5
    def __init__(self, dataset, word2emb, embed_size, max_target_vocab_size=None):
        """Variant of the vocabulary builder above that encodes paragraph and
        question words directly as pre-trained embeddings (word2emb) and keeps
        a word-index vocabulary only for the target answers."""
        if max_target_vocab_size is None:
            max_target_vocab_size = 5000

        self.dataset = dataset
        self.word2emb = word2emb
        self.input_data_samples = []
        self.output_data_samples = []

        self.input_paragraph_max_seq_length = 0
        self.input_question_max_seq_length = 0
        self.target_max_seq_length = 0

        unknown_emb = np.zeros(shape=embed_size)

        target_counter = Counter()

        input_data_samples = []
        output_data_samples = []

        for sample in self.dataset.data:
            paragraph, question, answer = sample
            paragraph_word_list = [w.lower() for w in nltk.word_tokenize(paragraph) if in_white_list(w)]
            question_word_list = [w.lower() for w in nltk.word_tokenize(question) if in_white_list(w)]
            answer_word_list = [w.lower() for w in nltk.word_tokenize(answer) if in_white_list(w)]

            output_data = ['START'] + answer_word_list + ['END']

            input_data_samples.append([paragraph_word_list, question_word_list])
            output_data_samples.append(output_data)

            for w in output_data:
                target_counter[w] += 1

            self.input_paragraph_max_seq_length = max(self.input_paragraph_max_seq_length, len(paragraph_word_list))
            self.input_question_max_seq_length = max(self.input_question_max_seq_length, len(question_word_list))
            self.target_max_seq_length = max(self.target_max_seq_length, len(output_data))

        # reserved index: 0 is UNK in the target vocabulary; known words start at 1
        self.target_word2idx = dict()
        for idx, (word, _count) in enumerate(target_counter.most_common(max_target_vocab_size)):
            self.target_word2idx[word] = idx + 1

        self.target_word2idx['UNK'] = 0
        self.target_idx2word = dict([(idx, word) for word, idx in self.target_word2idx.items()])

        self.num_target_tokens = len(self.target_idx2word)

        input_encoded_data_samples = []
        target_encoded_data_samples = []

        for input_data, output_data in zip(input_data_samples, output_data_samples):
            input_paragraph_encoded_data = []
            input_question_encoded_data = []
            target_encoded_data = []
            input_paragraph_data, input_question_data = input_data
            for word in input_question_data:
                if word in self.word2emb:
                    input_question_encoded_data.append(self.word2emb[word])
                else:
                    input_question_encoded_data.append(unknown_emb)
            for word in input_paragraph_data:
                if word in self.word2emb:
                    input_paragraph_encoded_data.append(self.word2emb[word])
                else:
                    input_paragraph_encoded_data.append(unknown_emb)
            for word in output_data:
                if word in self.target_word2idx:
                    target_encoded_data.append(self.target_word2idx[word])
                else:
                    target_encoded_data.append(0)
            input_encoded_data_samples.append([input_paragraph_encoded_data, input_question_encoded_data])
            target_encoded_data_samples.append(target_encoded_data)

        self.samples = [input_encoded_data_samples, target_encoded_data_samples]
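
A hedged sketch (names assumed, not from the original project) of how the embedding-encoded paragraph and question sequences above could be padded into fixed-size numpy batches:

import numpy as np

def pad_embedding_batch(seqs, max_len, embed_size):
    # zero-pad (or truncate) each list of word-embedding vectors to max_len rows
    batch = np.zeros((len(seqs), max_len, embed_size))
    for i, seq in enumerate(seqs):
        for j, emb in enumerate(seq[:max_len]):
            batch[i, j, :] = emb
    return batch

# 'dataset' stands for an instance of the class above; embed_size matches word2emb
inputs, targets = dataset.samples
paragraph_batch = pad_embedding_batch([p for p, q in inputs],
                                      dataset.input_paragraph_max_seq_length,
                                      embed_size)
question_batch = pad_embedding_batch([q for p, q in inputs],
                                     dataset.input_question_max_seq_length,
                                     embed_size)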