def predict_single_pointer_string(self, inputs):
        " splits on spaces, can handle a list of sentences "
        assert type(inputs) == list or str
        if isinstance(inputs, str):
            inputs = [inputs]
        encoder_inputs, encoder_input_lengths = list(), list()
        for seq in inputs:

            clean_text = strip_and_lower(_regex_(seq))
            prepared_input, length = _prepare_encoder_input_(
                clean_text, self.data.max_seq_length_meta,
                self.data.word2index)
            encoder_inputs.append(prepared_input)
            encoder_input_lengths.append(length)

        input_fn = partial(prediction_input_fn,
                           test_features=(np.array(encoder_inputs,
                                                   dtype=np.int32),
                                          np.array(encoder_input_lengths,
                                                   dtype=np.int32)),
                           batch_size=self.batch_size)

        prediction_gen = self.classifier.predict(input_fn=lambda: input_fn())
        predictions = [
            int(start['question_starts'][0]) for start in prediction_gen
        ]
        results = [
            " ".join(inputs[idx].strip().split()[strt:])
            for idx, strt in enumerate(predictions)
        ]

        if len(results) == 1:
            results = results.pop()

        return results, inputs
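
A minimal usage sketch for the method above. The owning class name (PointerModel) and the checkpoint path are hypothetical placeholders for however the model object is actually constructed; they are not taken from the snippet.

# Hypothetical instantiation; class name and model_dir are placeholders.
model = PointerModel(model_dir='checkpoints/pointer')

# A single string is wrapped into a one-element list internally, so the
# method returns one trimmed string instead of a list in that case.
result, cleaned = model.predict_single_pointer_string(
    "hi there could you tell me when the store opens")
print(result)   # text from the predicted question start onward

results, cleaned = model.predict_single_pointer_string([
    "hi there could you tell me when the store opens",
    "thanks a lot what time does it close",
])
print(results)  # one trimmed string per input sentence
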
Example No. 2
    def generate(self, questions, non_questions, word2index,
                 max_seq_length, max_num_questions, max_num_elements, randomize_num_questions=False):
        " this shits gotta be callable... and graph params are needed for feature generator arguments "

        # ensure that no two questions together exceed max_seq_length
        filtered_questions = list(filter(lambda x: len(x.split()) <= max_seq_length // max_num_elements, questions))
        shuffle(filtered_questions)

        assert max_num_questions <= max_num_elements - 1, 'need at least one fewer question than the number of elements'  # ensures there is always at least one non-question
        while True:
            if randomize_num_questions:  # e.g. with max_num_elements == 3, use 1 or 2 questions
                max_num_questions = np.random.randint(1, max_num_elements)

            input_sequence, target_sequence, num_questions, starts_list, stops_list = self._make_sequence(filtered_questions,
                                                                                 non_questions,
                                                                                 max_seq_length=max_seq_length,
                                                                                 num_questions=max_num_questions,
                                                                                 max_num_elements=max_num_elements)

            # if _is_acceptable_(input_sequence, max_seq_length):
            encoder_inputs, encoder_input_lengths = _prepare_encoder_input_(input_sequence, max_seq_length, word2index)
            decoder_inputs, decoder_input_lengths = _prepare_decoder_input_(target_sequence, max_seq_length, word2index)
            target_sequences, target_seq_lengths = _prepare_target_sequences_(target_sequence, max_seq_length, word2index)

            array = partial(np.array, dtype=np.int32)
            features = array(encoder_inputs), array(encoder_input_lengths), array(decoder_inputs), array(decoder_input_lengths)
            labels = array(target_sequences), array(target_seq_lengths), array(num_questions), array(starts_list), array(stops_list)

            yield features, labels
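
Because generate yields (features, labels) tuples indefinitely, one plausible way to consume it is through tf.data.Dataset.from_generator. This is a sketch only: the generator object (feature_gen), the data variables and the size constants are assumed to exist in the surrounding training script.

import tensorflow as tf
from functools import partial

def train_input_fn(batch_size=32):
    # feature_gen, questions, non_questions and word2index are assumed
    # to be defined elsewhere; the constants below are illustrative.
    gen = partial(feature_gen.generate,
                  questions, non_questions, word2index,
                  max_seq_length=60, max_num_questions=2,
                  max_num_elements=3)
    dataset = tf.data.Dataset.from_generator(
        gen,
        output_types=((tf.int32, tf.int32, tf.int32, tf.int32),
                      (tf.int32, tf.int32, tf.int32, tf.int32, tf.int32)))
    # Each yield is a single example, so batching happens here.
    return dataset.batch(batch_size)
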
Example No. 3
    def queue_builder(self,
                      questions,
                      non_questions,
                      word2index,
                      max_seq_length,
                      max_num_questions,
                      max_num_elements,
                      randomize_num_questions):
        " This function gets its own process which will run on the side "

        # ensure that no two questions together exceed max_seq_length
        filtered_questions = list(filter(lambda x: len(x.split()) <= max_seq_length // max_num_elements, questions))
        shuffle(filtered_questions)

        assert max_num_questions <= max_num_elements - 1, 'need at least one fewer question than the number of elements'  # ensures there is always at least one non-question
        while True:
            if not self.data_que.full():
                if randomize_num_questions:  # e.g. with max_num_elements == 3, use 1 or 2 questions
                    max_num_questions = np.random.randint(1, max_num_elements)

                input_sequence, target_sequence, num_questions, starts_list, stops_list = _generate_sequence_(filtered_questions,
                                                                                                                    non_questions,
                                                                                                                    max_seq_length=max_seq_length,
                                                                                                                    num_questions=max_num_questions,
                                                                                                                    max_num_elements=max_num_elements)

                # if _is_acceptable_(input_sequence, max_seq_length):
                encoder_inputs, encoder_input_lengths = _prepare_encoder_input_(input_sequence, max_seq_length, word2index)
                decoder_inputs, decoder_input_lengths = _prepare_decoder_input_(target_sequence, max_seq_length, word2index)
                target_sequences, target_seq_lengths = _prepare_target_sequences_(target_sequence, max_seq_length, word2index)

                array = partial(np.array, dtype=np.int32)
                features = array(encoder_inputs), array(encoder_input_lengths), array(decoder_inputs), array(decoder_input_lengths)
                labels = array(target_sequences), array(target_seq_lengths), array(num_questions), array(starts_list), array(stops_list)

                self.data_que.put((features, labels))

            else:
                # the queue is full; back off before checking again
                sleep(self.global_sleep_clock)
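
queue_builder is meant to run on its own process. The sketch below shows one plausible way to launch it with multiprocessing, assuming the owning object exposes data_que and global_sleep_clock; the class name, queue size and argument values are illustrative, not taken from the snippet.

from multiprocessing import Process, Queue

builder = FeatureBuilder()            # hypothetical owner of queue_builder
builder.data_que = Queue(maxsize=64)  # bounded, so full() can actually trigger
builder.global_sleep_clock = 5        # seconds to wait while the queue is full

worker = Process(
    target=builder.queue_builder,
    args=(questions, non_questions, word2index,
          60,     # max_seq_length
          2,      # max_num_questions
          3,      # max_num_elements
          True),  # randomize_num_questions
    daemon=True)
worker.start()

# The training loop can now pull ready-made (features, labels) pairs.
features, labels = builder.data_que.get()
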
Example No. 4
    def predict_string(self, inputs):
        " splits on spaces, can handle a list of sentences "
        assert type(inputs) == list or str
        if isinstance(inputs, str):
            inputs = [inputs]

        encoder_inputs, encoder_input_lengths = list(), list()
        for seq in inputs:
            clean_text = strip_and_lower(_regex_(seq))
            prepared_input, length = _prepare_encoder_input_(
                clean_text, self.data.max_seq_length_meta,
                self.data.word2index)
            encoder_inputs.append(prepared_input)
            encoder_input_lengths.append(length)

        input_fn = partial(pred_input_fn,
                           test_features=(np.array(encoder_inputs,
                                                   dtype=np.int32),
                                          np.array(encoder_input_lengths,
                                                   dtype=np.int32)),
                           batch_size=self.batch_size)

        prediction_gen = self.classifier.predict(input_fn=lambda: input_fn())

        predictions = [x for x in prediction_gen]
        results = [
            " ".join(
                convert_int2word(self.data.index2word,
                                 pred['sentence_tokens']))
            for pred in predictions
        ]

        if len(results) == 1:
            results = results.pop()

        return results, inputs
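
convert_int2word is not shown in these snippets; the helper below is a guess at its behaviour, reconstructed from how predict_string uses it. The padding/end-of-sequence token names and the dict-style index2word lookup are assumptions.

# Assumed helper: map token ids back to words and stop at padding/EOS.
def convert_int2word(index2word, token_ids, stop_tokens=('<pad>', '<eos>')):
    words = []
    for token_id in token_ids:
        word = index2word.get(int(token_id), '<unk>')
        if word in stop_tokens:
            break
        words.append(word)
    return words
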
Example No. 5
def create_static_features(questions,
                           non_questions,
                           word2index,
                           size,
                           max_seq_length,
                           max_num_questions,
                           max_num_elements,
                           randomize_num_questions=False):

    encoder_inputs = list()
    encoder_input_lengths = list()
    decoder_inputs = list()
    decoder_input_lengths = list()
    target_sequences = list()
    target_seq_lengths = list()
    num_questions_labels = list()
    start_labels, stop_labels = list(), list()

    # ensure that no two questions together exceed max_seq_length
    filtered_questions = list(
        filter(
            lambda x: len(x.split()) <= (max_seq_length // max_num_elements),
            questions))

    assert max_num_questions <= max_num_elements - 1, 'need at least one fewer question than the number of elements'  # ensures there is always at least one non-question
    pbar = tqdm(total=size)
    while len(encoder_inputs) < size:

        if randomize_num_questions:  # e.g. with max_num_elements == 3, use 1 or 2 questions
            max_num_questions = np.random.randint(1, max_num_elements)

        input_sequence, target_sequence, num_questions, start, stop = _generate_sequence_(
            filtered_questions,
            non_questions,
            max_seq_length=max_seq_length,
            num_questions=max_num_questions,
            max_num_elements=max_num_elements)
        enc_input, enc_input_len = _prepare_encoder_input_(
            input_sequence, max_seq_length, word2index)
        encoder_inputs.append(enc_input)
        encoder_input_lengths.append(enc_input_len)

        dec_input, dec_input_len = _prepare_decoder_input_(
            target_sequence, max_seq_length, word2index)
        decoder_inputs.append(dec_input)
        decoder_input_lengths.append(dec_input_len)  # looks odd, but it's the only thing that works here

        target_seq, target_seq_len = _prepare_target_sequences_(
            target_sequence, max_seq_length, word2index)
        target_sequences.append(target_seq)
        target_seq_lengths.append(target_seq_len)

        num_questions_labels.append([num_questions])
        start_labels.append(start)
        stop_labels.append(stop)
        pbar.update(1)

    pbar.close()

    if not encoder_inputs:
        raise ValueError('No data was generated; increase `size`.')

    ar = partial(np.array, dtype=np.int32)
    features = ar(encoder_inputs), ar(encoder_input_lengths), ar(
        decoder_inputs), ar(decoder_input_lengths)
    labels = ar(target_sequences), ar(target_seq_lengths), ar(
        num_questions_labels), ar(start_labels), ar(stop_labels)

    return features, labels
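
A hedged sketch of consuming the arrays returned above, assuming an Estimator-style model whose model_fn expects the same (features, labels) nesting; the size, vocabulary and shape arguments are placeholders.

import tensorflow as tf

features, labels = create_static_features(
    questions, non_questions, word2index,
    size=10000, max_seq_length=60,
    max_num_questions=2, max_num_elements=3,
    randomize_num_questions=True)

def train_input_fn(batch_size=32):
    # from_tensor_slices preserves the (features, labels) tuple nesting,
    # so each batch keeps the structure the generators produce.
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    return dataset.shuffle(1000).batch(batch_size).repeat()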