def load_conditioned_dataset(corpus_name,
                             token_to_index,
                             condition_to_index,
                             subset_size=None):
    processed_corpus_path = get_processed_corpus_path(corpus_name)
    dialogs = load_processed_dialogs_from_json(
        FileTextLinesIterator(processed_corpus_path),
        text_field_name='text',
        condition_field_name='condition')
    if subset_size:
        _logger.info(
            'Slicing dataset to the first {} entries'.format(subset_size))
        dialogs = islice(dialogs, subset_size)
    train_lines, train_conditions = get_dialog_lines_and_conditions(
        get_alternated_dialogs_lines(dialogs),
        text_field_name='text',
        condition_field_name='condition')
    tokenized_alternated_train_lines = ProcessedLinesIterator(
        train_lines, processing_callbacks=[get_tokens_sequence])

    # prepare train set
    x_train, y_train, n_dialogs = transform_lines_to_nn_input(
        tokenized_alternated_train_lines, token_to_index)

    condition_ids_train = transform_conditions_to_nn_input(
        train_conditions, condition_to_index, n_dialogs)
    return Dataset(x=x_train, y=y_train, condition_ids=condition_ids_train)
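# Note (assumption, not shown in this listing): Dataset is used throughout as a
# lightweight container with x, y and condition_ids fields, roughly a namedtuple:
#
#     from collections import namedtuple
#     Dataset = namedtuple('Dataset', ['x', 'y', 'condition_ids'])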
def load_datasets(token_to_index,
                  condition_to_index,
                  test_corpus_name=CONTEXT_SENSITIVE_TEST_CORPUS_NAME):
    # load context_sensitive_test dataset
    cs_test = load_conditioned_dataset(test_corpus_name, token_to_index,
                                       condition_to_index)
    # load context_free_validation dataset
    cf_validation = load_context_free_val(token_to_index)

    # load context sensitive test set for one selected condition
    condition_mask = cs_test.condition_ids != condition_to_index[
        DEFAULT_CONDITION]
    conditioned_test = Dataset(
        x=cs_test.x[condition_mask],
        y=cs_test.y[condition_mask],
        condition_ids=cs_test.condition_ids[condition_mask])

    # get a subset of conditioned_test of the same size as cf_validation;
    # if conditioned_test has fewer samples than that, use all of its available samples
    cs_test_one_condition = \
        generate_subset(conditioned_test, subset_size=min(cf_validation.x.shape[0], conditioned_test.x.shape[0]))

    return create_namedtuple_instance(
        'EvalMetricsDatasets',
        cf_validation=cf_validation,
        cs_test=cs_test,
        cs_test_one_condition=cs_test_one_condition)
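# Note (assumption): create_namedtuple_instance is expected to build a namedtuple
# type on the fly and return an instance of it, roughly equivalent to:
#
#     def create_namedtuple_instance(name, **kwargs):
#         return namedtuple(name, kwargs.keys())(**kwargs)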
def load_context_sensitive_val(token_to_index, condition_to_index):
    processed_val_corpus_path = get_processed_corpus_path(
        CONTEXT_SENSITIVE_VAL_CORPUS_NAME)
    context_sensitive_val_dialogs = load_processed_dialogs_from_json(
        FileTextLinesIterator(processed_val_corpus_path),
        text_field_name='text',
        condition_field_name='condition')
    context_sensitive_val_dialogs = islice(context_sensitive_val_dialogs,
                                           MAX_VAL_LINES_NUM)

    alternated_context_sensitive_val_dialogs = \
        get_alternated_dialogs_lines(context_sensitive_val_dialogs)
    alternated_context_sensitive_val_lines, alternated_context_sensitive_val_conditions = \
        get_dialog_lines_and_conditions(alternated_context_sensitive_val_dialogs,
                                        text_field_name='text', condition_field_name='condition')
    tokenized_alternated_context_sensitive_val_lines = ProcessedLinesIterator(
        alternated_context_sensitive_val_lines,
        processing_callbacks=[get_tokens_sequence])

    _logger.info(
        'Transform context sensitive validation lines to tensor of indexes')
    x_context_sensitive_val, y_context_sensitive_val, num_context_sensitive_val_dialogs = \
        transform_lines_to_nn_input(tokenized_alternated_context_sensitive_val_lines, token_to_index)
    condition_ids_context_sensitive_val = transform_conditions_to_nn_input(
        alternated_context_sensitive_val_conditions, condition_to_index,
        num_context_sensitive_val_dialogs)
    return Dataset(x=x_context_sensitive_val,
                   y=y_context_sensitive_val,
                   condition_ids=condition_ids_context_sensitive_val)
def _compute_likelihood_of_input_given_output(self, context, candidates,
                                              condition_id):
    # Repeat the context so that each candidate is scored against the same context
    repeated_context = np.repeat(context, candidates.shape[0], axis=0)
    reversed_dataset = reverse_nn_input(
        Dataset(x=repeated_context, y=candidates, condition_ids=None),
        self._service_tokens_ids)
    return get_sequence_score(self._reverse_model, reversed_dataset.x,
                              reversed_dataset.y, condition_id)
def load_context_free_val(token_to_index):
    _logger.info(
        'Transform context free validation lines to matrix of indexes')
    tokenized_validation_lines = get_tokenized_test_lines(
        CONTEXT_FREE_VAL_CORPUS_NAME, set(token_to_index.keys()))
    tokenized_validation_lines = tokenized_validation_lines[:MAX_VAL_LINES_NUM]
    x_validation, y_validation, _ = transform_lines_to_nn_input(
        tokenized_validation_lines, token_to_index)
    return Dataset(x=x_validation, y=y_validation, condition_ids=None)
def _load_dataset_without_responses(corpus_name, token_to_index):
    tokenized_lines = get_tokenized_test_lines(corpus_name, set(token_to_index.keys()))
    context_tokens_ids = transform_contexts_to_token_ids(
        lines_to_context(tokenized_lines),
        token_to_index,
        INPUT_SEQUENCE_LENGTH,
        INPUT_CONTEXT_SIZE,
        max_contexts_num=len(tokenized_lines))
    return Dataset(x=context_tokens_ids, y=None, condition_ids=None)
def generate_subset(dataset, subset_size, random_seed=RANDOM_SEED):
    # Fix random seed here so that we get the same subsets every time the function is called
    np.random.seed(random_seed)
    if subset_size > dataset.x.shape[0]:
        raise ValueError('Error while generating subset of the validation data: '
                         'dataset size ({}) is less than subset size ({})'.format(dataset.x.shape[0], subset_size))
    sample_idx = np.random.choice(dataset.x.shape[0], size=subset_size, replace=False)
    return Dataset(
        x=dataset.x[sample_idx],
        y=dataset.y[sample_idx] if dataset.y is not None else None,
        condition_ids=dataset.condition_ids[sample_idx] if dataset.condition_ids is not None else None)
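# Illustrative usage (not part of the original module; assumes numpy is imported
# as np and Dataset is available). With the seed fixed, repeated calls return the
# same subset:
#
#     toy = Dataset(x=np.arange(10).reshape(5, 2), y=np.arange(5), condition_ids=None)
#     subset = generate_subset(toy, subset_size=3, random_seed=42)
#     # subset.x has shape (3, 2); subset.y follows the same row indices,
#     # and condition_ids stays None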
def reverse_nn_input(dataset, service_tokens):
    """
    Swaps the last utterance of x with y for each x-y pair in the dataset.
    To handle sequences of different lengths, everything is padded
    to the length of the longest sequence.
    """
    # Swap last utterance of x with y, while padding with start- and eos-tokens
    y_output = np.full(dataset.y.shape,
                       service_tokens.pad_token_id,
                       dtype=dataset.y.dtype)
    for y_output_sample, x_input_sample in zip(y_output, dataset.x[:, -1]):
        # Write start token at the first index
        y_output_sample[0] = service_tokens.start_token_id
        y_output_token_index = 1
        for value in x_input_sample:
            # We should stop at pad tokens in the input sample
            if value == service_tokens.pad_token_id:
                break
            # Keep the last position as a pad so that it can be replaced with the eos-token below
            if y_output_token_index == y_output_sample.shape[-1] - 1:
                break
            y_output_sample[y_output_token_index] = value
            y_output_token_index += 1
        # Write eos token right after the last non-pad token in the sample
        y_output_sample[y_output_token_index] = service_tokens.eos_token_id

    # Use utterances from y in x while truncating start- and eos-tokens
    x_output = np.full(dataset.x.shape,
                       service_tokens.pad_token_id,
                       dtype=dataset.x.dtype)
    for x_output_sample, x_input_sample, y_input_sample in zip(
            x_output, dataset.x[:, :-1], dataset.y):
        # Copy all the context utterances except the last one right to the output
        x_output_sample[:-1] = x_input_sample
        x_output_token_index = 0
        for value in y_input_sample:
            # Skip start- and eos-tokens from the input sample because we don't need them in X
            if value in {
                    service_tokens.start_token_id, service_tokens.eos_token_id
            }:
                continue
            # Stop if we already reached the end of output sample (in case the input sample is longer than output)
            if x_output_token_index == x_output_sample.shape[-1]:
                break
            # Fill the tokens of the last utterance in dialog context
            x_output_sample[-1, x_output_token_index] = value
            x_output_token_index += 1

    return Dataset(x=x_output, y=y_output, condition_ids=dataset.condition_ids)
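# Illustrative sketch (not part of the original module): a toy call showing how
# reverse_nn_input swaps the last context utterance with the response. Assumes a
# service-tokens container with pad_token_id=0, start_token_id=1, eos_token_id=2:
#
#     from collections import namedtuple
#     import numpy as np
#
#     ToyServiceTokens = namedtuple('ToyServiceTokens',
#                                   ['pad_token_id', 'start_token_id', 'eos_token_id'])
#     toy_tokens = ToyServiceTokens(pad_token_id=0, start_token_id=1, eos_token_id=2)
#     toy_dataset = Dataset(
#         x=np.array([[[5, 6, 0, 0, 0],       # first context utterance
#                      [7, 8, 9, 0, 0]]]),    # last context utterance
#         y=np.array([[1, 10, 11, 2, 0, 0]]), # <start> 10 11 <eos> <pad> <pad>
#         condition_ids=None)
#     reversed_ds = reverse_nn_input(toy_dataset, toy_tokens)
#     # reversed_ds.y[0]     == [1, 7, 8, 9, 2, 0]   (old last utterance, wrapped in start/eos)
#     # reversed_ds.x[0, -1] == [10, 11, 0, 0, 0]    (old response, start/eos stripped)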
def load_conditioned_train_set(token_to_index, condition_to_index, train_subset_size=TRAIN_SUBSET_SIZE):
    processed_corpus_path = get_processed_corpus_path(TRAIN_CORPUS_NAME)
    dialogs = load_processed_dialogs_from_json(
        FileTextLinesIterator(processed_corpus_path), text_field_name='text', condition_field_name='condition')
    if train_subset_size:
        dialogs = islice(dialogs, train_subset_size)
    train_lines, train_conditions = get_dialog_lines_and_conditions(
        get_alternated_dialogs_lines(dialogs), text_field_name='text', condition_field_name='condition')
    tokenized_alternated_train_lines = ProcessedLinesIterator(train_lines, processing_callbacks=[get_tokens_sequence])

    # prepare train set
    x_train, y_train, n_dialogs = transform_lines_to_nn_input(tokenized_alternated_train_lines, token_to_index)

    condition_ids_train = transform_conditions_to_nn_input(train_conditions, condition_to_index, n_dialogs)
    return Dataset(x=x_train, y=y_train, condition_ids=condition_ids_train)
def load_datasets(token_to_index, condition_to_index):
    train = load_conditioned_train_set(token_to_index, condition_to_index)
    validation = load_context_free_val(token_to_index)
    questions = load_questions_set(token_to_index)

    validation_set_size = validation.x.shape[0]

    train_subset = generate_subset(train, validation_set_size)

    # prepare conditioned subset
    defined_condition_mask = train.condition_ids != condition_to_index[DEFAULT_CONDITION]
    defined_condition_dataset = Dataset(
        x=train.x[defined_condition_mask],
        y=train.y[defined_condition_mask],
        condition_ids=train.condition_ids[defined_condition_mask])

    defined_condition_dataset_len = defined_condition_dataset.x.shape[0]
    defined_condition_subset = generate_subset(defined_condition_dataset,
                                               min(validation_set_size, defined_condition_dataset_len))

    return train, questions, validation, train_subset, defined_condition_subset
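# Illustrative call site (hypothetical): the five datasets are unpacked by the caller, e.g.
#
#     train, questions, validation, train_subset, defined_condition_subset = \
#         load_datasets(token_to_index, condition_to_index)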
def _slice_condition_data(dataset, condition_id):
    condition_mask = (dataset.condition_ids == condition_id)
    return Dataset(
        x=dataset.x[condition_mask], y=dataset.y[condition_mask], condition_ids=dataset.condition_ids[condition_mask])
def _make_non_conditioned(dataset):
    return Dataset(x=dataset.x, y=dataset.y, condition_ids=None)
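# Illustrative composition of the two helpers above (hypothetical usage): keep only
# the samples of one condition, then drop the condition ids to score the same
# samples in non-conditioned mode. 'some_condition' is a placeholder key.
#
#     single_condition_data = _slice_condition_data(train, condition_to_index['some_condition'])
#     non_conditioned_data = _make_non_conditioned(single_condition_data)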