def load_conditioned_dataset(corpus_name, token_to_index, condition_to_index, subset_size=None):
    processed_corpus_path = get_processed_corpus_path(corpus_name)
    dialogs = load_processed_dialogs_from_json(
        FileTextLinesIterator(processed_corpus_path),
        text_field_name='text',
        condition_field_name='condition')
    if subset_size:
        _logger.info('Slicing dataset to the first {} entries'.format(subset_size))
        dialogs = islice(dialogs, subset_size)

    train_lines, train_conditions = get_dialog_lines_and_conditions(
        get_alternated_dialogs_lines(dialogs),
        text_field_name='text',
        condition_field_name='condition')
    tokenized_alternated_train_lines = ProcessedLinesIterator(
        train_lines, processing_callbacks=[get_tokens_sequence])

    # prepare train set
    x_train, y_train, n_dialogs = transform_lines_to_nn_input(tokenized_alternated_train_lines, token_to_index)
    condition_ids_train = transform_conditions_to_nn_input(train_conditions, condition_to_index, n_dialogs)
    return Dataset(x=x_train, y=y_train, condition_ids=condition_ids_train)
def load_datasets(token_to_index, condition_to_index, test_corpus_name=CONTEXT_SENSITIVE_TEST_CORPUS_NAME):
    # load context_sensitive_test dataset
    cs_test = load_conditioned_dataset(test_corpus_name, token_to_index, condition_to_index)
    # load context_free_validation dataset
    cf_validation = load_context_free_val(token_to_index)

    # load context sensitive test set for one selected condition
    condition_mask = cs_test.condition_ids != condition_to_index[DEFAULT_CONDITION]
    conditioned_test = Dataset(
        x=cs_test.x[condition_mask],
        y=cs_test.y[condition_mask],
        condition_ids=cs_test.condition_ids[condition_mask])

    # get a subset of conditioned_test of the same size as cf_validation;
    # if conditioned_test does not have that many samples, use all of the available conditioned_test samples
    cs_test_one_condition = generate_subset(
        conditioned_test, subset_size=min(cf_validation.x.shape[0], conditioned_test.x.shape[0]))

    return create_namedtuple_instance(
        'EvalMetricsDatasets',
        cf_validation=cf_validation,
        cs_test=cs_test,
        cs_test_one_condition=cs_test_one_condition)
def load_context_sensitive_val(token_to_index, condition_to_index):
    processed_val_corpus_path = get_processed_corpus_path(CONTEXT_SENSITIVE_VAL_CORPUS_NAME)
    context_sensitive_val_dialogs = load_processed_dialogs_from_json(
        FileTextLinesIterator(processed_val_corpus_path),
        text_field_name='text',
        condition_field_name='condition')
    context_sensitive_val_dialogs = islice(context_sensitive_val_dialogs, MAX_VAL_LINES_NUM)

    alternated_context_sensitive_val_dialogs = \
        get_alternated_dialogs_lines(context_sensitive_val_dialogs)
    alternated_context_sensitive_val_lines, alternated_context_sensitive_val_conditions = \
        get_dialog_lines_and_conditions(
            alternated_context_sensitive_val_dialogs,
            text_field_name='text',
            condition_field_name='condition')
    tokenized_alternated_context_sensitive_val_lines = ProcessedLinesIterator(
        alternated_context_sensitive_val_lines, processing_callbacks=[get_tokens_sequence])

    _logger.info('Transform context sensitive validation lines to tensor of indexes')
    x_context_sensitive_val, y_context_sensitive_val, num_context_sensitive_val_dialogs = \
        transform_lines_to_nn_input(tokenized_alternated_context_sensitive_val_lines, token_to_index)
    condition_ids_context_sensitive_val = transform_conditions_to_nn_input(
        alternated_context_sensitive_val_conditions, condition_to_index, num_context_sensitive_val_dialogs)

    return Dataset(
        x=x_context_sensitive_val,
        y=y_context_sensitive_val,
        condition_ids=condition_ids_context_sensitive_val)
def _compute_likelihood_of_input_given_output(self, context, candidates, condition_id):
    # Repeat to get the same context for each candidate
    repeated_context = np.repeat(context, candidates.shape[0], axis=0)

    reversed_dataset = reverse_nn_input(
        Dataset(x=repeated_context, y=candidates, condition_ids=None), self._service_tokens_ids)
    return get_sequence_score(self._reverse_model, reversed_dataset.x, reversed_dataset.y, condition_id)
def load_context_free_val(token_to_index):
    _logger.info('Transform context free validation lines to matrix of indexes')
    tokenized_validation_lines = get_tokenized_test_lines(
        CONTEXT_FREE_VAL_CORPUS_NAME, set(token_to_index.keys()))
    tokenized_validation_lines = tokenized_validation_lines[:MAX_VAL_LINES_NUM]

    x_validation, y_validation, _ = transform_lines_to_nn_input(tokenized_validation_lines, token_to_index)
    return Dataset(x=x_validation, y=y_validation, condition_ids=None)
def _load_dataset_without_responses(corpus_name, token_to_index):
    tokenized_lines = get_tokenized_test_lines(corpus_name, set(token_to_index.keys()))
    context_tokens_ids = transform_contexts_to_token_ids(
        lines_to_context(tokenized_lines),
        token_to_index,
        INPUT_SEQUENCE_LENGTH,
        INPUT_CONTEXT_SIZE,
        max_contexts_num=len(tokenized_lines))
    return Dataset(x=context_tokens_ids, y=None, condition_ids=None)
def generate_subset(dataset, subset_size, random_seed=RANDOM_SEED):
    # Fix the random seed here so that we get the same subset every time the function is called
    np.random.seed(random_seed)

    if subset_size > dataset.x.shape[0]:
        raise ValueError('Error while generating subset of the validation data: '
                         'dataset size ({}) is less than subset size ({})'.format(
                             dataset.x.shape[0], subset_size))

    sample_idx = np.random.choice(dataset.x.shape[0], size=subset_size, replace=False)
    return Dataset(
        x=dataset.x[sample_idx],
        y=dataset.y[sample_idx] if dataset.y is not None else None,
        condition_ids=dataset.condition_ids[sample_idx] if dataset.condition_ids is not None else None)
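# Illustrative sketch, not part of the original module: a minimal usage example of generate_subset
# showing that a fixed seed makes the sampled subset reproducible. The helper name, the toy arrays
# and the subset size below are made up for the example; only Dataset, generate_subset and the
# default RANDOM_SEED come from the surrounding code.
def _example_generate_subset_usage():
    import numpy as np

    toy_dataset = Dataset(
        x=np.arange(10).reshape(10, 1),
        y=np.arange(10).reshape(10, 1),
        condition_ids=np.zeros(10, dtype=int))

    first = generate_subset(toy_dataset, subset_size=3)
    second = generate_subset(toy_dataset, subset_size=3)
    assert np.array_equal(first.x, second.x)  # same seed -> same subset on every call
    return first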
def reverse_nn_input(dataset, service_tokens):
    """
    Swaps the last utterance of x with y for each x-y pair in the dataset.
    To handle sequences of different lengths, everything is padded to the length of the longest sequence.
    """
    # Swap the last utterance of x with y, wrapping it in start- and eos-tokens
    y_output = np.full(dataset.y.shape, service_tokens.pad_token_id, dtype=dataset.y.dtype)
    for y_output_sample, x_input_sample in zip(y_output, dataset.x[:, -1]):
        # Write the start token at the first index
        y_output_sample[0] = service_tokens.start_token_id
        y_output_token_index = 1
        for value in x_input_sample:
            # Stop at pad tokens in the input sample
            if value == service_tokens.pad_token_id:
                break
            # Reserve the last token position (keep it as pad) so it can be overwritten with the eos-token below
            if y_output_token_index == y_output_sample.shape[-1] - 1:
                break
            y_output_sample[y_output_token_index] = value
            y_output_token_index += 1
        # Write the eos token right after the last non-pad token in the sample
        y_output_sample[y_output_token_index] = service_tokens.eos_token_id

    # Use utterances from y in x, stripping start- and eos-tokens
    x_output = np.full(dataset.x.shape, service_tokens.pad_token_id, dtype=dataset.x.dtype)
    for x_output_sample, x_input_sample, y_input_sample in zip(x_output, dataset.x[:, :-1], dataset.y):
        # Copy all context utterances except the last one straight to the output
        x_output_sample[:-1] = x_input_sample

        x_output_token_index = 0
        for value in y_input_sample:
            # Skip start- and eos-tokens from the input sample because we don't need them in X
            if value in {service_tokens.start_token_id, service_tokens.eos_token_id}:
                continue
            # Stop if we have already reached the end of the output sample
            # (in case the input sample is longer than the output)
            if x_output_token_index == x_output_sample.shape[-1]:
                break
            # Fill in the tokens of the last utterance of the dialog context
            x_output_sample[-1, x_output_token_index] = value
            x_output_token_index += 1

    return Dataset(x=x_output, y=y_output, condition_ids=dataset.condition_ids)
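# Illustrative sketch, not part of the original module: a toy worked example of reverse_nn_input.
# The helper name, the literal token ids and the array shapes are made up for the example; only
# Dataset, reverse_nn_input and the pad/start/eos attribute names come from the function above.
def _example_reverse_nn_input_usage():
    from collections import namedtuple
    import numpy as np

    ToyServiceTokens = namedtuple('ToyServiceTokens', ['pad_token_id', 'start_token_id', 'eos_token_id'])
    service_tokens = ToyServiceTokens(pad_token_id=0, start_token_id=1, eos_token_id=2)

    # One sample: a context of two utterances (x) and a response (y), already as token ids
    x = np.array([[[5, 6, 0, 0],      # first context utterance
                   [7, 8, 9, 0]]])    # last context utterance (becomes the new y)
    y = np.array([[1, 3, 4, 2, 0]])   # start, tokens, eos, pad (becomes the new last context utterance)

    reversed_dataset = reverse_nn_input(Dataset(x=x, y=y, condition_ids=None), service_tokens)
    # reversed_dataset.y       == [[1, 7, 8, 9, 2]]  -> old last context utterance wrapped in start/eos
    # reversed_dataset.x[0, -1] == [3, 4, 0, 0]      -> old response with start/eos stripped
    return reversed_dataset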
def load_conditioned_train_set(token_to_index, condition_to_index, train_subset_size=TRAIN_SUBSET_SIZE):
    processed_corpus_path = get_processed_corpus_path(TRAIN_CORPUS_NAME)
    dialogs = load_processed_dialogs_from_json(
        FileTextLinesIterator(processed_corpus_path),
        text_field_name='text',
        condition_field_name='condition')
    if train_subset_size:
        dialogs = islice(dialogs, train_subset_size)

    train_lines, train_conditions = get_dialog_lines_and_conditions(
        get_alternated_dialogs_lines(dialogs),
        text_field_name='text',
        condition_field_name='condition')
    tokenized_alternated_train_lines = ProcessedLinesIterator(
        train_lines, processing_callbacks=[get_tokens_sequence])

    # prepare train set
    x_train, y_train, n_dialogs = transform_lines_to_nn_input(tokenized_alternated_train_lines, token_to_index)
    condition_ids_train = transform_conditions_to_nn_input(train_conditions, condition_to_index, n_dialogs)
    return Dataset(x=x_train, y=y_train, condition_ids=condition_ids_train)
def load_datasets(token_to_index, condition_to_index):
    train = load_conditioned_train_set(token_to_index, condition_to_index)
    validation = load_context_free_val(token_to_index)
    questions = load_questions_set(token_to_index)

    validation_set_size = validation.x.shape[0]
    train_subset = generate_subset(train, validation_set_size)

    # prepare conditioned subset
    defined_condition_mask = train.condition_ids != condition_to_index[DEFAULT_CONDITION]
    defined_condition_dataset = Dataset(
        x=train.x[defined_condition_mask],
        y=train.y[defined_condition_mask],
        condition_ids=train.condition_ids[defined_condition_mask])

    defined_condition_dataset_len = defined_condition_dataset.x.shape[0]
    defined_condition_subset = generate_subset(
        defined_condition_dataset, min(validation_set_size, defined_condition_dataset_len))

    return train, questions, validation, train_subset, defined_condition_subset
def _slice_condition_data(dataset, condition_id):
    condition_mask = (dataset.condition_ids == condition_id)
    return Dataset(
        x=dataset.x[condition_mask],
        y=dataset.y[condition_mask],
        condition_ids=dataset.condition_ids[condition_mask])
def _make_non_conditioned(dataset):
    return Dataset(x=dataset.x, y=dataset.y, condition_ids=None)
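# Illustrative sketch, not part of the original module: shows how the two helpers above split a
# toy conditioned dataset. The helper name, the arrays and the condition id are made up for the
# example; only Dataset, _slice_condition_data and _make_non_conditioned come from the code above.
def _example_condition_helpers_usage():
    import numpy as np

    toy_dataset = Dataset(
        x=np.array([[1], [2], [3]]),
        y=np.array([[4], [5], [6]]),
        condition_ids=np.array([0, 1, 1]))

    only_condition_1 = _slice_condition_data(toy_dataset, condition_id=1)
    # only_condition_1.x == [[2], [3]]; its condition_ids are all 1
    non_conditioned = _make_non_conditioned(toy_dataset)
    # same x and y as toy_dataset, but with condition_ids dropped (None)
    return only_condition_1, non_conditioned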