from collections import Counter
from itertools import islice

from six.moves import map as imap
from tqdm import tqdm

# _logger, Dataset, the cakechat helpers (get_tokens_sequence, get_alternated_dialogs_lines,
# transform_lines_to_nn_input, transform_conditions_to_nn_input, get_w2v_embedding_matrix,
# is_non_empty_file) and the config constants used below come from the project's own
# modules, as in the imports of the script at the end of this file.


def _load_train_lines(corpus_name=TRAIN_CORPUS_NAME):
    processed_corpus_path = get_processed_corpus_path(corpus_name)
    dialogs = load_processed_dialogs_from_json(
        FileTextLinesIterator(processed_corpus_path),
        text_field_name='text',
        condition_field_name='condition')
    train_lines, _ = get_dialog_lines_and_conditions(
        get_alternated_dialogs_lines(dialogs),
        text_field_name='text',
        condition_field_name='condition')
    return train_lines
def load_context_sensitive_val(token_to_index, condition_to_index):
    processed_val_corpus_path = get_processed_corpus_path(CONTEXT_SENSITIVE_VAL_CORPUS_NAME)
    context_sensitive_val_dialogs = load_processed_dialogs_from_json(
        FileTextLinesIterator(processed_val_corpus_path),
        text_field_name='text',
        condition_field_name='condition')
    alternated_context_sensitive_val_dialogs = \
        get_alternated_dialogs_lines(context_sensitive_val_dialogs)
    alternated_context_sensitive_val_lines, alternated_context_sensitive_val_conditions = \
        get_dialog_lines_and_conditions(
            alternated_context_sensitive_val_dialogs,
            text_field_name='text',
            condition_field_name='condition')
    tokenized_alternated_context_sensitive_val_lines = ProcessedLinesIterator(
        alternated_context_sensitive_val_lines, processing_callbacks=[get_tokens_sequence])

    _logger.info('Transform context sensitive validation lines to tensor of indexes')
    x_context_sensitive_val, y_context_sensitive_val, num_context_sensitive_val_dialogs = \
        transform_lines_to_nn_input(tokenized_alternated_context_sensitive_val_lines, token_to_index)

    condition_ids_context_sensitive_val = transform_conditions_to_nn_input(
        alternated_context_sensitive_val_conditions, condition_to_index, num_context_sensitive_val_dialogs)

    return Dataset(
        x=x_context_sensitive_val,
        y=y_context_sensitive_val,
        condition_ids=condition_ids_context_sensitive_val)
def load_conditioned_dataset(corpus_name, token_to_index, condition_to_index, subset_size=None):
    processed_corpus_path = get_processed_corpus_path(corpus_name)
    dialogs = load_processed_dialogs_from_json(
        FileTextLinesIterator(processed_corpus_path),
        text_field_name='text',
        condition_field_name='condition')

    if subset_size:
        _logger.info('Slicing dataset to the first {} entries'.format(subset_size))
        dialogs = islice(dialogs, subset_size)

    train_lines, train_conditions = get_dialog_lines_and_conditions(
        get_alternated_dialogs_lines(dialogs),
        text_field_name='text',
        condition_field_name='condition')
    tokenized_alternated_train_lines = ProcessedLinesIterator(
        train_lines, processing_callbacks=[get_tokens_sequence])

    # prepare train set
    x_train, y_train, n_dialogs = transform_lines_to_nn_input(tokenized_alternated_train_lines, token_to_index)
    condition_ids_train = transform_conditions_to_nn_input(train_conditions, condition_to_index, n_dialogs)

    return Dataset(x=x_train, y=y_train, condition_ids=condition_ids_train)
def build_index_mappings(corpus_path,
                         max_tokens_num=VOCABULARY_MAX_SIZE,
                         max_conditions_num=MAX_CONDITIONS_NUM,
                         simple_tokenize=SIMPLE_TOKENIZE):
    if not is_non_empty_file(corpus_path):
        raise ValueError('Corpus file doesn\'t exist: {}'.format(corpus_path))

    dialogs = load_processed_dialogs_from_json(
        FileTextLinesIterator(corpus_path),
        text_field_name=TEXT_FIELD_NAME,
        condition_field_name=CONDITION_FIELD_NAME)

    tokens_counter = Counter()
    conditions_counter = Counter()

    for dialog in tqdm(dialogs):
        for utterance in dialog:
            tokens = utterance[TEXT_FIELD_NAME].split() if simple_tokenize else \
                get_tokens_sequence(utterance[TEXT_FIELD_NAME])
            tokens_counter.update(tokens)
            conditions_counter[utterance[CONDITION_FIELD_NAME]] += 1

    # Build the tokens list: special tokens first, then the most frequent corpus tokens
    vocab = list(SPECIAL_TOKENS) + \
        [token for token, _ in tokens_counter.most_common(max_tokens_num - len(SPECIAL_TOKENS))]

    # Build the conditions list, keeping only the most frequent conditions
    conditions = [condition for condition, _ in conditions_counter.most_common(max_conditions_num)]

    # Validate the condition list
    if DEFAULT_CONDITION not in conditions:
        raise Exception('No default condition "{}" found in the dataset condition list.'.format(DEFAULT_CONDITION))

    # Return index_to_token and index_to_condition mappings
    return dict(enumerate(vocab)), dict(enumerate(conditions))
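# Usage sketch (illustrative, not part of the original code): the loaders above expect
# the inverted token_to_index / condition_to_index maps, so the two dictionaries
# returned by build_index_mappings are typically flipped first:
#
#     index_to_token, index_to_condition = build_index_mappings(
#         get_processed_corpus_path(TRAIN_CORPUS_NAME))
#     token_to_index = {token: index for index, token in index_to_token.items()}
#     condition_to_index = {condition: index for index, condition in index_to_condition.items()}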
def load_conditioned_train_set(token_to_index, condition_to_index, train_subset_size=TRAIN_SUBSET_SIZE):
    processed_corpus_path = get_processed_corpus_path(TRAIN_CORPUS_NAME)
    dialogs = load_processed_dialogs_from_json(
        FileTextLinesIterator(processed_corpus_path),
        text_field_name='text',
        condition_field_name='condition')

    if train_subset_size:
        dialogs = islice(dialogs, train_subset_size)

    train_lines, train_conditions = get_dialog_lines_and_conditions(
        get_alternated_dialogs_lines(dialogs),
        text_field_name='text',
        condition_field_name='condition')
    tokenized_alternated_train_lines = ProcessedLinesIterator(
        train_lines, processing_callbacks=[get_tokens_sequence])

    # prepare train set
    x_train, y_train, n_dialogs = transform_lines_to_nn_input(tokenized_alternated_train_lines, token_to_index)
    condition_ids_train = transform_conditions_to_nn_input(train_conditions, condition_to_index, n_dialogs)

    return Dataset(x=x_train, y=y_train, condition_ids=condition_ids_train)
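# End-to-end sketch (hypothetical usage, assuming the inverted mappings from the
# note above and that the Dataset fields are numpy arrays with a .shape attribute):
#
#     train = load_conditioned_train_set(token_to_index, condition_to_index)
#     print('x: {}, y: {}, condition_ids: {}'.format(
#         train.x.shape, train.y.shape, train.condition_ids.shape))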
def _get_w2v_embedding_matrix_by_corpus_path(processed_train_corpus_path, index_to_token):
    if not USE_PRETRAINED_W2V_EMBEDDINGS_LAYER:
        return None

    _logger.info('Getting train iterator for w2v...')
    dialogs_for_w2v = load_processed_dialogs_from_json(
        FileTextLinesIterator(processed_train_corpus_path),
        text_field_name='text',
        condition_field_name='condition')

    _logger.info('Getting text-filtered train iterator...')
    train_lines_for_w2v = imap(lambda x: x['text'], get_flatten_dialogs(dialogs_for_w2v))

    _logger.info('Getting tokenized train iterator...')
    tokenized_train_lines_for_w2v = ProcessedLinesIterator(
        train_lines_for_w2v, processing_callbacks=[get_tokens_sequence])

    return get_w2v_embedding_matrix(tokenized_train_lines_for_w2v, index_to_token, add_start_end=True)
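# Illustration only: a matrix like the one returned above is commonly used to seed a
# network's word-embedding layer. Keras is assumed here purely for the sketch; this
# project may wire the matrix into its model differently:
#
#     from keras.layers import Embedding
#
#     w2v_matrix = _get_w2v_embedding_matrix_by_corpus_path(processed_corpus_path, index_to_token)
#     if w2v_matrix is not None:
#         embedding_layer = Embedding(
#             input_dim=w2v_matrix.shape[0],   # vocabulary size
#             output_dim=w2v_matrix.shape[1],  # embedding dimension
#             weights=[w2v_matrix])            # pretrained w2v vectors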
def build_index_mappings(corpus_path, max_tokens_num=MAX_TOKENS_NUM, max_conditions_num=MAX_CONDITIONS_NUM):
    # Alternative variant of build_index_mappings above, without the simple_tokenize option
    if not is_non_empty_file(corpus_path):
        raise ValueError('Corpus file doesn\'t exist: {}'.format(corpus_path))

    dialogs = load_processed_dialogs_from_json(
        FileTextLinesIterator(corpus_path),
        text_field_name=TEXT_FIELD_NAME,
        condition_field_name=CONDITION_FIELD_NAME)

    tokens_counter = Counter()
    conditions_counter = Counter()

    for dialog in dialogs:
        for utterance in dialog:
            # Tokenize dialog utterance text and update tokens count;
            # Counter.update avoids building a throwaway Counter per utterance
            tokens = get_tokens_sequence(utterance[TEXT_FIELD_NAME])
            tokens_counter.update(tokens)
            # Update conditions count
            conditions_counter[utterance[CONDITION_FIELD_NAME]] += 1

    # Build the tokens list: special tokens first, then the most frequent corpus tokens
    vocab = list(SPECIAL_TOKENS) + \
        [token for token, _ in tokens_counter.most_common(max_tokens_num - len(SPECIAL_TOKENS))]

    # Build the conditions list, keeping only the most frequent conditions
    conditions = [condition for condition, _ in conditions_counter.most_common(max_conditions_num)]

    # Validate the condition list
    if DEFAULT_CONDITION not in conditions:
        raise Exception('No default condition "{}" found in the dataset condition list.'.format(DEFAULT_CONDITION))

    # Return index_to_token and index_to_condition mappings
    return dict(enumerate(vocab)), dict(enumerate(conditions))
import os
import sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from cakechat.utils.text_processing import get_processed_corpus_path, load_processed_dialogs_from_json, \
    FileTextLinesIterator, get_dialog_lines_and_conditions, ProcessedLinesIterator, get_flatten_dialogs
from cakechat.utils.w2v.model import _get_w2v_model as get_w2v_model
from cakechat.config import TRAIN_CORPUS_NAME, VOCABULARY_MAX_SIZE, WORD_EMBEDDING_DIMENSION, W2V_WINDOW_SIZE, \
    USE_SKIP_GRAM

if __name__ == '__main__':
    processed_corpus_path = get_processed_corpus_path(TRAIN_CORPUS_NAME)
    dialogs = load_processed_dialogs_from_json(
        FileTextLinesIterator(processed_corpus_path),
        text_field_name='text',
        condition_field_name='condition')
    training_dialogs_lines_for_w2v, _ = get_dialog_lines_and_conditions(
        get_flatten_dialogs(dialogs),
        text_field_name='text',
        condition_field_name='condition')
    tokenized_training_lines = ProcessedLinesIterator(
        training_dialogs_lines_for_w2v, processing_callbacks=[str.split])

    get_w2v_model(
        tokenized_lines=tokenized_training_lines,
        corpus_name=TRAIN_CORPUS_NAME,
        voc_size=VOCABULARY_MAX_SIZE,
        vec_size=WORD_EMBEDDING_DIMENSION,
        # The original call was truncated here; the two remaining keyword names are
        # assumed from the constants imported above
        window_size=W2V_WINDOW_SIZE,
        skip_gram=USE_SKIP_GRAM)