from itertools import imap, islice, izip import numpy as np from cakechat.config import BASE_CORPUS_NAME, TRAIN_CORPUS_NAME, WORD_EMBEDDING_DIMENSION, INPUT_CONTEXT_SIZE, \ HIDDEN_LAYER_DIMENSION, ENCODER_DEPTH, DECODER_DEPTH, INPUT_SEQUENCE_LENGTH, \ OUTPUT_SEQUENCE_LENGTH, GRAD_CLIP, ADADELTA_LEARNING_RATE, TRAIN_WORD_EMBEDDINGS_LAYER, DATA_DIR, \ DENSE_DROPOUT_RATIO, USE_SKIP_GRAM, W2V_WINDOW_SIZE, CONDITION_EMBEDDING_DIMENSION, DEFAULT_CONDITION, \ S3_MODELS_BUCKET_NAME, S3_W2V_REMOTE_DIR from cakechat.utils.logger import get_logger from cakechat.utils.s3 import S3FileResolver from cakechat.utils.tee_file import file_buffered_tee from cakechat.utils.text_processing import SPECIAL_TOKENS, load_index_to_item, get_index_to_token_path from cakechat.utils.w2v import get_w2v_model, get_w2v_params_str _logger = get_logger(__name__) Dataset = namedtuple('Dataset', ['x', 'y', 'condition_ids']) def transform_conditions_to_ids(conditions, condition_to_index, n_dialogs): condition_ids_iterator = imap( lambda condition: condition_to_index.get(condition, condition_to_index[DEFAULT_CONDITION]), conditions) condition_ids = np.full(n_dialogs, condition_to_index[DEFAULT_CONDITION], dtype=np.int32) for sample_idx, condition_id in enumerate(condition_ids_iterator): condition_ids[sample_idx] = condition_id return condition_ids def lines_to_context(tokenized_lines): for line in tokenized_lines:
import codecs
import json
from copy import copy

from cakechat.utils.logger import get_logger

_logger = get_logger(__name__)


class FileTextLinesIterator(object):
    """
    Re-iterable view over the lines of a text file.

    Every call to ``__iter__`` opens the file again, so the same object can be iterated more than once.
    """

    def __init__(self, filename, encoding='utf-8'):
        """
        :param filename: path of the file to read
        :param encoding: codec name passed to ``codecs.open`` when the file is opened
        """
        self._filename = filename
        self._encoding = encoding

    def __iter__(self):
        """Yield every line of the file with leading and trailing whitespace stripped."""
        # The context manager closes the handle once iteration finishes or the generator is closed,
        # instead of leaving the open file to the garbage collector.
        with codecs.open(self._filename, 'r', self._encoding) as lines:
            for line in lines:
                yield line.strip()

    def __copy__(self):
        """Return a new iterator over the same file that keeps the configured encoding."""
        # The encoding has to be forwarded explicitly, otherwise the copy falls back to the 'utf-8' default
        return FileTextLinesIterator(self._filename, self._encoding)


class ProcessedLinesIterator(object):
    def __init__(self, lines_iter, processing_callbacks=None):
        self._lines_iter = lines_iter
        self._processing_callbacks = processing_callbacks if processing_callbacks else []

    def __iter__(self):
        for line in self._lines_iter:
            for callback in self._processing_callbacks:
                line = callback(line)
def __init__(self, bucket_client):
    """
    Store the bucket client on the instance and create a logger named after this module.

    :param bucket_client: client object kept as-is in `self._bucket_client`; it is not used or validated here
    """
    module_logger = get_logger(__name__)
    self._logger = module_logger
    self._bucket_client = bucket_client