示例#1
0
from itertools import imap, islice, izip

import numpy as np

from cakechat.config import BASE_CORPUS_NAME, TRAIN_CORPUS_NAME, WORD_EMBEDDING_DIMENSION, INPUT_CONTEXT_SIZE, \
    HIDDEN_LAYER_DIMENSION, ENCODER_DEPTH, DECODER_DEPTH, INPUT_SEQUENCE_LENGTH, \
    OUTPUT_SEQUENCE_LENGTH, GRAD_CLIP, ADADELTA_LEARNING_RATE, TRAIN_WORD_EMBEDDINGS_LAYER, DATA_DIR, \
    DENSE_DROPOUT_RATIO, USE_SKIP_GRAM, W2V_WINDOW_SIZE, CONDITION_EMBEDDING_DIMENSION, DEFAULT_CONDITION, \
    S3_MODELS_BUCKET_NAME, S3_W2V_REMOTE_DIR
from cakechat.utils.logger import get_logger
from cakechat.utils.s3 import S3FileResolver
from cakechat.utils.tee_file import file_buffered_tee
from cakechat.utils.text_processing import SPECIAL_TOKENS, load_index_to_item, get_index_to_token_path
from cakechat.utils.w2v import get_w2v_model, get_w2v_params_str

# Module-level logger obtained from the project's logging helper.
_logger = get_logger(__name__)

# Record type with three named fields: `x`, `y` and `condition_ids`.
# NOTE(review): `namedtuple` is not imported in the visible part of this file --
# confirm that `from collections import namedtuple` is present.
Dataset = namedtuple('Dataset', ['x', 'y', 'condition_ids'])


def transform_conditions_to_ids(conditions, condition_to_index, n_dialogs):
    """Map condition values to their integer ids.

    :param conditions: iterable of condition values; it is consumed once, in order.
        Presumably yields at most ``n_dialogs`` items -- extra items would index
        past the end of the result array.
    :param condition_to_index: mapping from condition value to integer id; it must
        contain an entry for ``DEFAULT_CONDITION``.
    :param n_dialogs: shape passed to ``np.full`` when allocating the result.
    :return: ``np.int32`` array in which position ``i`` holds the id of the i-th
        condition. Conditions missing from ``condition_to_index`` and positions
        for which no condition was supplied hold the id of ``DEFAULT_CONDITION``.
    """
    fallback_id = condition_to_index[DEFAULT_CONDITION]

    # Pre-fill with the fallback id so that unfilled positions keep it.
    ids = np.full(n_dialogs, fallback_id, dtype=np.int32)
    for position, condition in enumerate(conditions):
        ids[position] = condition_to_index.get(condition, fallback_id)

    return ids


def lines_to_context(tokenized_lines):
    for line in tokenized_lines:
示例#2
0
import codecs
import json
from copy import copy

from cakechat.utils.logger import get_logger

# Module-level logger obtained from the project's logging helper.
_logger = get_logger(__name__)


class FileTextLinesIterator(object):
    """Re-iterable view over the lines of a text file.

    The file is opened anew on every call to ``__iter__``, so one instance can be
    iterated several times. Every yielded line has its leading and trailing
    whitespace (including the line terminator) stripped.
    """

    def __init__(self, filename, encoding='utf-8'):
        """Remember which file to read and how to decode it.

        :param filename: path of the file; it is only opened when iteration starts.
        :param encoding: codec name handed to ``codecs.open``.
        """
        self._filename = filename
        self._encoding = encoding

    def __iter__(self):
        """Yield the stripped lines of the file one by one."""
        # The context manager closes the handle when iteration finishes, fails,
        # or the generator is closed before being exhausted.
        with codecs.open(self._filename, 'r', self._encoding) as lines_file:
            for line in lines_file:
                yield line.strip()

    def __copy__(self):
        """Return a new iterator over the same file using the same encoding."""
        # Forward the encoding explicitly: without it the copy would silently
        # fall back to the constructor default ('utf-8').
        return FileTextLinesIterator(self._filename, self._encoding)


class ProcessedLinesIterator(object):
    def __init__(self, lines_iter, processing_callbacks=None):
        self._lines_iter = lines_iter
        self._processing_callbacks = processing_callbacks if processing_callbacks else []

    def __iter__(self):
        for line in self._lines_iter:
            for callback in self._processing_callbacks:
                line = callback(line)
示例#3
0
 def __init__(self, bucket_client):
     """Set up the instance logger and keep a reference to the bucket client.

     :param bucket_client: client object stored on the instance as ``_bucket_client``.
     """
     module_logger = get_logger(__name__)
     self._logger = module_logger
     self._bucket_client = bucket_client