def load_datasets(token_to_index, condition_to_index, test_corpus_name=CONTEXT_SENSITIVE_TEST_CORPUS_NAME):
    # load the context-sensitive test dataset
    cs_test = load_conditioned_dataset(test_corpus_name, token_to_index, condition_to_index)
    # load the context-free validation dataset
    cf_validation = load_context_free_val(token_to_index)

    # select the context-sensitive test samples that carry a non-default condition
    condition_mask = cs_test.condition_ids != condition_to_index[DEFAULT_CONDITION]
    conditioned_test = Dataset(
        x=cs_test.x[condition_mask],
        y=cs_test.y[condition_mask],
        condition_ids=cs_test.condition_ids[condition_mask])

    # take a subset of conditioned_test of the same size as cf_validation;
    # if conditioned_test has fewer samples than that, use all of the available samples
    cs_test_one_condition = generate_subset(
        conditioned_test, subset_size=min(cf_validation.x.shape[0], conditioned_test.x.shape[0]))

    return create_namedtuple_instance(
        'EvalMetricsDatasets',
        cf_validation=cf_validation,
        cs_test=cs_test,
        cs_test_one_condition=cs_test_one_condition)
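# Usage sketch (illustrative, not part of the original module): how this loader might be
# called, assuming the corpus files referenced by the project config are present and the
# condition mapping is built from the EMOTIONS_TYPES namedtuple defined in the config.
# The toy token list and variable names below are assumptions for demonstration only.
token_to_index = {token: idx for idx, token in enumerate(['_pad_', '_unk_', '_start_', '_end_', 'hello', 'world'])}
condition_to_index = {condition: idx for idx, condition in enumerate(EMOTIONS_TYPES)}

eval_datasets = load_datasets(token_to_index, condition_to_index)
print(eval_datasets.cf_validation.x.shape)          # context-free validation inputs
print(eval_datasets.cs_test_one_condition.x.shape)  # test subset restricted to non-default conditions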
TRAIN_WORD_EMBEDDINGS_LAYER = True  # Allow fine-tuning of the word embedding layer during the model training
W2V_MODEL_DIR = os.path.join(DATA_DIR, 'w2v_models')  # Path to store & load trained word2vec models
WORD_EMBEDDING_DIMENSION = 128  # word2vec embedding dimension
W2V_WINDOW_SIZE = 10  # word2vec window size, used during the w2v pre-training
USE_SKIP_GRAM = True  # Use skip-gram word2vec mode. When False, CBOW is used
MIN_WORD_FREQ = 1  # Minimum frequency of a word to be used in word2vec pre-calculation

# condition inputs. We use five major emotions to condition our model's predictions
# original emotions:
# EMOTIONS_TYPES = create_namedtuple_instance(
#     'EMOTIONS_TYPES', neutral='neutral', anger='anger', joy='joy', fear='fear', disgust='disgust')
# TODO: here we use the emotions {"0": "neutral", "1": "anger", "2": "joy", "3": "laugh", "4": "disgust"}
EMOTIONS_TYPES = create_namedtuple_instance(
    'EMOTIONS_TYPES', neutral='neutral', anger='anger', joy='joy', laugh='laugh', disgust='disgust')
DEFAULT_CONDITION = EMOTIONS_TYPES.joy  # Default condition to be used during the prediction (if not specified)
CONDITION_EMBEDDING_DIMENSION = 128  # Conditions embedding layer dimension to be trained

# NN architecture params
ENCODER_DEPTH = 2  # Number of recurrent (GRU) layers for the encoder
DECODER_DEPTH = 2  # Number of recurrent (GRU) layers for the decoder
HIDDEN_LAYER_DIMENSION = 512  # Dimension for the recurrent layers
DENSE_DROPOUT_RATIO = 0.2  # Use dropout with the given ratio before the decoder's output

# training params
INPUT_SEQUENCE_LENGTH = 30  # Input sequence length for the model during the training
INPUT_CONTEXT_SIZE = 3  # Maximum depth of the conversational history to be used in the encoder (at least 1)
OUTPUT_SEQUENCE_LENGTH = 32  # Output sequence length. Better to keep as INPUT_SEQUENCE_LENGTH + 2 for start/end tokens
from cakechat.utils.data_structures import create_namedtuple_instance

SPECIAL_TOKENS = create_namedtuple_instance(
    'SPECIAL_TOKENS', PAD_TOKEN=u'_pad_', UNKNOWN_TOKEN=u'_unk_', START_TOKEN=u'_start_', EOS_TOKEN=u'_end_')

DIALOG_TEXT_FIELD = 'text'
DIALOG_CONDITION_FIELD = 'condition'
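# Illustrative helper (hypothetical, not part of the codebase) showing how the special
# tokens are meant to be used: _start_/_end_ wrap a response and _pad_ fills the sequence
# up to a fixed length, matching the OUTPUT_SEQUENCE_LENGTH = INPUT_SEQUENCE_LENGTH + 2
# convention noted in the training config.
def pad_and_wrap(tokens, max_len):
    wrapped = [SPECIAL_TOKENS.START_TOKEN] + list(tokens)[:max_len - 2] + [SPECIAL_TOKENS.EOS_TOKEN]
    return wrapped + [SPECIAL_TOKENS.PAD_TOKEN] * (max_len - len(wrapped))

print(pad_and_wrap(['hi', 'there'], max_len=6))
# ['_start_', 'hi', 'there', '_end_', '_pad_', '_pad_']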
def __init__(self,
             index_to_token,
             index_to_condition,
             training_data_param,
             validation_data_param,
             w2v_model_param,
             model_init_path=None,
             model_resolver=None,
             model_name=MODEL_NAME,
             corpus_name=BASE_CORPUS_NAME,
             skip_token=SPECIAL_TOKENS.PAD_TOKEN,
             token_embedding_dim=WORD_EMBEDDING_DIMENSION,
             train_token_embedding=TRAIN_WORD_EMBEDDINGS_LAYER,
             condition_embedding_dim=CONDITION_EMBEDDING_DIMENSION,
             input_seq_len=INPUT_SEQUENCE_LENGTH,
             input_context_size=INPUT_CONTEXT_SIZE,
             output_seq_len=OUTPUT_SEQUENCE_LENGTH,
             hidden_layer_dim=HIDDEN_LAYER_DIMENSION,
             use_cudnn=USE_CUDNN,
             dense_dropout_ratio=DENSE_DROPOUT_RATIO,
             is_reverse_model=False,
             reverse_model=None,
             learning_rate=LEARNING_RATE,
             grad_clip=GRAD_CLIP,
             batch_size=BATCH_SIZE,
             epochs_num=EPOCHS_NUM,
             horovod=None,
             tensorboard_log_dir=TENSORBOARD_LOG_DIR,
             log_run_metadata=LOG_RUN_METADATA):
    """
    :param index_to_token: Dict mapping token indices to tokens
    :param index_to_condition: Dict mapping condition indices to condition values
    :param training_data_param: Instance of ModelParam, a tuple (value, id) where value is the dataset used for
        training and id is the name of this dataset
    :param validation_data_param: Instance of ModelParam, a tuple (value, id) where value is the dataset used for
        metrics calculation and id is a concatenation of these datasets' names
    :param w2v_model_param: Instance of ModelParam, a tuple (value, id) where value is a word2vec matrix of shape
        (vocab_size, token_embedding_dim) with float values, used for initializing the token embedding layers,
        and id is the name of the word2vec model
    :param model_init_path: Path to a file with the model's saved weights, used for layer initialization
    :param model_resolver: Factory that takes a model path and returns a file resolver object
    :param model_name: String prefix prepended to the automatically generated model name. The prefix helps
        distinguish the current experiment from other experiments with similar params.
    :param corpus_name: File name of the training dataset (included in the automatically generated model name)
    :param skip_token: Token to skip with masking, usually the _pad_ token.
        The id of this token is inferred from the index_to_token dictionary
    :param token_embedding_dim: Dimensionality of the token embedding vectors
    :param train_token_embedding: Bool indicating whether to train token embeddings along with the other model
        weights or keep them frozen during training
    :param condition_embedding_dim: Dimensionality of the condition embedding vectors
    :param input_seq_len: Max number of tokens in the context sentences
    :param input_context_size: Max number of sentences in the context
    :param output_seq_len: Max number of tokens in the output sentences
    :param hidden_layer_dim: Dimensionality of the hidden GRU and Dense layers
    :param dense_dropout_ratio: Float between 0 and 1, the ratio of neurons randomly deactivated during training
        to prevent overfitting
    :param is_reverse_model: Bool indicating the type of model:
        False (regular model) - predicts a response for the given context;
        True (reverse model) - predicts the context for the given response (actually, predicts the last context
        sentence for the given response and the beginning of the context) - used for calculating the Maximum
        Mutual Information metric
    :param reverse_model: Trained reverse model used to generate predictions in *_reranking modes
    :param learning_rate: Learning rate of the optimization algorithm
    :param grad_clip: Clipping parameter of the optimization algorithm, used to prevent gradient explosion
    :param batch_size: Number of samples used for gradient estimation on each training step
    :param epochs_num: Number of full dataset passes during training
    :param horovod: Initialized horovod module used for multi-GPU training. Trains on a single GPU if horovod=None
    :param tensorboard_log_dir: Path to the tensorboard logs directory
    :param log_run_metadata: Set to True to profile memory consumption and computation time on tensorboard
    """
    # calculate the number of batches in each epoch;
    # the last batch, which may be smaller than batch_size, is included in this number
    batches_num_per_epoch = math.ceil(training_data_param.value.x.shape[0] / batch_size) \
        if training_data_param.value else None

    # create callbacks
    callbacks = self._create_essential_callbacks(self, horovod)
    callbacks.extend([
        # custom callback for metrics calculation
        CakeChatEvaluatorCallback(self, index_to_token, batch_size, batches_num_per_epoch)
    ])

    super(CakeChatModel, self).__init__(
        model_resolver_factory=model_resolver,
        metrics_plotter=TensorboardMetricsPlotter(tensorboard_log_dir),
        horovod=horovod,
        training_callbacks=callbacks)
    WithLogger.__init__(self)

    self._model_name = 'reverse_{}'.format(model_name) if is_reverse_model else model_name
    self._rnn_class = CuDNNGRU if use_cudnn else partial(GRU, reset_after=True)

    # tokens params
    self._index_to_token = index_to_token
    self._token_to_index = {v: k for k, v in index_to_token.items()}
    self._vocab_size = len(self._index_to_token)
    self._skip_token_id = self._token_to_index[skip_token]
    self._token_embedding_dim = token_embedding_dim
    self._train_token_embedding = train_token_embedding
    self._W_init_embedding = \
        self._build_embedding_matrix(self._token_to_index, w2v_model_param.value, token_embedding_dim) \
        if w2v_model_param.value else None

    # condition params
    self._index_to_condition = index_to_condition
    self._condition_to_index = {v: k for k, v in index_to_condition.items()}
    self._condition_embedding_dim = condition_embedding_dim

    # data params
    self._training_data = training_data_param.value
    self._validation_data = validation_data_param.value

    # train params
    self._batches_num_per_epoch = batches_num_per_epoch
    self._model_init_path = model_init_path
    self._horovod = horovod
    self._optimizer = optimizers.Adadelta(lr=learning_rate, clipvalue=grad_clip)
    if self._horovod:
        self._optimizer = horovod.DistributedOptimizer(self._optimizer)

    # gather the model params that define the experiment setting
    self._params = create_namedtuple_instance(
        name='Params',
        corpus_name=corpus_name,
        input_context_size=input_context_size,
        input_seq_len=input_seq_len,
        output_seq_len=output_seq_len,
        token_embedding_dim=token_embedding_dim,
        train_batch_size=batch_size,
        hidden_layer_dim=hidden_layer_dim,
        w2v_model=w2v_model_param.id,
        is_reverse_model=is_reverse_model,
        dense_dropout_ratio=dense_dropout_ratio,
        voc_size=len(self._token_to_index),
        training_data=training_data_param.id,
        validation_data=validation_data_param.id,
        epochs_num=epochs_num,
        optimizer=self._optimizer.get_config())

    # profiling params
    self._run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) if log_run_metadata else None
    self._run_metadata = tf.RunMetadata() if log_run_metadata else None

    # parts of the computational graph
    self._models = None

    # trained reverse model used for inference
    self._reverse_model = reverse_model
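# Construction sketch (assumption, for illustration only): the docstring above describes
# each *_param argument as a ModelParam with (value, id) fields. The stand-in namedtuple
# below shows that shape; the id strings are made up, and value=None simply means
# "no data provided", in which case the constructor skips embedding-matrix initialization
# and leaves batches_num_per_epoch as None.
from collections import namedtuple

ModelParam = namedtuple('ModelParam', ['value', 'id'])

w2v_model_param = ModelParam(value=None, id='w2v_example_id')
training_data_param = ModelParam(value=None, id='train_corpus_example_id')
validation_data_param = ModelParam(value=None, id='cf_validation_and_cs_test_example_id')

print(training_data_param.id, w2v_model_param.value)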
TEST_DATA_DIR = os.path.join(DATA_DIR, 'quality')  # Path to datasets for quality metrics calculation

CONTEXT_FREE_VAL_CORPUS_NAME = 'context_free_validation_set'  # Context-free validation set path
TEST_CORPUS_NAME = 'context_free_test_set'  # Context-free test set path
QUESTIONS_CORPUS_NAME = 'context_free_questions'  # Context-free questions only path

# word embeddings params
USE_PRETRAINED_W2V_EMBEDDINGS_LAYER = True  # Whether to use word2vec to pre-train weights for the embedding layer
TRAIN_WORD_EMBEDDINGS_LAYER = True  # Allow fine-tuning of the word embedding layer during the model training
W2V_MODEL_DIR = os.path.join(DATA_DIR, 'w2v_models')  # Path to store & load trained word2vec models
WORD_EMBEDDING_DIMENSION = 128  # word2vec embedding dimension
W2V_WINDOW_SIZE = 10  # word2vec window size, used during the w2v pre-training
USE_SKIP_GRAM = True  # Use skip-gram word2vec mode. When False, CBOW is used
MIN_WORD_FREQ = 1  # Minimum frequency of a word to be used in word2vec pre-calculation

# condition inputs. We use five major emotions to condition our model's predictions
EMOTIONS_TYPES = create_namedtuple_instance(
    'EMOTIONS_TYPES', neutral='neutral', anger='anger', joy='joy', fear='fear', sadness='sadness')
DEFAULT_CONDITION = EMOTIONS_TYPES.neutral  # Default condition to be used during the prediction (if not specified)
CONDITION_EMBEDDING_DIMENSION = 128  # Conditions embedding layer dimension to be trained

# NN architecture params
ENCODER_DEPTH = 2  # Number of recurrent (GRU) layers for the encoder
DECODER_DEPTH = 2  # Number of recurrent (GRU) layers for the decoder
HIDDEN_LAYER_DIMENSION = 512  # Dimension for the recurrent layers
DENSE_DROPOUT_RATIO = 0.2  # Use dropout with the given ratio before the decoder's output

# training params
INPUT_SEQUENCE_LENGTH = 30  # Input sequence length for the model during the training
INPUT_CONTEXT_SIZE = 3  # Maximum depth of the conversational history to be used in the encoder (at least 1)
OUTPUT_SEQUENCE_LENGTH = 32  # Output sequence length. Better to keep as INPUT_SEQUENCE_LENGTH + 2 for start/end tokens
BATCH_SIZE = 192  # Default batch size which fits into 8GB of GPU memory
SHUFFLE_TRAINING_BATCHES = True  # Shuffle training batches in the dataset each epoch
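# Sketch of how the word2vec hyperparameters above would map onto gensim's Word2Vec,
# assuming gensim 4.x is installed (older releases take size= instead of vector_size=).
# The two-line toy corpus and the output filename are illustrative only.
from gensim.models import Word2Vec

toy_corpus = [['hello', 'how', 'are', 'you'], ['i', 'am', 'fine', 'thanks']]

w2v_model = Word2Vec(
    sentences=toy_corpus,
    vector_size=WORD_EMBEDDING_DIMENSION,  # 128-dimensional embeddings
    window=W2V_WINDOW_SIZE,                # context window of 10 tokens
    min_count=MIN_WORD_FREQ,               # keep every token that occurs at least once
    sg=1 if USE_SKIP_GRAM else 0)          # 1 = skip-gram, 0 = CBOW

os.makedirs(W2V_MODEL_DIR, exist_ok=True)
w2v_model.save(os.path.join(W2V_MODEL_DIR, 'example.w2v'))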
TEST_DATA_DIR = os.path.join(DATA_DIR, 'quality')  # Path to datasets for quality metrics calculation

CONTEXT_FREE_VAL_CORPUS_NAME = 'context_free_validation_set'  # Context-free validation set path
TEST_CORPUS_NAME = 'context_free_test_set'  # Context-free test set path
QUESTIONS_CORPUS_NAME = 'context_free_questions'  # Context-free questions only path

# word embeddings params
USE_PRETRAINED_W2V_EMBEDDINGS_LAYER = True  # Whether to use word2vec to pre-train weights for the embedding layer
TRAIN_WORD_EMBEDDINGS_LAYER = True  # Allow fine-tuning of the word embedding layer during the model training
W2V_MODEL_DIR = os.path.join(DATA_DIR, 'w2v_models')  # Path to store & load trained word2vec models
WORD_EMBEDDING_DIMENSION = 128  # word2vec embedding dimension
W2V_WINDOW_SIZE = 10  # word2vec window size, used during the w2v pre-training
USE_SKIP_GRAM = True  # Use skip-gram word2vec mode. When False, CBOW is used
MIN_WORD_FREQ = 1  # Minimum frequency of a word to be used in word2vec pre-calculation

# condition inputs. In this variant, the two speaker roles (therapist/client) condition the model's predictions
EMOTIONS_TYPES = create_namedtuple_instance(
    'EMOTIONS_TYPES', therapist='thera', client='client')
DEFAULT_CONDITION = EMOTIONS_TYPES.therapist  # Default condition to be used during the prediction (if not specified)
CONDITION_EMBEDDING_DIMENSION = 128  # Conditions embedding layer dimension to be trained

# NN architecture params
ENCODER_DEPTH = 2  # Number of recurrent (GRU) layers for the encoder
DECODER_DEPTH = 2  # Number of recurrent (GRU) layers for the decoder
HIDDEN_LAYER_DIMENSION = 512  # Dimension for the recurrent layers
DENSE_DROPOUT_RATIO = 0.2  # Use dropout with the given ratio before the decoder's output

# training params
INPUT_SEQUENCE_LENGTH = 30  # Input sequence length for the model during the training
INPUT_CONTEXT_SIZE = 3  # Maximum depth of the conversational history to be used in the encoder (at least 1)
OUTPUT_SEQUENCE_LENGTH = 32  # Output sequence length. Better to keep as INPUT_SEQUENCE_LENGTH + 2 for start/end tokens
BATCH_SIZE = 192  # Default batch size which fits into 8GB of GPU memory
SHUFFLE_TRAINING_BATCHES = True  # Shuffle training batches in the dataset each epoch