Example #1
def get_encoder(input_idx, input_one_hot_embeddings, nfilter, z_size,
                intermediate_dim, one_hot_embeddings):
    # oshape = (batch_size, 300): the LSTM returns only its final state
    lstm = LSTM(units=300,
                dropout=0.2,
                recurrent_dropout=0.2,
                name='encoding_lstm',
                go_backwards=True,
                implementation=2)(one_hot_embeddings)
    hidden_mean = Dense(z_size, name='mu')(lstm)
    hidden_log_sigma = Dense(z_size, name='sigma')(lstm)

    sampling_object = Sampling(z_size)
    sampling = sampling_object([hidden_mean, hidden_log_sigma])

    discr_encoder = Model(inputs=one_hot_embeddings,
                          outputs=[sampling, hidden_mean, hidden_log_sigma],
                          name='discr_encoder')

    z_p, z_mean, z_sigma = discr_encoder(input_one_hot_embeddings)
    encoder = Model(inputs=input_idx,
                    outputs=[z_p, z_mean, z_sigma],
                    name='encoder')

    return encoder, discr_encoder
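
All of these examples depend on a custom `Sampling` layer whose definition is not shown. Below is a minimal sketch of what it presumably looks like, assuming it applies the standard VAE reparameterization trick to a `[mu, log_sigma]` pair and keeps both tensors as attributes (the loss functions in the later examples read `sampling_object.mu` and `sampling_object.log_sigma`). This is a reconstruction, not the original class:

from keras import backend as K
from keras.layers import Layer

class Sampling(Layer):
    # Hypothetical reconstruction: z = mu + exp(log_sigma / 2) * epsilon.
    def __init__(self, z_size, **kwargs):
        self.z_size = z_size
        super(Sampling, self).__init__(**kwargs)

    def call(self, inputs):
        # Expects [mu, log_sigma]; keep them around for the KL loss terms.
        self.mu, self.log_sigma = inputs
        epsilon = K.random_normal(shape=K.shape(self.mu))
        return self.mu + K.exp(self.log_sigma / 2) * epsilon

    def compute_output_shape(self, input_shape):
        return input_shape[0]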
Example #2
def get_encoder(config_data, input_idx, input_one_hot_embeddings, nfilter,
                name, z_size):
    intermediate_dim = config_data['intermediate_dim']

    conv1 = Conv1D(filters=nfilter, kernel_size=3, strides=2,
                   padding='same')(input_one_hot_embeddings)
    bn1 = BatchNormalization(scale=False)(conv1)
    relu1 = PReLU()(bn1)
    # oshape = (batch_size, sample_size/4, 256)
    conv2 = Conv1D(filters=2 * nfilter,
                   kernel_size=3,
                   strides=2,
                   padding='same')(relu1)
    bn2 = BatchNormalization(scale=False)(conv2)
    relu2 = PReLU()(bn2)
    # oshape = (batch_size, sample_size/4*256)
    flatten = Flatten()(relu2)
    # need to store the size of the representation after the convolutions -> needed for deconv later
    hidden_intermediate_enc = Dense(intermediate_dim,
                                    name='intermediate_encoding')(flatten)
    hidden_zvalues = Dense(z_size * 2)(hidden_intermediate_enc)

    sampling_object = Sampling(z_size)
    sampling = sampling_object(hidden_zvalues)

    encoder = Model(inputs=input_idx,
                    outputs=sampling,
                    name='encoder_{}'.format(name))

    return encoder, sampling_object
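
Example #2 (and several of the `vae_model` variants below) calls `Sampling` with a single tensor of width `z_size * 2` rather than a `[mu, log_sigma]` pair, so under that calling convention the layer has to split the tensor itself. A hedged sketch of the corresponding `call` method, again a reconstruction rather than the original code:

    def call(self, hidden_zvalues):
        # Single-tensor variant: first half is mu, second half log_sigma.
        self.mu = hidden_zvalues[:, :self.z_size]
        self.log_sigma = hidden_zvalues[:, self.z_size:]
        epsilon = K.random_normal(shape=K.shape(self.mu))
        return self.mu + K.exp(self.log_sigma / 2) * epsilon

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.z_size)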
Example #3
def get_encoder(inputs, name_one_hot_embeddings, near_one_hot_embeddings, nfilter, z_size, intermediate_dim):
    name_idx, eat_type_idx, price_range_idx, customer_feedback_idx, near_idx, food_idx, area_idx, family_idx, _ = inputs

    #name_conv = get_conv_stack(name_one_hot_embeddings, nfilter)
    #near_conv = get_conv_stack(near_one_hot_embeddings, nfilter)

    #name_hidden = Dense(units=16, activation='relu')(name_conv)
    #near_hidden = Dense(units=16, activation='relu')(near_conv)

    full_concat = concatenate(inputs=[name_idx, near_idx, eat_type_idx, price_range_idx, customer_feedback_idx, food_idx, area_idx, family_idx])

    # intermediate dense encoding of the concatenated feature inputs
    hidden_intermediate_enc = Dense(
        intermediate_dim,
        name='intermediate_encoding'
    )(full_concat)
    hidden_mean = Dense(z_size, name='mu')(hidden_intermediate_enc)
    hidden_log_sigma = Dense(z_size, name='sigma')(hidden_intermediate_enc)

    sampling_object = Sampling(z_size)
    sampling = sampling_object([hidden_mean, hidden_log_sigma])

    encoder = Model(inputs=inputs[:-1], outputs=[sampling, hidden_mean, hidden_log_sigma])
    encoder.summary()

    return encoder, [hidden_mean, hidden_log_sigma], full_concat
Example #4
def get_encoder(input_idx, input_one_hot_embeddings, nfilter, z_size,
                intermediate_dim):
    # oshape = (batch_size, sample_size/2, 128)
    conv1 = Conv1D(filters=nfilter, kernel_size=3, strides=2,
                   padding='same')(input_one_hot_embeddings)
    bn1 = BatchNormalization()(conv1)
    relu1 = Activation('relu')(bn1)
    # oshape = (batch_size, sample_size/4, 256)
    conv2 = Conv1D(filters=2 * nfilter,
                   kernel_size=3,
                   strides=2,
                   padding='same')(relu1)
    bn2 = BatchNormalization()(conv2)
    relu2 = Activation('relu')(bn2)
    conv3 = Conv1D(
        filters=2 * nfilter,
        kernel_size=3,
        strides=2,
        padding='same',
    )(relu2)
    bn3 = BatchNormalization()(conv3)
    relu3 = Activation('relu')(bn3)
    # oshape = (batch_size, sample_size/8*256)
    flatten = Flatten()(relu3)
    # need to store the size of the representation after the convolutions -> needed for deconv later
    hidden_intermediate_enc = Dense(intermediate_dim,
                                    name='intermediate_encoding')(flatten)
    hidden_mean = Dense(z_size, name='mu')(hidden_intermediate_enc)
    hidden_log_sigma = Dense(z_size, name='sigma')(hidden_intermediate_enc)

    sampling_object = Sampling(z_size)
    sampling = sampling_object([hidden_mean, hidden_log_sigma])

    encoder = Model(inputs=input_idx,
                    outputs=[sampling, hidden_mean, hidden_log_sigma])

    return encoder
Example #5
def vae_model(config_data, vocab, step):
    z_size = config_data['z_size']
    sample_size = config_data['max_sentence_length']
    nclasses = len(vocab) + 2
    #last available index is reserved as start character
    start_word_idx = nclasses - 1
    lstm_size = config_data['lstm_size']
    alpha = config_data['alpha']
    intermediate_dim = config_data['intermediate_dim']
    nfilter = 128
    out_size = 200
    eps = 0.001
    anneal_start = 1000.0
    anneal_end = anneal_start + 7000.0
    # ===============
    # Define Encoder
    # ===============
    input_idx = Input(batch_shape=(None, sample_size),
                      dtype='float32',
                      name='character_input')

    one_hot_weights = np.identity(nclasses)
    #oshape = (batch_size, sample_size, nclasses)
    one_hot_embeddings = Embedding(input_length=sample_size,
                                   input_dim=nclasses,
                                   output_dim=nclasses,
                                   weights=[one_hot_weights],
                                   trainable=False,
                                   name='one_hot_embeddings')

    input_one_hot_embeddings = one_hot_embeddings(input_idx)
    #oshape = (batch_size, sample_size/2, 128)
    conv1 = Conv1D(filters=nfilter, kernel_size=3, strides=2,
                   padding='same')(input_one_hot_embeddings)
    bn1 = BatchNormalization()(conv1)
    relu1 = Activation(activation='relu')(bn1)
    # oshape = (batch_size, sample_size/4, 256)
    conv2 = Conv1D(filters=2 * nfilter,
                   kernel_size=3,
                   strides=2,
                   padding='same')(relu1)
    bn2 = BatchNormalization()(conv2)
    relu2 = Activation(activation='relu')(bn2)
    #oshape = (batch_size, sample_size/4*256)
    flatten = Flatten()(relu2)
    #need to store the size of the representation after the convolutions -> needed for deconv later
    hidden_intermediate_enc = Dense(intermediate_dim,
                                    activation='relu',
                                    name='intermediate_encoding')(flatten)
    hidden_zvalues = Dense(z_size * 2)(hidden_intermediate_enc)
    sampling_object = Sampling(z_size)
    sampling = sampling_object(hidden_zvalues)

    # ===============
    # Define Decoder
    # ===============
    hidden_intermediate_dec = Dense(intermediate_dim,
                                    name='intermediate_decoding')(sampling)
    decoder_upsample = Dense(int(2 * nfilter * sample_size /
                                 4))(hidden_intermediate_dec)
    if K.image_data_format() == 'channels_first':
        output_shape = (2 * nfilter, int(sample_size / 4), 1)
    else:
        output_shape = (int(sample_size / 4), 1, 2 * nfilter)
    reshape = Reshape(output_shape)(decoder_upsample)
    #shape = (batch_size, filters)
    deconv1 = Conv2DTranspose(filters=nfilter,
                              kernel_size=(3, 1),
                              strides=(2, 1),
                              padding='same')(reshape)
    bn3 = BatchNormalization()(deconv1)
    relu3 = Activation(activation='relu')(bn3)
    deconv2 = Conv2DTranspose(filters=out_size,
                              kernel_size=(3, 1),
                              strides=(2, 1),
                              padding='same')(relu3)
    bn4 = BatchNormalization()(deconv2)
    relu4 = Activation(activation='relu')(bn4)
    reshape = Reshape((sample_size, out_size))(relu4)
    softmax = Dense(nclasses, activation='softmax')(reshape)

    def argmax_fun(softmax_output):
        return K.argmax(softmax_output, axis=2)

    def vae_loss(args):
        x, x_decoded_mean = args
        # NOTE: binary_crossentropy expects a batch_size by dim
        # for x and x_decoded_mean, so we MUST flatten these!
        x = K.flatten(K.clip(x, 1e-5, 1 - 1e-5))
        x_decoded_mean = K.flatten(x_decoded_mean)
        xent_loss = nclasses * sample_size * binary_crossentropy(
            x, x_decoded_mean)
        kl_loss = -0.5 * K.mean(1 + sampling_object.log_sigma - K.square(
            sampling_object.mu) - K.exp(sampling_object.log_sigma),
                                axis=-1)
        kld_weight = K.clip((step - anneal_start) /
                            (anneal_end - anneal_start), 0, 1 - eps) + eps
        return xent_loss + kl_loss * kld_weight

    def identity_loss(y_true, y_pred):
        return y_pred

    loss = Lambda(vae_loss,
                  output_shape=(1, ))([input_one_hot_embeddings, softmax])

    argmax = Lambda(argmax_fun, output_shape=(sample_size, ))(softmax)

    train_model = Model(inputs=[input_idx], outputs=[loss])

    test_model = Model(inputs=[input_idx], outputs=[argmax])

    return train_model, test_model
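
The train model emits the VAE loss tensor itself as its output, so it has to be compiled with a pass-through loss (the `identity_loss` defined above, which is local to the function) and fitted against dummy targets. The `step` argument is presumably a Keras variable that a callback increments each batch; it drives `kld_weight` linearly from `eps` (before `anneal_start`) up to 1 (after `anneal_end`). A minimal usage sketch; the optimizer, batch size, and `x_train` array are assumptions, not from the source:

import numpy as np
from keras import backend as K

step = K.variable(0.0)  # assumed: incremented per batch by a callback
train_model, test_model = vae_model(config_data, vocab, step)
# identity_loss is defined inside vae_model, so pass an equivalent lambda.
train_model.compile(optimizer='adam', loss=lambda y_true, y_pred: y_pred)

# Dummy targets: the Lambda layer already produced the per-sample loss.
dummy = np.zeros((len(x_train), 1))
train_model.fit(x_train, dummy, batch_size=32, epochs=10)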
Example #6
def vae_model(config_data, vocab, step):
    z_size = config_data['z_size']
    sample_in_size = config_data['max_input_length']
    sample_out_size = config_data['max_output_length']
    nclasses = len(vocab) + 2
    #last available index is reserved as start character
    intermediate_dim = config_data['intermediate_dim']
    nfilter = 128
    out_size = 200
    eps = 0.001
    anneal_start = 1000.0
    anneal_end = anneal_start + 7000.0
    # ===============
    # Define Encoder
    # ===============
    input_idx = Input(batch_shape=(None, sample_in_size),
                      dtype='float32',
                      name='character_input')
    output_idx = Input(batch_shape=(None, sample_out_size),
                       dtype='int32',
                       name='character_output')

    one_hot_weights = np.identity(nclasses)
    #oshape = (batch_size, sample_size, nclasses)
    one_hot_embeddings = Embedding(input_length=sample_in_size,
                                   input_dim=nclasses,
                                   output_dim=nclasses,
                                   weights=[one_hot_weights],
                                   trainable=False,
                                   name='one_hot_embeddings')

    input_one_hot_embeddings = one_hot_embeddings(input_idx)
    #oshape = (batch_size, sample_size/2, 128)
    conv1 = Conv1D(filters=nfilter, kernel_size=3, strides=2,
                   padding='same')(input_one_hot_embeddings)
    bn1 = BatchNormalization()(conv1)
    relu1 = Activation(activation='relu')(bn1)
    # oshape = (batch_size, sample_size/4, 256)
    conv2 = Conv1D(filters=2 * nfilter,
                   kernel_size=3,
                   strides=2,
                   padding='same')(relu1)
    bn2 = BatchNormalization()(conv2)
    relu2 = Activation(activation='relu')(bn2)
    #oshape = (batch_size, sample_size/4*256)
    flatten = Flatten()(relu2)
    #need to store the size of the representation after the convolutions -> needed for deconv later
    hidden_intermediate_enc = Dense(intermediate_dim,
                                    activation='relu',
                                    name='intermediate_encoding')(flatten)
    hidden_mean = Dense(z_size, name='mu')(hidden_intermediate_enc)
    hidden_log_sigma = Dense(z_size, name='sigma')(hidden_intermediate_enc)

    sampling_object = Sampling(z_size)
    sampling = sampling_object([hidden_mean, hidden_log_sigma])

    # ===============
    # Define Decoder
    # ===============
    hidden_intermediate_dec = Dense(intermediate_dim,
                                    name='intermediate_decoding')(sampling)
    decoder_upsample = Dense(int(2 * nfilter * sample_out_size /
                                 4))(hidden_intermediate_dec)
    if K.image_data_format() == 'channels_first':
        output_shape = (2 * nfilter, int(sample_out_size / 4), 1)
    else:
        output_shape = (int(sample_out_size / 4), 1, 2 * nfilter)
    reshape = Reshape(output_shape)(decoder_upsample)
    #shape = (batch_size, filters)
    deconv1 = Conv2DTranspose(filters=nfilter,
                              kernel_size=(3, 1),
                              strides=(2, 1),
                              padding='same')(reshape)
    bn3 = BatchNormalization()(deconv1)
    relu3 = Activation(activation='relu')(bn3)
    deconv2 = Conv2DTranspose(filters=out_size,
                              kernel_size=(3, 1),
                              strides=(2, 1),
                              padding='same')(relu3)
    bn4 = BatchNormalization()(deconv2)
    relu4 = Activation(activation='relu')(bn4)
    reshape = Reshape((sample_out_size, out_size))(relu4)
    softmax = Dense(nclasses, activation='softmax')(reshape)

    def argmax_fun(softmax_output):
        return K.argmax(softmax_output, axis=2)

    def vae_loss(args):
        x_truth, x_decoded_final = args
        x_truth_flatten = K.flatten(x_truth)
        x_decoded_flat = K.reshape(x_decoded_final,
                                   shape=(-1, K.shape(x_decoded_final)[-1]))
        cross_ent = T.nnet.categorical_crossentropy(x_decoded_flat,
                                                    x_truth_flatten)
        cross_ent = K.reshape(cross_ent, shape=(-1, K.shape(x_truth)[1]))
        sum_over_sentences = K.sum(cross_ent, axis=1)
        return sum_over_sentences

    def vae_kld_loss(args):
        mu, log_sigma = args

        kl_loss = -0.5 * K.sum(1 + log_sigma - K.square(mu) - K.exp(log_sigma),
                               axis=-1)
        kld_weight = K.clip((step - anneal_start) /
                            (anneal_end - anneal_start), 0, 1 - eps) + eps
        return kl_loss * kld_weight

    def identity_loss(y_true, y_pred):
        return y_pred

    loss = Lambda(vae_loss, output_shape=(1, ))([output_idx, softmax])
    kld_loss = Lambda(vae_kld_loss, output_shape=(1, ),
                      name='kld_loss')([hidden_mean, hidden_log_sigma])

    argmax = Lambda(argmax_fun, output_shape=(sample_out_size, ))(softmax)

    train_model = Model(inputs=[input_idx, output_idx],
                        outputs=[loss, kld_loss])

    test_model = Model(inputs=[input_idx], outputs=[argmax])

    return train_model, test_model
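
Note that this variant's `vae_loss` calls Theano directly (`T.nnet.categorical_crossentropy`), so it only runs on the Theano backend with `import theano.tensor as T` in scope. A backend-agnostic sketch of the same per-sentence cross-entropy, assuming a Keras version whose `K.sparse_categorical_crossentropy` returns a (batch, timesteps) tensor for 3-D predictions:

    def vae_loss(args):
        x_truth, x_decoded_final = args
        # Token-level cross-entropy of the softmax outputs against the
        # integer targets, summed over each sentence.
        cross_ent = K.sparse_categorical_crossentropy(x_truth, x_decoded_final)
        return K.sum(cross_ent, axis=1)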
Example #7
def vae_model(config_data, vocab, step):
    z_size = config_data['z_size']
    sample_size = config_data['max_sentence_length']
    nclasses = len(vocab) + 2
    #last available index is reserved as start character
    start_word_idx = nclasses - 1
    lstm_size = config_data['lstm_size']
    alpha = config_data['alpha']
    intermediate_dim = config_data['intermediate_dim']
    nfilter = 128
    out_size = 200
    eps = 0.001
    anneal_start = 0.0
    anneal_end = anneal_start + 7000.0

    embedding_path = join(config_data['vocab_path'], 'embedding_matrix.npy')
    embedding_matrix = np.load(open(embedding_path, 'rb'))
    nclasses = embedding_matrix.shape[0]
    emb_dim = embedding_matrix.shape[1]
    # ===============
    # Define Encoder
    # ===============
    input_idx = Input(batch_shape=(None, sample_size),
                      dtype='int32',
                      name='character_input')

    #one_hot_weights = np.identity(nclasses)
    #oshape = (batch_size, sample_size, nclasses)
    word_embedding_layer = Embedding(input_length=sample_size,
                                     input_dim=nclasses,
                                     output_dim=emb_dim,
                                     weights=[embedding_matrix],
                                     trainable=False,
                                     name='word_embeddings')

    input_word_embeddings = word_embedding_layer(input_idx)
    #oshape = (batch_size, sample_size/2, 128)
    conv1 = Conv1D(filters=nfilter, kernel_size=3, strides=2,
                   padding='same')(input_word_embeddings)
    bn1 = BatchNormalization()(conv1)
    relu1 = Activation(activation='relu')(bn1)
    # oshape = (batch_size, sample_size/4, 256)
    conv2 = Conv1D(filters=2 * nfilter,
                   kernel_size=3,
                   strides=2,
                   padding='same')(relu1)
    bn2 = BatchNormalization()(conv2)
    relu2 = Activation(activation='relu')(bn2)
    #oshape = (batch_size, sample_size/4*256)
    flatten = Flatten()(relu2)
    #need to store the size of the representation after the convolutions -> needed for deconv later
    hidden_intermediate_enc = Dense(intermediate_dim,
                                    activation='relu',
                                    name='intermediate_encoding')(flatten)
    hidden_zvalues = Dense(z_size * 2)(hidden_intermediate_enc)
    sampling_object = Sampling(z_size)
    sampling = sampling_object(hidden_zvalues)

    # ===============
    # Define Decoder
    # ===============
    hidden_intermediate_dec = Dense(intermediate_dim,
                                    name='intermediate_decoding')(sampling)
    decoder_upsample = Dense(int(2 * nfilter * sample_size /
                                 4))(hidden_intermediate_dec)
    if K.image_data_format() == 'channels_first':
        output_shape = (2 * nfilter, int(sample_size / 4), 1)
    else:
        output_shape = (int(sample_size / 4), 1, 2 * nfilter)
    reshape = Reshape(output_shape)(decoder_upsample)
    #shape = (batch_size, filters)
    deconv1 = Conv2DTranspose(filters=nfilter,
                              kernel_size=(3, 1),
                              strides=(2, 1),
                              padding='same')(reshape)
    bn3 = BatchNormalization()(deconv1)
    relu3 = Activation(activation='relu')(bn3)
    deconv2 = Conv2DTranspose(filters=out_size,
                              kernel_size=(3, 1),
                              strides=(2, 1),
                              padding='same')(relu3)
    bn4 = BatchNormalization()(deconv2)
    relu4 = Activation(activation='relu')(bn4)
    reshape = Reshape((sample_size, out_size))(relu4)
    hidden = Dense(out_size, activation='linear')(reshape)
    hidden = Dense(out_size, activation='linear')(hidden)
    hidden = Dense(out_size, activation='linear')(hidden)

    def vae_cosine_distance_loss(args):
        x_truth, x_decoded_final = args

        #normalize over embedding-dimension
        xt_mag = K.l2_normalize(x_truth, axis=2)  #None, 40, 200
        xp_mag = K.l2_normalize(x_decoded_final, axis=2)  #None, 40, 200

        elem_mult = xt_mag * xp_mag
        cosine_sim = K.sum(elem_mult, axis=2)  #None, 40

        cosine_distance = 1 - cosine_sim  #size = None, 40

        sum_over_sentences = K.sum(cosine_distance, axis=1)  #None
        return sum_over_sentences

    def vae_mse_loss(args):
        x_truth, x_decoded_final = args

        difference = x_truth - x_decoded_final
        squared_difference = K.square(difference)
        sums = K.sum(K.sum(squared_difference, axis=2), axis=1)
        return sums

    def vae_kld_loss(args):
        kl_loss = -0.5 * K.sum(1 + sampling_object.log_sigma - K.square(
            sampling_object.mu) - K.exp(sampling_object.log_sigma),
                               axis=-1)
        kld_weight = K.clip((step - anneal_start) /
                            (anneal_end - anneal_start), 0, 1 - eps) + eps
        return kl_loss * kld_weight

    main_loss = Lambda(vae_cosine_distance_loss,
                       output_shape=(1, ),
                       name='main_loss')([input_word_embeddings, hidden])
    kld_loss = Lambda(vae_kld_loss, output_shape=(1, ),
                      name='kld_loss')([input_word_embeddings])

    prediction = PredictionLayer(word_embedding_layer, sample_size,
                                 nclasses)(hidden)

    train_model = Model(inputs=[input_idx], outputs=[main_loss, kld_loss])

    test_model = Model(inputs=[input_idx], outputs=[prediction])

    return train_model, test_model
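
The cosine-distance loss scores each timestep by one minus the cosine similarity between the true and predicted word embeddings, then sums over the sentence (`PredictionLayer` is a custom layer not shown here). A toy numpy check of the arithmetic, with invented values for illustration:

import numpy as np

x_truth = np.array([[[1.0, 0.0, 0.0],
                     [0.0, 1.0, 0.0]]])   # (batch=1, time=2, dim=3)
x_pred = np.array([[[1.0, 0.0, 0.0],      # identical    -> distance 0
                    [1.0, 0.0, 0.0]]])    # orthogonal   -> distance 1

def l2_normalize(x, axis):
    return x / np.linalg.norm(x, axis=axis, keepdims=True)

cosine_sim = np.sum(l2_normalize(x_truth, 2) * l2_normalize(x_pred, 2), axis=2)
print(np.sum(1 - cosine_sim, axis=1))     # [1.0]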
Example #8
def vae_model(config_data, vocab, step):
    z_size = config_data['z_size']
    sample_size = config_data['max_sentence_length']
    lstm_size = config_data['lstm_size']
    alpha = config_data['alpha']
    intermediate_dim = config_data['intermediate_dim']
    nfilter = 128
    out_size = 200
    eps = 0.001
    anneal_start = 200000.0
    anneal_end = anneal_start + 200000.0

    embedding_path = join(config_data['vocab_path'], 'embedding_matrix.npy')
    embedding_matrix = np.load(open(embedding_path, 'rb'))
    nclasses = embedding_matrix.shape[0]
    emb_dim = embedding_matrix.shape[1]

    l2_regularizer = None
    # ===============
    # Define Encoder
    # ===============
    input_idx = Input(batch_shape=(None, sample_size), dtype='int32', name='word_input')

    #one_hot_weights = np.identity(nclasses)
    #oshape = (batch_size, sample_size, nclasses)
    one_hot_embeddings = Embedding(
        input_length=sample_size,
        input_dim=nclasses,
        output_dim=emb_dim,
        weights=[embedding_matrix],
        trainable=True,
        name='word_embeddings'
    )

    input_one_hot_embeddings = one_hot_embeddings(input_idx)
    #oshape = (batch_size, sample_size/2, 128)
    conv1 = Conv1D(
        filters=nfilter,
        kernel_size=3,
        strides=2,
        padding='same',
        kernel_regularizer=l2_regularizer,
        bias_regularizer=l2_regularizer,
        activity_regularizer=l2_regularizer
    )(input_one_hot_embeddings)
    bn1 = BatchNormalization(
        beta_regularizer=l2_regularizer,
        gamma_regularizer=l2_regularizer
    )(conv1)
    relu1 = Activation(activation='relu')(bn1)
    # oshape = (batch_size, sample_size/4, 256)
    conv2 = Conv1D(
        filters=2*nfilter,
        kernel_size=3,
        strides=2,
        padding='same',
        kernel_regularizer=l2_regularizer,
        bias_regularizer=l2_regularizer,
        activity_regularizer=l2_regularizer
    )(relu1)
    bn2 = BatchNormalization(
        beta_regularizer=l2_regularizer,
        gamma_regularizer=l2_regularizer
    )(conv2)
    relu2 = Activation(activation='relu')(bn2)
    #oshape = (batch_size, sample_size/4*256)
    flatten = Flatten()(relu2)
    #need to store the size of the representation after the convolutions -> needed for deconv later
    hidden_intermediate_enc = Dense(
        intermediate_dim,
        activation='relu',
        name='intermediate_encoding',
        kernel_regularizer=l2_regularizer,
        bias_regularizer=l2_regularizer,
        activity_regularizer=l2_regularizer
    )(flatten)
    hidden_zvalues = Dense(z_size*2)(hidden_intermediate_enc)
    sampling_object = Sampling(z_size)
    sampling = sampling_object(hidden_zvalues)

    # ======================
    # Define Decoder Layers
    # ======================
    decoder_input_layer = Dense(
        intermediate_dim,
        name='intermediate_decoding',
        kernel_regularizer=l2_regularizer,
        bias_regularizer=l2_regularizer,
        activity_regularizer=l2_regularizer
    )
    hidden_intermediate_dec = decoder_input_layer(sampling)
    decoder_upsample = Dense(
        int(2*nfilter*sample_size/4),
        kernel_regularizer=l2_regularizer,
        bias_regularizer=l2_regularizer,
        activity_regularizer=l2_regularizer
    )(hidden_intermediate_dec)
    if K.image_data_format() == 'channels_first':
        output_shape = (2*nfilter, int(sample_size/4), 1)
    else:
        output_shape = (int(sample_size/4), 1, 2*nfilter)
    reshape = Reshape(output_shape)(decoder_upsample)
    #shape = (batch_size, filters)
    deconv1 = Conv2DTranspose(
        filters=nfilter,
        kernel_size=(3, 1),
        strides=(2, 1),
        padding='same',
        kernel_regularizer=l2_regularizer,
        bias_regularizer=l2_regularizer,
        activity_regularizer=l2_regularizer
    )(reshape)
    bn3 = BatchNormalization(
        beta_regularizer=l2_regularizer,
        gamma_regularizer=l2_regularizer
    )(deconv1)
    relu3 = Activation(activation='relu')(bn3)
    deconv2 = Conv2DTranspose(
        filters=out_size,
        kernel_size=(3, 1),
        strides=(2, 1),
        padding='same',
        kernel_regularizer=l2_regularizer,
        bias_regularizer=l2_regularizer,
        activity_regularizer=l2_regularizer
    )(relu3)
    bn4 = BatchNormalization(
        beta_regularizer=l2_regularizer,
        gamma_regularizer=l2_regularizer
    )(deconv2)
    relu4 = Activation(activation='relu')(bn4)
    reshape = Reshape((sample_size, out_size))(relu4)
    hidden = Dense(out_size, activation='linear')(reshape)
    hidden = Dense(out_size, activation='linear')(hidden)
    hidden_auxiliary = Dense(out_size, activation='linear', name='auxiliary_output')(hidden)

    def argmax_fun(softmax_output):
        return K.argmax(softmax_output, axis=2)

    def remove_last_column(x):
        return x[:, :-1, :]

    padding = ZeroPadding1D(padding=(1, 0))(input_one_hot_embeddings)
    previous_char_slice = Lambda(remove_last_column, output_shape=(sample_size, out_size))(padding)

    combined_input = concatenate(inputs=[hidden_auxiliary, previous_char_slice], axis=2)
    #MUST BE IMPLEMENTATION 1 or 2
    lstm = LSTM(
        200,
        return_sequences=True,
        implementation=2,
        kernel_regularizer=l2_regularizer,
        bias_regularizer=l2_regularizer,
        recurrent_regularizer=l2_regularizer,
        activity_regularizer=l2_regularizer
    )
    recurrent_component = lstm(combined_input)
    hidden_0 = Dense(out_size, activation='linear')
    hidden_1 = Dense(out_size, activation='linear')
    hidden_final = Dense(out_size, activation='linear', name='final_output')

    hidden_0_inst = hidden_0(recurrent_component)
    hidden_1_inst = hidden_1(hidden_0_inst)
    softmax_final = hidden_final(hidden_1_inst)

    def vae_cosine_distance_loss(args):
        x_truth, x_decoded_final = args

        #normalize over embedding-dimension
        xt_mag = K.l2_normalize(x_truth, axis=2) #None, 40, 200
        xp_mag = K.l2_normalize(x_decoded_final, axis=2)#None, 40, 200

        elem_mult = xt_mag*xp_mag
        cosine_sim = K.sum(elem_mult, axis=2) #None, 40

        cosine_distance = 1 - cosine_sim #size = None, 40

        sum_over_sentences = K.sum(cosine_distance, axis=1)#None
        return sum_over_sentences

    def vae_kld_loss(args):
        kl_loss = - 0.5 * K.sum(1 + sampling_object.log_sigma - K.square(sampling_object.mu) - K.exp(sampling_object.log_sigma), axis=-1)
        kld_weight = K.clip((step - anneal_start) / (anneal_end - anneal_start), 0, 1 - eps) + eps
        return kl_loss*kld_weight


    def identity_loss(y_true, y_pred):
        return y_pred

    main_loss = Lambda(vae_cosine_distance_loss, output_shape=(1,), name='main_loss')([input_one_hot_embeddings, softmax_final])
    kld_loss = Lambda(vae_kld_loss, output_shape=(1,), name='kld_loss')([input_one_hot_embeddings, softmax_final, hidden_auxiliary])
    aux_loss = Lambda(vae_cosine_distance_loss, output_shape=(1,), name='auxiliary_loss')([input_one_hot_embeddings, hidden_auxiliary])

    output_gen_layer = LSTMStep(lstm, one_hot_embeddings, [hidden_0, hidden_1, hidden_final], sample_size, nclasses)(hidden_auxiliary)

    train_model = Model(inputs=[input_idx], outputs=[main_loss, kld_loss, aux_loss])

    test_model = Model(inputs=[input_idx], outputs=[output_gen_layer])

    return train_model, test_model
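
In this last example, the `ZeroPadding1D(padding=(1, 0))` plus `remove_last_column` pair shifts the embedded input one step to the right, so the LSTM receives the previous token's embedding at every timestep (teacher forcing); `LSTMStep` is a custom inference-time layer not shown here. A small numpy illustration of the shift:

import numpy as np

seq = np.array([[[1.0], [2.0], [3.0]]])                    # (batch, time, dim)
padded = np.pad(seq, ((0, 0), (1, 0), (0, 0)),
                mode='constant')                           # prepend a zero step
shifted = padded[:, :-1, :]                                # drop the final step
print(shifted[0, :, 0])                                    # [0. 1. 2.]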