Example #1
def build_sampler(P, character_count, embedding_size=20, hidden_size=50):
    P.V = np.random.randn(character_count, embedding_size)
    P.init_recurrent_1_hidden = 0.5 * np.random.randn(hidden_size)
    P.init_recurrent_1_cell = 0.5 * np.random.randn(hidden_size)
    P.init_recurrent_2_hidden = 0.5 * np.random.randn(hidden_size)
    P.init_recurrent_2_cell = 0.5 * np.random.randn(hidden_size)

    lstm_layer_1 = lstm.build_step(P,
                                   name="recurrent_1",
                                   input_size=embedding_size,
                                   hidden_size=hidden_size)

    lstm_layer_2 = lstm.build_step(P,
                                   name="recurrent_2",
                                   input_size=hidden_size,
                                   hidden_size=hidden_size)
    P.W_output = np.zeros((hidden_size, character_count))
    P.b_output = np.zeros((character_count, ))

    def sampler(temp, x, prev_cell_1, prev_hidden_1, prev_cell_2,
                prev_hidden_2):
        input_embedding = P.V[x]
        cell_1, hidden_1 = lstm_layer_1(input_embedding, prev_cell_1,
                                        prev_hidden_1)
        cell_2, hidden_2 = lstm_layer_2(hidden_1, prev_cell_2, prev_hidden_2)
        output = U.vector_softmax(temp *
                                  (T.dot(hidden_2, P.W_output) + P.b_output))
        return output, cell_1, hidden_1, cell_2, hidden_2

    return sampler
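
The examples on this page assume the usual preamble for this code base: numpy as np, theano, theano.tensor as T, a parameter container P (where attribute assignment is assumed to create Theano shared variables), and project-local modules such as lstm, vae, stack, feedforward, and the utility module U. Note that the sampler multiplies the logits by temp before the softmax, so temp acts as an inverse temperature: values above 1 sharpen the output distribution, values below 1 flatten it. A minimal numpy sketch of that behaviour, assuming U.vector_softmax is a standard numerically stable softmax:

import numpy as np

# Stand-in for U.vector_softmax: a numerically stable softmax (assumption).
def vector_softmax(v):
    e = np.exp(v - v.max())
    return e / e.sum()

logits = np.array([2.0, 1.0, 0.1])
for temp in (0.5, 1.0, 2.0):
    # `temp` multiplies the logits, so it is an inverse temperature:
    # temp > 1 sharpens the distribution, temp < 1 flattens it.
    print(temp, vector_softmax(temp * logits))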
Example #2
def build_sampler(P, character_count, embedding_size=20, hidden_size=50):
    P.V = np.random.randn(character_count, embedding_size)
    P.init_recurrent_1_hidden = 0.5 * np.random.randn(hidden_size)
    P.init_recurrent_1_cell = 0.5 * np.random.randn(hidden_size)
    P.init_recurrent_2_hidden = 0.5 * np.random.randn(hidden_size)
    P.init_recurrent_2_cell = 0.5 * np.random.randn(hidden_size)

    lstm_layer_1 = lstm.build_step(P,
                                   name="recurrent_1",
                                   input_size=embedding_size,
                                   hidden_size=hidden_size
                                   )

    lstm_layer_2 = lstm.build_step(P,
                                   name="recurrent_2",
                                   input_size=hidden_size,
                                   hidden_size=hidden_size
                                   )
    P.W_output = np.zeros((hidden_size, character_count))
    P.b_output = np.zeros((character_count,))

    def sampler(temp, x, prev_cell_1, prev_hidden_1, prev_cell_2, prev_hidden_2):
        input_embedding = P.V[x]
        cell_1, hidden_1 = lstm_layer_1(input_embedding, prev_cell_1, prev_hidden_1)
        cell_2, hidden_2 = lstm_layer_2(hidden_1, prev_cell_2, prev_hidden_2)
        output = U.vector_softmax(temp * (T.dot(hidden_2, P.W_output) + P.b_output))
        return output, cell_1, hidden_1, cell_2, hidden_2
    return sampler
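
A hedged usage sketch for this sampler (the Parameters class and variable names below are illustrative, not from the source; it assumes attribute assignment on P yields Theano shared variables, as in theano_toolkit):

import theano
import theano.tensor as T

P = Parameters()  # hypothetical: the project's parameter container
sampler = build_sampler(P, character_count=128)

temp = T.scalar('temp')
x = T.iscalar('x')
cell_1, hidden_1 = T.vector('cell_1'), T.vector('hidden_1')
cell_2, hidden_2 = T.vector('cell_2'), T.vector('hidden_2')

# Compile one sampling step; feed the returned state back in as the
# next step's previous state.
step = theano.function(
    inputs=[temp, x, cell_1, hidden_1, cell_2, hidden_2],
    outputs=list(sampler(temp, x, cell_1, hidden_1, cell_2, hidden_2)),
)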
Example #3
def build_encoder(P, input_size, hidden_size, latent_size):

    P.init_encoder_hidden = np.zeros((hidden_size, ))
    P.init_encoder_cell = np.zeros((hidden_size, ))

    P.w_encoder_v = np.zeros((hidden_size, ))
    P.b_encoder_v = 0

    rnn_step = lstm.build_step(P,
                               name="encoder",
                               input_sizes=[input_size, latent_size],
                               hidden_size=hidden_size)

    gaussian_out = vae.build_encoder_output(P,
                                            name="encoder_gaussian",
                                            input_size=hidden_size,
                                            output_size=latent_size,
                                            initialise_weights=None)

    def encode(X, step_count):
        init_hidden = T.tanh(P.init_encoder_hidden)
        init_cell = P.init_encoder_cell
        init_hidden_batch = T.alloc(init_hidden, X.shape[0], hidden_size)
        init_cell_batch = T.alloc(init_cell, X.shape[0], hidden_size)
        init_latent = U.theano_rng.normal(size=(X.shape[0], latent_size))
        init_z_mean = T.zeros_like(init_latent)
        init_z_std = T.ones_like(init_latent)
        eps_seq = U.theano_rng.normal(size=(step_count, X.shape[0],
                                            latent_size))

        def step(eps, prev_latent, prev_hidden, prev_cell, prev_z_mean,
                 prev_z_std):
            hidden, cell = rnn_step(X, prev_latent, prev_hidden, prev_cell)
            _, z_mean, z_std = gaussian_out(hidden)
            z_sample = z_mean + eps * z_std
            return z_sample, hidden, cell, z_mean, z_std

        [z_samples, hiddens, cells, z_means,
         z_stds], _ = theano.scan(step,
                                  sequences=[eps_seq],
                                  outputs_info=[
                                      init_latent, init_hidden_batch,
                                      init_cell_batch, init_z_mean, init_z_std
                                  ])

        alphas = T.exp(T.dot(hiddens, P.w_encoder_v) + P.b_encoder_v + 5)
        return z_samples, z_means, z_stds, alphas

    return encode
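
The line z_sample = z_mean + eps * z_std inside step is the reparameterisation trick: the noise eps is drawn outside the recurrence (eps_seq above), so the sample is a deterministic, differentiable function of z_mean and z_std. A self-contained numpy sketch:

import numpy as np

rng = np.random.default_rng(0)
z_mean = np.array([0.5, -1.0])
z_std = np.array([0.1, 2.0])
eps = rng.normal(size=z_mean.shape)  # fixed noise source, like eps_seq above
z_sample = z_mean + eps * z_std      # gradients flow through z_mean and z_std

Also note the +5 inside the alphas expression: since w_encoder_v and b_encoder_v are initialised to zero, the alphas start at exactly exp(5) ≈ 148.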
Example #4
def build(P,
		word_rep_size,
		stmt_hidden_size,
		diag_hidden_size,
		vocab_size,
		output_size,
		map_fun_size,
		evidence_count
	):

	vocab_vectors = 0.001 * random_init(vocab_size,word_rep_size)
	P.vocab = vocab_vectors
	V = P.vocab

	encode_qstn = encode_stmt = build_stmt_encoder(P,"stmt",word_rep_size,stmt_hidden_size)
	#encode_qstn = build_stmt_encoder(P,"qstn",word_rep_size,diag_hidden_size)
	encode_diag = build_diag_encoder(P,
			stmt_size   = stmt_hidden_size,
			hidden_size = diag_hidden_size,
			output_size = diag_hidden_size,
			encode_stmt = encode_stmt
		)

	qn2keys = lstm.build_step(P,"qn2keys",
				input_size  = diag_hidden_size,
				hidden_size = diag_hidden_size
			)

	lookup_prep = build_lookup(P,
			data_size = diag_hidden_size,
			state_size = diag_hidden_size
		)

#	diag2output = feedforward.build(P,"diag2output",
#				input_sizes  = [diag_hidden_size],
#				hidden_sizes = [map_fun_size],
#				output_size  = vocab_size
#			)
	P.W_output_vocab = 0.01 * random_init(diag_hidden_size,vocab_size)
	P.b_output_vocab = 0.00 * np.zeros((vocab_size,))


	def qa(story,idxs,qstn):
		word_feats    = V[story]
		qn_word_feats = V[qstn]

		diag_cells,diag_hiddens = encode_diag(word_feats,idxs)
		qn_cell,qn_hidden = encode_qstn(qn_word_feats)
		
		lookup = lookup_prep(diag_hiddens)

		attention = [None] * evidence_count
		evidence  = [None] * evidence_count


		prev_cell,prev_hidden = qn_cell,qn_hidden
		prev_attn = 0
		alpha = 0.0
		input_vec = T.mean(diag_cells,axis=0)
		for i in range(evidence_count):
			prev_cell, prev_hidden = qn2keys(input_vec,prev_cell,prev_hidden)
			attention[i] = lookup(prev_hidden,prev_attn)
			attention[i].name = "attention_%d"%i
			evidence[i] = input_vec = T.sum(attention[i].dimshuffle(0,'x') * diag_cells,axis=0)
								#	alpha * T.mean(diag_vectors,axis=0)
			prev_attn = prev_attn + attention[i]
		final_cell, final_hidden = prev_cell,prev_hidden

		output = U.vector_softmax(T.dot(final_hidden,P.W_output_vocab) + P.b_output_vocab)
		return attention,output
	return qa
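
Each pass through the loop above is one attention hop: run the LSTM a step, score the encoded dialogue against the current hidden state, and read out an attention-weighted sum of diag_cells as the next input. The scoring itself lives in build_lookup (not shown) and also conditions on the cumulative prev_attn; the dot-product scoring in this numpy sketch is only a stand-in:

import numpy as np

def softmax(v):
    e = np.exp(v - v.max())
    return e / e.sum()

rng = np.random.default_rng(0)
memory = rng.normal(size=(4, 3))   # toy stand-in for diag_cells
query = rng.normal(size=3)         # toy stand-in for prev_hidden

scores = memory @ query            # placeholder for build_lookup's scoring
attn = softmax(scores)
# Weighted read-out, mirroring
# T.sum(attention[i].dimshuffle(0, 'x') * diag_cells, axis=0)
read = (attn[:, None] * memory).sum(axis=0)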
Example #5
def build(P, word_rep_size, stmt_hidden_size, diag_hidden_size, vocab_size,
          output_size, map_fun_size, evidence_count):

    vocab_vectors = 0.001 * random_init(vocab_size, word_rep_size)
    P.vocab = vocab_vectors
    V = P.vocab

    encode_qstn = encode_stmt = build_stmt_encoder(P, "stmt", word_rep_size,
                                                   stmt_hidden_size)
    #encode_qstn = build_stmt_encoder(P,"qstn",word_rep_size,diag_hidden_size)
    encode_diag = build_diag_encoder(P,
                                     stmt_size=stmt_hidden_size,
                                     hidden_size=diag_hidden_size,
                                     output_size=diag_hidden_size,
                                     encode_stmt=encode_stmt)

    qn2keys = lstm.build_step(P,
                              "qn2keys",
                              input_size=diag_hidden_size,
                              hidden_size=diag_hidden_size)

    lookup_prep = build_lookup(P,
                               data_size=diag_hidden_size,
                               state_size=diag_hidden_size)

    #	diag2output = feedforward.build(P,"diag2output",
    #				input_sizes  = [diag_hidden_size],
    #				hidden_sizes = [map_fun_size],
    #				output_size  = vocab_size
    #			)
    P.W_output_vocab = 0.01 * random_init(diag_hidden_size, vocab_size)
    P.b_output_vocab = 0.00 * np.zeros((vocab_size, ))

    def qa(story, idxs, qstn):
        word_feats = V[story]
        qn_word_feats = V[qstn]

        diag_cells, diag_hiddens = encode_diag(word_feats, idxs)
        qn_cell, qn_hidden = encode_qstn(qn_word_feats)

        lookup = lookup_prep(diag_hiddens)

        attention = [None] * evidence_count
        evidence = [None] * evidence_count

        prev_cell, prev_hidden = qn_cell, qn_hidden
        prev_attn = 0
        alpha = 0.0
        input_vec = T.mean(diag_cells, axis=0)
        for i in range(evidence_count):
            prev_cell, prev_hidden = qn2keys(input_vec, prev_cell, prev_hidden)
            attention[i] = lookup(prev_hidden, prev_attn)
            attention[i].name = "attention_%d" % i
            evidence[i] = input_vec = T.sum(attention[i].dimshuffle(0, 'x') *
                                            diag_cells,
                                            axis=0)
            #	alpha * T.mean(diag_vectors,axis=0)
            prev_attn = prev_attn + attention[i]
        final_cell, final_hidden = prev_cell, prev_hidden

        output = U.vector_softmax(
            T.dot(final_hidden, P.W_output_vocab) + P.b_output_vocab)
        return attention, output

    return qa
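
Functionally identical to Example #4, reformatted. Note the running prev_attn passed into lookup on every hop, presumably so that build_lookup can down-weight evidence that earlier hops have already attended to.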
Example #6
def build(P, input_size, embedding_size, controller_size, stack_size,
          output_size):
    softmax_output_size = output_size + 1
    P.embeddings = np.random.randn(input_size + 2,
                                   embedding_size).astype(np.float64)
    controller_step = lstm.build_step(P,
                                      name="controller",
                                      input_size=embedding_size + stack_size,
                                      hidden_size=controller_size)
    stack_init = stack.build(size=stack_size)

    P.W_controller_output = 0.1 * np.random.randn(
        controller_size, softmax_output_size + stack_size + 1 + 1).astype(
            np.float64)
    bias = np.zeros((softmax_output_size + stack_size + 1 + 1, ),
                    dtype=np.float64)
    bias[-2] = 5
    bias[-1] = -5
    P.b_controller_output = bias

    init_controller_cell = np.zeros((controller_size, ), dtype=np.float64)
    init_controller_hidden = np.zeros((controller_size, ), dtype=np.float64)
    init_stack_r = np.zeros((stack_size, ), dtype=np.float64)

    def predict(ids, aux={}):
        X = P.embeddings[ids]
        init_stack_V, init_stack_s, stack_step = stack_init(X.shape[0])

        def step(x, t, prev_controller_cell, prev_controller_hidden, prev_V,
                 prev_s, prev_r):
            controller_input = T.concatenate([x, prev_r])
            controller_cell, controller_hidden = \
                controller_step(
                    x=controller_input,
                    prev_cell=prev_controller_cell,
                    prev_hidden=prev_controller_hidden
                )

            controller_output = T.dot(controller_hidden, P.W_controller_output) + \
                                P.b_controller_output

            output = controller_output[:softmax_output_size]
            v = T.tanh(
                controller_output[softmax_output_size:softmax_output_size +
                                  stack_size])
            flags = T.nnet.sigmoid(controller_output[-2:])

            V, s, r = stack_step(t=t,
                                 v=v,
                                 d=flags[0],
                                 u=flags[1],
                                 prev_V=prev_V,
                                 prev_s=prev_s)

            return controller_cell, controller_hidden, V, s, r, controller_output, output

        sequences, _ = theano.scan(step,
                                   sequences=[X, T.arange(X.shape[0])],
                                   outputs_info=[
                                       init_controller_cell,
                                       init_controller_hidden, init_stack_V,
                                       init_stack_s, init_stack_r, None, None
                                   ])

        outputs = T.nnet.softmax(sequences[-1])
        aux['controller_output'] = sequences[-2]

        return outputs

    return predict
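
The controller emits a single vector that step slices into task logits, a value v to push onto the stack, and two control flags. The bias initialisation above (bias[-2] = 5, bias[-1] = -5) starts the flags near sigmoid(5) ≈ 0.99 and sigmoid(-5) ≈ 0.007 while the hidden state's contribution is still small. A numpy sketch of the slicing, with illustrative sizes:

import numpy as np

def sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

softmax_output_size, stack_size = 5, 8   # illustrative sizes
controller_output = np.zeros(softmax_output_size + stack_size + 2)
controller_output[-2:] = [5.0, -5.0]     # the bias values from above

output = controller_output[:softmax_output_size]   # task logits
v = np.tanh(controller_output[softmax_output_size:
                              softmax_output_size + stack_size])  # pushed value
d, u = sigmoid(controller_output[-2:])   # stack flags: d ≈ 0.993, u ≈ 0.007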
Example #7
def build(P, name,
          input_size=200, z_size=200,
          hidden_layer_size=2500,
          x_extractor_layers=[600] * 4,
          z_extractor_layers=[500] * 4,
          prior_layers=[500] * 4,
          generation_layers=[600] * 4,
          inference_layers=[500] * 4):
    def weight_init(x,y):
        return np.random.uniform(-0.08, 0.08, (x,y))


    X_extractor = feedforward.build_classifier(
        P, "x_extractor",
        input_sizes=[input_size],
        hidden_sizes=x_extractor_layers[:-1],
        output_size=x_extractor_layers[-1],
        initial_weights=weight_init,
        output_initial_weights=weight_init,
        activation=T.nnet.relu,
        output_activation=T.nnet.relu
    )

    Z_extractor = feedforward.build_classifier(
        P, "z_extractor",
        input_sizes=[z_size],
        hidden_sizes=z_extractor_layers[:-1],
        output_size=z_extractor_layers[-1],
        initial_weights=weight_init,
        output_initial_weights=weight_init,
        activation=T.nnet.relu,
        output_activation=T.nnet.relu
    )

    prior = vae.build_inferer(
        P, "prior",
        input_sizes=[hidden_layer_size],
        hidden_sizes=prior_layers,
        output_size=z_size,
        initial_weights=weight_init,
        activation=T.nnet.relu,
        initialise_outputs=False
    )

    generate = vae.build_inferer(
        P, "generator",
        input_sizes=[hidden_layer_size, z_extractor_layers[-1]],
        hidden_sizes=generation_layers,
        output_size=input_size,
        initial_weights=weight_init,
        activation=T.nnet.relu,
        initialise_outputs=False
    )

    P.init_recurrence_hidden = np.zeros((hidden_layer_size,))
    P.init_recurrence_cell = np.zeros((hidden_layer_size,))
    recurrence = lstm.build_step(
        P, "recurrence",
        input_sizes=[x_extractor_layers[-1],z_extractor_layers[-1]],
        hidden_size=hidden_layer_size
    )

    infer = vae.build_inferer(
        P, "infer",
        input_sizes=[hidden_layer_size, x_extractor_layers[-1]],
        hidden_sizes=generation_layers,
        output_size=z_size,
        initial_weights=weight_init,
        activation=T.nnet.relu,
        initialise_outputs=False
    )

    def sample():
        init_hidden = T.tanh(P.init_recurrence_hidden)
        init_cell = P.init_recurrence_cell
        init_hidden_batch = T.alloc(init_hidden, 1, hidden_layer_size)
        init_cell_batch = T.alloc(init_cell, 1, hidden_layer_size)
        noise = U.theano_rng.normal(size=(40,1,z_size))

        def _step(eps, prev_cell, prev_hidden):
            _, z_prior_mean, z_prior_logvar = prior([prev_hidden])
            z_sample = z_prior_mean + eps * T.exp(0.5 * z_prior_logvar)
            z_feat = Z_extractor([z_sample])
            _, x_mean, _ = generate([prev_hidden, z_feat])
            x_feat = X_extractor([x_mean])
            curr_cell, curr_hidden = recurrence(x_feat, z_feat, prev_cell, prev_hidden)
            return curr_cell, curr_hidden, x_mean

        [cells,hiddens,x_means],_ = theano.scan(
                _step,
                sequences=[noise],
                outputs_info=[init_cell_batch,init_hidden_batch,None],
            )
        return x_means


    def extract(X,l):

        init_hidden = T.tanh(P.init_recurrence_hidden)
        init_cell = P.init_recurrence_cell
        init_hidden_batch = T.alloc(init_hidden, X.shape[1], hidden_layer_size)
        init_cell_batch = T.alloc(init_cell, X.shape[1], hidden_layer_size)
        noise = U.theano_rng.normal(size=(X.shape[0],X.shape[1],z_size))
        reset_init_mask = U.theano_rng.binomial(size=(X.shape[0],X.shape[1]),p=0.00)

        X_feat = X_extractor([X])

        def _step(t,x_feat, eps, reset_mask, prev_cell, prev_hidden):
            reset_mask = reset_mask.dimshuffle(0,'x')

            _, z_prior_mean, z_prior_logvar = prior([prev_hidden])
            _, z_mean, z_logvar = infer([prev_hidden, x_feat])
            z_sample = z_mean + eps * T.exp(0.5 * z_logvar)
            z_feat = Z_extractor([z_sample])
            _, x_mean, x_logvar = generate([prev_hidden, z_feat])

            curr_cell, curr_hidden = recurrence(x_feat, z_feat, prev_cell, prev_hidden)
            curr_cell = T.switch(
                    reset_mask, init_cell_batch, curr_cell)
            curr_hidden = T.switch(
                    reset_mask, init_hidden_batch, curr_hidden)

            mask = (t < l).dimshuffle(0,'x')
            return tuple(
                T.switch(mask,out,0)
                for out in (
                    curr_cell, curr_hidden,
                    z_prior_mean, z_prior_logvar,
                    z_sample, z_mean, z_logvar,
                    x_mean, x_logvar
                ))

        [_, _,
         Z_prior_mean, Z_prior_logvar,
         Z_sample, Z_mean, Z_logvar,
         X_mean, X_logvar], _ = theano.scan(
            _step,
            sequences=[T.arange(X_feat.shape[0]),X_feat,noise,reset_init_mask],
            outputs_info=[init_cell_batch, init_hidden_batch] +
            [None] * 7,
        )
        return [
            Z_prior_mean, Z_prior_logvar,
            Z_mean, Z_logvar,
            X_mean, X_logvar,
        ]
    return extract, sample
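
This variant parameterises its Gaussians by log-variance (z_sample = mean + eps * T.exp(0.5 * logvar)), whereas the otherwise-similar Example #9 treats the inferer's third output as a standard deviation directly. The two are equivalent when std = exp(logvar / 2):

import numpy as np

rng = np.random.default_rng(0)
mean, logvar = 0.3, -1.2
eps = rng.normal()

std = np.exp(0.5 * logvar)               # exp(logvar / 2) == sqrt(exp(logvar))
z_from_logvar = mean + eps * np.exp(0.5 * logvar)
z_from_std = mean + eps * std            # same sample, std parameterisation
assert np.isclose(z_from_logvar, z_from_std)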
Example #8
def build(P, input_size, embedding_size, controller_size, stack_size, output_size):
    softmax_output_size = output_size + 1
    P.embeddings = np.random.randn(input_size + 2,embedding_size).astype(np.float32)
    controller_step = lstm.build_step(
        P, name="controller",
        input_size = embedding_size + stack_size,
        hidden_size = controller_size
    )
    stack_init = stack.build(size=stack_size)

    P.W_controller_output = 0.1 * np.random.randn(
        controller_size,
        softmax_output_size + stack_size + 1 + 1
    ).astype(np.float32)
    bias = np.zeros((softmax_output_size + stack_size + 1 + 1,), dtype=np.float32)
    bias[-2] = 5
    bias[-1] = -5
    P.b_controller_output = bias

    init_controller_cell   = np.zeros((controller_size,), dtype=np.float32)
    init_controller_hidden = np.zeros((controller_size,), dtype=np.float32)
    init_stack_r           = np.zeros((stack_size,), dtype=np.float32)

    def predict(ids,aux={}):
        X = P.embeddings[ids]
        init_stack_V, init_stack_s, stack_step = stack_init(X.shape[0])

        def step(x, t,
                 prev_controller_cell, prev_controller_hidden,
                 prev_V, prev_s, prev_r):

            controller_input = T.concatenate([x, prev_r])
            controller_cell, controller_hidden = \
                controller_step(
                    x=controller_input,
                    prev_cell=prev_controller_cell,
                    prev_hidden=prev_controller_hidden
                )

            controller_output = T.dot(controller_hidden, P.W_controller_output) +\
                P.b_controller_output

            output = controller_output[:softmax_output_size]
            v      = T.tanh(controller_output[
                        softmax_output_size:
                        softmax_output_size + stack_size
                    ])
            flags = T.nnet.sigmoid(controller_output[-2:])

            V, s, r = stack_step(
                t=t,
                v=v,
                d=flags[0],
                u=flags[1],
                prev_V=prev_V, prev_s=prev_s
            )

            return controller_cell, controller_hidden, V, s, r, controller_output, output

        sequences, _ = theano.scan(
            step,
            sequences=[X, T.arange(X.shape[0])],
            outputs_info=[
                init_controller_cell,
                init_controller_hidden,
                init_stack_V,
                init_stack_s,
                init_stack_r,
                None,
                None
            ]
        )

        outputs = T.nnet.softmax(sequences[-1])
        aux['controller_output'] = sequences[-2]

        return outputs
    return predict
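
Same stack-augmented controller as Example #6, here cast to float32 (the dtype Theano requires for GPU execution). In both versions the theano.scan call mixes recurrent and collected-only outputs: entries in outputs_info with an initial value are fed back into step, while None entries (controller_output and output here) are recorded but not fed back. A minimal self-contained sketch of that pattern:

import theano
import theano.tensor as T

x = T.vector('x')

def step(x_t, prev_sum):
    # The first return value pairs with the initial value in outputs_info
    # and is fed back; the second pairs with None and is only collected.
    return prev_sum + x_t, x_t ** 2

(sums, squares), _ = theano.scan(
    step,
    sequences=[x],
    outputs_info=[T.zeros(()), None],
)
f = theano.function([x], [sums, squares])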
Example #9
def build(P,
          name,
          input_size=200,
          z_size=200,
          hidden_layer_size=2500,
          x_extractor_layers=[600] * 4,
          z_extractor_layers=[500] * 4,
          prior_layers=[500] * 4,
          generation_layers=[600] * 4,
          inference_layers=[500] * 4):
    def weight_init(x, y):
        return np.random.uniform(-0.08, 0.08, (x, y))

    X_extractor = feedforward.build_classifier(
        P,
        "x_extractor",
        input_sizes=[input_size],
        hidden_sizes=x_extractor_layers[:-1],
        output_size=x_extractor_layers[-1],
        initial_weights=weight_init,
        output_initial_weights=weight_init,
        activation=T.nnet.relu,
        output_activation=T.nnet.relu)

    Z_extractor = feedforward.build_classifier(
        P,
        "z_extractor",
        input_sizes=[z_size],
        hidden_sizes=z_extractor_layers[:-1],
        output_size=z_extractor_layers[-1],
        initial_weights=weight_init,
        output_initial_weights=weight_init,
        activation=T.nnet.relu,
        output_activation=T.nnet.relu)

    prior = vae.build_inferer(P,
                              "prior",
                              input_sizes=[hidden_layer_size],
                              hidden_sizes=prior_layers,
                              output_size=z_size,
                              initial_weights=weight_init,
                              activation=T.nnet.relu,
                              initialise_outputs=True)

    generate = vae.build_inferer(
        P,
        "generator",
        input_sizes=[hidden_layer_size, z_extractor_layers[-1]],
        hidden_sizes=generation_layers,
        output_size=input_size,
        initial_weights=weight_init,
        activation=T.nnet.relu,
        initialise_outputs=True)

    P.init_recurrence_hidden = np.zeros((hidden_layer_size, ))
    P.init_recurrence_cell = np.zeros((hidden_layer_size, ))
    recurrence = lstm.build_step(
        P,
        "recurrence",
        input_sizes=[x_extractor_layers[-1], z_extractor_layers[-1]],
        hidden_size=hidden_layer_size)

    infer = vae.build_inferer(
        P,
        "infer",
        input_sizes=[hidden_layer_size, x_extractor_layers[-1]],
        hidden_sizes=generation_layers,
        output_size=z_size,
        initial_weights=weight_init,
        activation=T.nnet.relu,
        initialise_outputs=True)

    def sample():
        init_hidden = T.tanh(P.init_recurrence_hidden)
        init_cell = P.init_recurrence_cell
        init_hidden_batch = T.alloc(init_hidden, 1, hidden_layer_size)
        init_cell_batch = T.alloc(init_cell, 1, hidden_layer_size)
        noise = U.theano_rng.normal(size=(40, 1, z_size))

        def _step(eps, prev_cell, prev_hidden):
            _, z_prior_mean, z_prior_std = prior([prev_hidden])
            z_sample = z_prior_mean + eps * z_prior_std
            z_feat = Z_extractor([z_sample])
            _, x_mean, _ = generate([prev_hidden, z_feat])
            x_feat = X_extractor([x_mean])
            curr_cell, curr_hidden = recurrence(x_feat, z_feat, prev_cell,
                                                prev_hidden)
            return curr_cell, curr_hidden, x_mean

        [cells, hiddens, x_means], _ = theano.scan(
            _step,
            sequences=[noise],
            outputs_info=[init_cell_batch, init_hidden_batch, None],
        )
        return x_means

    def extract(X, l):

        init_hidden = T.tanh(P.init_recurrence_hidden)
        init_cell = P.init_recurrence_cell
        init_hidden_batch = T.alloc(init_hidden, X.shape[1], hidden_layer_size)
        init_cell_batch = T.alloc(init_cell, X.shape[1], hidden_layer_size)
        noise = U.theano_rng.normal(size=(X.shape[0], X.shape[1], z_size))
        reset_init_mask = U.theano_rng.binomial(size=(X.shape[0], X.shape[1]),
                                                p=0.025)

        X_feat = X_extractor([X])

        def _step(t, x_feat, eps, reset_mask, prev_cell, prev_hidden):
            reset_mask = reset_mask.dimshuffle(0, 'x')

            _, z_prior_mean, z_prior_std = prior([prev_hidden])
            _, z_mean, z_std = infer([prev_hidden, x_feat])
            z_sample = z_mean + eps * z_std
            z_feat = Z_extractor([z_sample])
            _, x_mean, x_std = generate([prev_hidden, z_feat])

            curr_cell, curr_hidden = recurrence(x_feat, z_feat, prev_cell,
                                                prev_hidden)
            curr_cell = T.switch(reset_mask, init_cell_batch, curr_cell)
            curr_hidden = T.switch(reset_mask, init_hidden_batch, curr_hidden)

            mask = (t < l).dimshuffle(0, 'x')
            return tuple(
                T.switch(mask, out, 0)
                for out in (curr_cell, curr_hidden, z_prior_mean, z_prior_std,
                            z_sample, z_mean, z_std, x_mean, x_std))

        [
            _, _, Z_prior_mean, Z_prior_std, Z_sample, Z_mean, Z_std, X_mean,
            X_std
        ], _ = theano.scan(
            _step,
            sequences=[
                T.arange(X_feat.shape[0]), X_feat, noise, reset_init_mask
            ],
            outputs_info=[init_cell_batch, init_hidden_batch] + [None] * 7,
        )
        return [
            Z_prior_mean,
            Z_prior_std,
            Z_mean,
            Z_std,
            X_mean,
            X_std,
        ]

    return extract, sample
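
extract zeroes out every output past each sequence's length with T.switch((t < l).dimshuffle(0, 'x'), out, 0). A numpy sketch of that length masking on a toy batch:

import numpy as np

steps, batch, dim = 4, 3, 2
l = np.array([4, 2, 3])              # per-sequence lengths
out = np.ones((steps, batch, dim))
for t in range(steps):
    mask = (t < l)[:, None]          # batch x 1, broadcasts over features
    out[t] = np.where(mask, out[t], 0.0)
# out[2:, 1] and out[3:, 2] are now zero; sequence 0 is untouched.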