Пример #1
0
	def create_training_function(self):
		"""Compile the three Theano update functions used for training.

		* ``self.lstm_update_fun`` returns ``self.lstm_cost`` and applies SGD
		  updates for ``self.params`` with learning rate ``self.lstm_lr``.
		* ``self.turing_update_fun`` returns ``self.final_cost`` and applies
		  the updates produced by ``self.turing_updates`` with learning rate
		  ``self.turing_lr``. It is compiled under ``NanGuardMode`` with NaN,
		  Inf and "big value" detection all set to raise errors.
		* ``self.all_update_fun`` returns ``self.final_cost`` and applies both
		  update sets, each built with learning rate ``self.all_lr``.

		Every function takes ``[self.input_mat, self.for_how_long]`` as inputs.
		"""
		fn_inputs = [self.input_mat, self.for_how_long]

		updates, _, _, _, _ = create_optimization_updates(
			self.lstm_cost, self.params, method="SGD", lr=self.lstm_lr)
		self.lstm_update_fun = theano.function(
			inputs=fn_inputs,
			outputs=self.lstm_cost,
			updates=updates,
			allow_input_downcast=True)

		updates_turing = self.turing_updates(self.final_cost, lr=self.turing_lr)
		self.turing_update_fun = theano.function(
			inputs=fn_inputs,
			outputs=self.final_cost,
			updates=updates_turing,
			mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True),
			allow_input_downcast=True)

		updates_all, _, _, _, _ = create_optimization_updates(
			self.final_cost, self.params, method="SGD", lr=self.all_lr, part=True)
		all_updates_turing_temp = self.turing_updates(self.final_cost, lr=self.all_lr)
		# Merge the Turing updates into the SGD updates; when both contain the
		# same key, the Turing update overwrites the SGD one.
		for pair in all_updates_turing_temp:
			updates_all[pair[0]] = pair[1]

		self.all_update_fun = theano.function(
			inputs=fn_inputs,
			outputs=self.final_cost,
			updates=updates_all,
			allow_input_downcast=True)
Пример #2
0
    def _initialize_update_function(self):
        """Build the symbolic training graph and compile it into ``self.update``.

        The time model is run over the input with its last step along axis 1
        dropped. Its output, together with the target with its first step
        along axis 1 dropped, feeds the note model. The loss computed from the
        resulting prediction is minimised with the default updates returned by
        ``create_optimization_updates``.
        """
        def time_step(step_input, *prev_states):
            return self.time_model.forward(step_input, prev_hiddens=prev_states)

        def note_step(step_input, *prev_states):
            return self.note_model.forward(step_input, prev_hiddens=prev_states)

        # Symbolic network input, without its final step along axis 1.
        raw_input = T.btensor4()
        trimmed_input = raw_input[:, :-1]

        # Symbolic target, without its first step along axis 1.
        raw_target = T.btensor4()
        shifted_target = raw_target[:, 1:]

        # Time model pass.
        time_in = self.get_time_model_input(trimmed_input)
        time_info = self.get_outputs_info(time_in, self.time_model.layers)
        time_out = self.get_output(time_step, time_in, time_info)

        # Note model pass, conditioned on the time model output.
        note_in = self.get_note_model_input(trimmed_input, shifted_target, time_out)
        note_info = self.get_outputs_info(note_in, self.note_model.layers)
        note_out = self.get_output(note_step, note_in, note_info)

        loss = self.get_loss(
            shifted_target,
            self.get_prediction(trimmed_input, note_out),
        )

        optimizer_updates, _, _, _, _ = create_optimization_updates(loss, self.params)

        self.update = theano.function(
            inputs=[raw_input, raw_target],
            outputs=loss,
            updates=optimizer_updates,
            allow_input_downcast=True,
        )
Пример #3
0
 def create_training_function(self):
     """Compile ``self.update_fun``, which performs one AdaDelta training step.

     The compiled function takes ``[self.input_mat, self.for_how_long]``,
     returns ``self.cost`` and applies the updates built for ``self.params``.
     """
     adadelta_updates, _gsums, _xsums, _lr, _max_norm = create_optimization_updates(
         self.cost, self.params, method="adadelta")
     fn_inputs = [self.input_mat, self.for_how_long]
     self.update_fun = theano.function(
         inputs=fn_inputs, outputs=self.cost, updates=adadelta_updates,
         allow_input_downcast=True)
Пример #4
0
 def create_training_function(self):
     """Compile ``self.update_fun``, which performs one AdaDelta training step.

     The compiled function takes ``[self.input_mat, self.for_how_long]``,
     returns ``self.cost`` and applies the updates built for ``self.params``.
     """
     optimizer_result = create_optimization_updates(self.cost, self.params, method="adadelta")
     # Only the update mapping is needed; the optimizer's other outputs are unused.
     step_updates, _, _, _, _ = optimizer_result
     compile_kwargs = dict(
         inputs=[self.input_mat, self.for_how_long],
         outputs=self.cost,
         updates=step_updates,
         allow_input_downcast=True)
     self.update_fun = theano.function(**compile_kwargs)
Пример #5
0
 def create_training_function(self):
     """Compile ``self.update_fun``, which performs one AdaDelta training step.

     The compiled function takes
     ``[self.gfs, self.pm25in, self.pm25target, self.steps]``, returns
     ``self.cost`` and applies the updates built for ``self.params``.
     """
     # This is the gradient-descent step: build the parameter updates.
     optimizer_result = create_optimization_updates(self.cost, self.params, method="adadelta")
     step_updates, _gsums, _xsums, _lr, _max_norm = optimizer_result
     fn_inputs = [self.gfs, self.pm25in, self.pm25target, self.steps]
     self.update_fun = theano.function(
         inputs=fn_inputs, outputs=self.cost, updates=step_updates,
         allow_input_downcast=True)
Пример #6
0
 def create_training_function(self):
     """Compile ``self.update_fun``, which performs one AdaGrad training step.

     The compiled function takes
     ``[self.x, self.target0, self.target1, self.steps]``, returns
     ``self.cost`` and applies the updates built for ``self.params`` with a
     learning rate of 0.01. Profiling is disabled.
     """
     # This is the gradient-descent step: build the parameter updates.
     step_updates, _gsums, _xsums, _lr, _max_norm = create_optimization_updates(
         self.cost, self.params, lr=0.01, method="adagrad")
     compile_kwargs = dict(
         inputs=[self.x, self.target0, self.target1, self.steps],
         outputs=self.cost,
         updates=step_updates,
         name='update_fun',
         profile=False,
         allow_input_downcast=True)
     self.update_fun = theano.function(**compile_kwargs)
Пример #7
0
    def _generate_train_model_function(self):
        """Build the pairwise ranking objective and compile two trainers.

        Creates the shared parameters ``self.W`` (``_n_users`` x ``_rank``),
        ``self.H`` (``_n_items`` x ``_rank``) and ``self.B`` (one bias per
        item), then compiles:

        * ``self.train_sgd`` -- plain gradient descent with step size
          ``self._learning_rate``;
        * ``self.train_ada`` -- updates from
          ``theano_lstm.create_optimization_updates`` with
          ``method="adadelta"``.

        Both functions take the int64 index vectors ``(u, i, ni, j, nj)`` and
        return the cost. Each sample contributes two ranking pairs, item ``i``
        against ``ni`` and item ``j`` against ``nj``, all scored for user
        ``u``. The cost is the negated sum of the log-sigmoid score
        differences minus ``self._lambda`` times an L2 penalty on the
        parameters touched by the batch.

        Returns:
            True, once both functions have been compiled.
        """
        u = T.lvector('u')
        i = T.lvector('i')
        ni = T.lvector('ni')
        j = T.lvector('j')
        nj = T.lvector('nj')

        self.W = theano.shared(numpy.random.random(
            (self._n_users, self._rank)).astype('float32'), name='W')
        self.H = theano.shared(numpy.random.random(
            (self._n_items, self._rank)).astype('float32'), name='H')

        self.B = theano.shared(numpy.zeros(
            self._n_items).astype('float32'), name='B')

        user_factors = self.W[u]

        def score(items):
            # Row-wise dot product: sample k only needs W[u[k]] . H[items[k]].
            # This equals diag(W[u] . H[items]^T) but avoids materialising the
            # full batch x batch product just to read its diagonal.
            return (user_factors * self.H[items]).sum(axis=1) + self.B[items]

        def squared_norm(items):
            return (self.H[items] ** 2).sum(axis=1)

        x_ui = score(i)
        x_uni = score(ni)
        x_uj = score(j)
        x_unj = score(nj)

        obj = T.log(T.nnet.sigmoid(x_ui - x_uni)) + T.log(T.nnet.sigmoid(x_uj - x_unj))

        l2 = ((user_factors ** 2).sum(axis=1) +
              squared_norm(i) +
              squared_norm(j) +
              squared_norm(ni) +
              squared_norm(nj) +
              (self.B[i] ** 2 + self.B[j] ** 2 + self.B[ni] ** 2 + self.B[nj] ** 2))

        cost = - T.sum(obj - self._lambda * l2)

        g_cost_W = T.grad(cost=cost, wrt=self.W)
        g_cost_H = T.grad(cost=cost, wrt=self.H)
        g_cost_B = T.grad(cost=cost, wrt=self.B)
        sgd_updates = [(self.W, self.W - self._learning_rate * g_cost_W),
                       (self.H, self.H - self._learning_rate * g_cost_H),
                       (self.B, self.B - self._learning_rate * g_cost_B)]
        self.train_sgd = theano.function(
            inputs=[u, i, ni, j, nj], outputs=cost, updates=sgd_updates)

        # Only the update mapping is needed from the optimizer helper.
        ada_updates, _, _, _, _ = theano_lstm.create_optimization_updates(
            cost, [self.W, self.H, self.B], method="adadelta")
        self.train_ada = theano.function(
            inputs=[u, i, ni, j, nj], outputs=cost, updates=ada_updates, on_unused_input='warn')

        return True
Пример #8
0
def lstmTrain(examples,labels,input_size,num_iterations,steps,saveto=""):
    """Train a stacked-LSTM model on ``examples``/``labels``, one example per update.

    Args:
        examples: sequence of inputs; each element is fed to the compiled
            update function as the ``input_vec`` matrix.
        labels: sequence of targets, iterated in lockstep with ``examples``;
            each element is fed as the int64 ``target`` vector. Must have the
            same length as ``examples`` (checked with an assert). Its length
            also sets ``nodes``, the width of the second LSTM layer and of the
            output layer.
        input_size: input dimension passed to ``StackedCells``.
        num_iterations: number of passes over the zipped examples and labels.
        steps: passed to ``theano.scan`` as ``n_steps``.
        saveto: if non-empty, ``model.params`` is written there with
            ``np.savez`` after training.

    NOTE(review): no return statement appears in the visible part of this
    function -- confirm against the full source.
    """

    print examples,labels
    # NOTE: the string literal below is disabled sample code that would build a
    # small random dataset (and override input_size / num_iterations). It is a
    # bare string expression and is never executed.
    '''rng = np.random.RandomState(123456789)
    input_size = 2
    input_length = 3
    sample_size = 500
    num_iterations = 1
    examples = rng.choice([0,1], (1, input_length,2)).astype(theano.config.floatX)
    #labels = np.array([[1 if np.sum(np.abs(x[:y + 1])) > 5 else 0 for y in range(len(x))]
    #                   for x in examples],
    #                  dtype=theano.config.floatX)
    labels = np.array([[[1,0,1]]],
                      dtype=theano.config.floatX)'''
    # NOTE(review): hidden_layer_size and num_hidden_layers are not used in the
    # visible part of this function; the layer sizes below are hard-coded.
    hidden_layer_size = 10
    num_hidden_layers = 2
    # The number of labels determines the width of the last LSTM layer and of
    # the output layer.
    nodes=len(labels)
    assert len(labels)==len(examples)
    model = StackedCells(input_size,
                         layers=[20,nodes],
                         activation=T.tanh,
                         celltype=LSTM)

    # Make the connections from the input to the first layer have linear activations.
    model.layers[0].in_gate2.activation = lambda x: x

    # Add an output layer applying a softmax; ``[0]`` takes the first row of the softmax result.
    output_layer = Layer(nodes, nodes,lambda x: T.nnet.softmax(x)[0])
    model.layers.append(output_layer)
    #model.layers.append(Layer(3, 3, lambda x: T.nnet.softmax(x)[0]))
    #tensor.nnet.softmax(x)

    #pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U']) + tparams['b'])
    #softmax_layer = Layer(3, 3, T.nnet.sigmoid)
    #softmax_layer.activation = lambda x: T.nnet.softmax(x)
    #model.layers.append(softmax_layer)
    #pred = T.nnet.softmax(tensor.dot(proj, tparams['U']) + tparams['b'])



    # Scan step: run the whole model on one row of the input, given the
    # previous step's outputs.
    def step(x, *prev_hiddens):
        activations = model.forward(x, prev_hiddens=prev_hiddens)
        return activations

    input_vec = T.matrix('input_vec')
    #input_mat=np.zeros((3,2))
    #input_mat=input_vec.dimshuffle((0,'x',1))
    #input_mat = input_vec.dimshuffle((0,'x')).eval({input_vec:examples[0]})
    #print input_mat

    # Recurrent outputs are initialised from each hidden layer's
    # initial_hidden_state; the output layer's slot is initialised from its
    # bias_matrix.
    result, _ = theano.scan(fn=step,
                            sequences=[input_vec],
                            #outputs_info=([dict(initial=input_vec, taps=[-1])] + [dict(initial=layer.initial_hidden_state, taps=[-1]) for layer in model.layers if hasattr(layer, 'initial_hidden_state')]),
                            outputs_info=([dict(initial=hidden_layer.initial_hidden_state)
                                           for hidden_layer in model.layers[:-1]] +[dict(initial=model.layers[-1].bias_matrix)]),
                                          #[dict(initial=T.zeros_like(model.layers[-1].bias_matrix), taps=[-1])]),
                            n_steps=steps)
    #print result[0].eval({input_vec:examples[0]})
    #print model.layers[-1].eval({input_vec:examples[0]})
    #print result[-1].eval({input_vec:examples[0]})
    #print result[-1].T[0].eval({input_vec:examples[0]})
    #target = T.vector('target')
    target=T.vector('target',dtype='int64')

    # Last scan output; presumably the output layer's activations over all steps -- TODO confirm.
    prediction = result[-1]#.T[1]#.eval({examples:rng.choice([0,1], (1, input_length,2)).astype(theano.config.floatX),input_mat:np.zeros((3,2))})
    #cost = T.nnet.binary_crossentropy(prediction, target).mean()
    #pred = T.nnet.softmax(prediction)
    #print 'predict'
    #print pred.eval({input_vec:examples[0]})
    # Mean negative log of the prediction entries selected by ``target``; the
    # 1e-8 keeps the log finite when an entry is zero.
    cost=-T.log(prediction[target] + 1e-8).mean()
    updates, _, _, _, _ = create_optimization_updates(cost, model.params)

    update_func = theano.function([input_vec, target], cost, updates=updates, allow_input_downcast=True,on_unused_input='warn')
    # NOTE(review): predict_func is compiled but not called in the visible part of this function.
    predict_func = theano.function([input_vec], prediction, allow_input_downcast=True,on_unused_input='warn')

    # One parameter update per (example, label) pair, repeated num_iterations times.
    for cur_iter in range(num_iterations):
        for i, (example, label) in enumerate(zip(examples, labels)):
            #print i,example,label
            c = update_func(example, label)
            print "cost",c
            #create_optimization_updates(cost, model.params)
            #if i % 100 == 0:
            #    print "."#, end
        #print()
    # Persist the trained parameters only when a destination was supplied.
    if saveto:
        np.savez(saveto, model.params)
    '''test_cases = [np.array([[-1,1], [1,2],[0,0], [1,3], [2,-2]], dtype=theano.config.floatX)]
Пример #9
0
def main():
    """Train a small stacked LSTM on a toy labelling task and print predictions.

    Builds a random integer dataset, a two-layer LSTM with a sigmoid output
    layer, trains it with the default updates from
    ``create_optimization_updates`` on a mean binary cross-entropy cost, and
    finally prints the per-step predictions for a fixed set of test sequences.
    """
    # Toy dataset: inputs are drawn from {-2, ..., 2}; the label at step y is 1
    # when the sum of absolute input values up to and including y exceeds 5,
    # and 0 otherwise.
    rng = np.random.RandomState(123456789)
    input_size = 1
    input_length = 2
    sample_size = 1
    num_iterations = 3
    examples = rng.choice([-2, -1, 0, 1, 2], (sample_size, input_length)).astype(theano.config.floatX)
    labels = np.array([[1 if np.sum(np.abs(x[:y + 1])) > 5 else 0 for y in range(len(x))]
                       for x in examples],
                      dtype=theano.config.floatX)

    hidden_layer_size = 10
    num_hidden_layers = 2

    model = StackedCells(input_size,
                         layers=[hidden_layer_size] * num_hidden_layers,
                         activation=T.tanh,
                         celltype=LSTM)

    # Make the connections from the input to the first layer have linear activations.
    model.layers[0].in_gate2.activation = lambda x: x

    # Add an output layer to predict the labels for each time step.
    output_layer = Layer(hidden_layer_size, 1, T.nnet.sigmoid)
    model.layers.append(output_layer)

    def step(x, *prev_hiddens):
        # One scan step: run the whole model given the previous step's outputs.
        return model.forward(x, prev_hiddens=prev_hiddens)

    input_vec = T.vector('input_vec')
    # Turn the input vector into a column so scan iterates over one-element rows.
    input_mat = input_vec.dimshuffle((0, 'x'))

    result, _ = theano.scan(fn=step,
                            sequences=[input_mat],
                            outputs_info=([dict(initial=hidden_layer.initial_hidden_state, taps=[-1])
                                           for hidden_layer in model.layers[:-1]] +
                                          [dict(initial=T.zeros_like(model.layers[-1].bias_matrix), taps=[-1])]))

    # Debug output: the second-to-last scan output for the first example.
    # Written as a call so that it parses on both Python 2 and Python 3.
    print(result[-2].eval({input_vec: examples[0]}))
    target = T.vector('target')
    prediction = result[-1].T[0]

    cost = T.nnet.binary_crossentropy(prediction, target).mean()

    updates, _, _, _, _ = create_optimization_updates(cost, model.params)

    update_func = theano.function([input_vec, target], cost, updates=updates, allow_input_downcast=True)
    predict_func = theano.function([input_vec], prediction, allow_input_downcast=True)

    # One parameter update per (example, label) pair; the returned cost is not used.
    for _ in range(num_iterations):
        for example, label in zip(examples, labels):
            update_func(example, label)
        print()

    test_cases = [np.array([-1, 1, 0, 1, -2, 0, 1, 0, 2, 0], dtype=theano.config.floatX),
                  np.array([2, 2, 2, 0, 0, 0], dtype=theano.config.floatX),
                  np.array([-2, -2, -2, 0, 0, 0], dtype=theano.config.floatX),
                  np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0], dtype=theano.config.floatX),
                  np.array([2, 0, 0, 0, 2, 0, 0, 0, 0, -2, 0, 0, 0, 0, 0], dtype=theano.config.floatX),
                  np.array([2, 2, 2, 0, 0, 0, 2, 2, 2, 0], dtype=theano.config.floatX)]

    for example in test_cases:
        print("input", "output")
        for x, pred in zip(example, predict_func(example)):
            print(x, "{:.3f}".format(pred))
        print()
Пример #10
0
    def _generate_train_model_function(self):
        """
          Generates the train model function in Theano.
          This is a straight port of the objective function
          described in the BPR paper.

          We want to learn a matrix factorisation

            U = W.H^T

          where U is the user-item matrix, W is a user-factor
          matrix and H is an item-factor matrix, so that
          it maximises the difference between
          W[u,:].H[i,:]^T and W[u,:].H[j,:]^T,
          where `i` is a positive item
          (one the user `u` has watched) and `j` a negative item
          (one the user `u` hasn't watched).

          Side effects: creates the shared variables `self.W`,
          `self.H` and the item bias `self.B`, and compiles
          `self.train_sgd` (plain gradient descent with step
          `self._learning_rate`) and `self.train_ada` (AdaDelta
          updates from `theano_lstm.create_optimization_updates`).
          Both take the int64 index vectors `(u, i, j)` and
          return the cost.
        """
        u = T.lvector('u')
        i = T.lvector('i')
        j = T.lvector('j')

        self.W = theano.shared(numpy.random.random(
            (self._n_users, self._rank)).astype('float32'),
                               name='W')
        self.H = theano.shared(numpy.random.random(
            (self._n_items, self._rank)).astype('float32'),
                               name='H')

        self.B = theano.shared(numpy.zeros(self._n_items).astype('float32'),
                               name='B')

        user_factors = self.W[u]
        pos_factors = self.H[i]
        neg_factors = self.H[j]

        # Row-wise dot products: sample k only needs W[u[k]] . H[i[k]].
        # This equals diag(W[u] . H[i]^T) but avoids materialising the full
        # batch x batch product just to read its diagonal.
        x_ui = (user_factors * pos_factors).sum(axis=1) + self.B[i]
        x_uj = (user_factors * neg_factors).sum(axis=1) + self.B[j]

        x_uij = x_ui - x_uj

        # Log-likelihood of the observed ordering minus the L2 penalties, each
        # with its own regularisation weight.
        obj_uij = T.sum(
            T.log(T.nnet.sigmoid(x_uij)) - self._lambda_u *
            (user_factors**2).sum(axis=1) - self._lambda_i *
            (pos_factors**2).sum(axis=1) - self._lambda_j *
            (neg_factors**2).sum(axis=1) - self._lambda_bias *
            (self.B[i]**2 + self.B[j]**2))
        cost = -obj_uij

        g_cost_W = T.grad(cost=cost, wrt=self.W)
        g_cost_H = T.grad(cost=cost, wrt=self.H)
        g_cost_B = T.grad(cost=cost, wrt=self.B)
        sgd_updates = [(self.W, self.W - self._learning_rate * g_cost_W),
                       (self.H, self.H - self._learning_rate * g_cost_H),
                       (self.B, self.B - self._learning_rate * g_cost_B)]
        self.train_sgd = theano.function(inputs=[u, i, j],
                                         outputs=cost,
                                         updates=sgd_updates)

        # Only the update mapping is needed from the optimizer helper.
        ada_updates, _, _, _, _ = theano_lstm.create_optimization_updates(
            cost, [self.W, self.H, self.B], method="adadelta")
        self.train_ada = theano.function(inputs=[u, i, j],
                                         outputs=cost,
                                         updates=ada_updates)
Пример #11
0
    def setup_train(self):
        """Build the symbolic training graph and compile the training functions.

        Creates the symbolic inputs ``self.input_mat`` and ``self.output_mat``,
        runs ``self.time_model`` in a scan over the time axis, feeds its last
        layer (rotated so that notes become the scan axis) together with the
        previously chosen note values into ``self.pitch_model``, and defines
        ``self.cost`` as the negative sum of the masked log-likelihoods.

        Sets ``self.time_thoughts`` and ``self.note_thoughts`` to the raw scan
        results and compiles:

        * ``self.update_fun`` -- returns the cost and applies the AdaDelta
          updates for ``self.params``;
        * ``self.update_thought_fun`` -- returns both scan results followed by
          the cost, without applying any updates.
        """

        # dimensions: (batch, time, notes, input_data) with input_data as in architecture
        self.input_mat = T.btensor4()
        # dimensions: (batch, time, notes, onOrArtic) with 0:on, 1:artic
        self.output_mat = T.btensor4()
        
        # Smallest float32 increment above 1.0; added inside the log below so it never sees exactly zero.
        self.epsilon = np.spacing(np.float32(1.0))

        # Scan step for the time model. With dropout enabled, the trailing
        # len(self.t_layer_sizes) entries of `other` are the dropout masks
        # (passed to scan as non_sequences) and the rest are the previous
        # hidden states; a None is prepended to the mask list.
        def step_time(in_data, *other):
            other = list(other)
            split = -len(self.t_layer_sizes) if self.dropout else len(other)
            hiddens = other[:split]
            masks = [None] + other[split:] if self.dropout else []
            new_states = self.time_model.forward(in_data, prev_hiddens=hiddens, dropout=masks)
            return new_states
        
        # Scan step for the pitch model; same argument layout as step_time,
        # with the mask count taken from self.p_layer_sizes.
        def step_note(in_data, *other):
            other = list(other)
            split = -len(self.p_layer_sizes) if self.dropout else len(other)
            hiddens = other[:split]
            masks = [None] + other[split:] if self.dropout else []
            new_states = self.pitch_model.forward(in_data, prev_hiddens=hiddens, dropout=masks)
            return new_states
        
        # We generate an output for each input, so it doesn't make sense to use the last output as an input.
        # Note that we assume the sentinel start value is already present
        # TEMP CHANGE: NO SENTINEL
        input_slice = self.input_mat[:,0:-1]
        n_batch, n_time, n_note, n_ipn = input_slice.shape
        
        # time_inputs is a matrix (time, batch/note, input_per_note)
        time_inputs = input_slice.transpose((1,0,2,3)).reshape((n_time,n_batch*n_note,n_ipn))
        num_time_parallel = time_inputs.shape[1]
        
        # apply dropout
        if self.dropout > 0:
            time_masks = theano_lstm.MultiDropout( [(num_time_parallel, shape) for shape in self.t_layer_sizes], self.dropout)
        else:
            time_masks = []

        time_outputs_info = [initial_state_with_taps(layer, num_time_parallel) for layer in self.time_model.layers]
        time_result, _ = theano.scan(fn=step_time, sequences=[time_inputs], non_sequences=time_masks, outputs_info=time_outputs_info)
        
        self.time_thoughts = time_result
        
        # Now time_result is a list of matrix [layer](time, batch/note, hidden_states) for each layer but we only care about 
        # the hidden state of the last layer.
        # Transpose to be (note, batch/time, hidden_states)
        last_layer = get_last_layer(time_result)
        n_hidden = last_layer.shape[2]
        time_final = get_last_layer(time_result).reshape((n_time,n_batch,n_note,n_hidden)).transpose((2,1,0,3)).reshape((n_note,n_batch*n_time,n_hidden))
        
        # note_choices_inputs represents the last chosen note. Starts with [0,0], doesn't include last note.
        # In (note, batch/time, 2) format
        # Shape of start is thus (1, N, 2), concatenated with all but last element of output_mat transformed to (x, N, 2)
        start_note_values = T.alloc(np.array(0,dtype=np.int8), 1, time_final.shape[1], 2 )
        correct_choices = self.output_mat[:,1:,0:-1,:].transpose((2,0,1,3)).reshape((n_note-1,n_batch*n_time,2))
        note_choices_inputs = T.concatenate([start_note_values, correct_choices], axis=0)
        
        # Together, this and the output from the last LSTM goes to the new LSTM, but rotated, so that the batches in
        # one direction are the steps in the other, and vice versa.
        note_inputs = T.concatenate( [time_final, note_choices_inputs], axis=2 )
        num_timebatch = note_inputs.shape[1]
        
        # apply dropout
        if self.dropout > 0:
            pitch_masks = theano_lstm.MultiDropout( [(num_timebatch, shape) for shape in self.p_layer_sizes], self.dropout)
        else:
            pitch_masks = []

        note_outputs_info = [initial_state_with_taps(layer, num_timebatch) for layer in self.pitch_model.layers]
        note_result, _ = theano.scan(fn=step_note, sequences=[note_inputs], non_sequences=pitch_masks, outputs_info=note_outputs_info)
        
        self.note_thoughts = note_result
        
        # Now note_result is a list of matrix [layer](note, batch/time, onOrArticProb) for each layer but we only care about 
        # the hidden state of the last layer.
        # Transpose to be (batch, time, note, onOrArticProb)
        note_final = get_last_layer(note_result).reshape((n_note,n_batch,n_time,2)).transpose(1,2,0,3)
        
        # The cost of the entire procedure is the negative log likelihood of the events all happening.
        # For the purposes of training, if the outputted probability is P, then the likelihood of seeing a 1 is P, and
        # the likelihood of seeing 0 is (1-P). So the likelihood is (1-P)(1-x) + Px = 2Px - P - x + 1
        # Since they are all binary decisions, and are all probabilities given all previous decisions, we can just
        # multiply the likelihoods, or, since we are logging them, add the logs.
        
        # Note that we mask out the articulations for those notes that aren't played, because it doesn't matter
        # whether or not those are articulated.
        # The padright is there because self.output_mat[:,:,:,0] -> 3D tensor with (b,x,y), but we need a 4D tensor with
        # (b,x,y,1) instead
        active_notes = T.shape_padright(self.output_mat[:,1:,:,0])
        mask = T.concatenate([T.ones_like(active_notes),active_notes], axis=3)
        
        loglikelihoods = mask * T.log( 2*note_final*self.output_mat[:,1:] - note_final - self.output_mat[:,1:] + 1 + self.epsilon )
        self.cost = T.neg(T.sum(loglikelihoods))
        
        updates, _, _, _, _ = create_optimization_updates(self.cost, self.params, method="adadelta")
        # Training step: returns the cost and applies the AdaDelta updates.
        self.update_fun = theano.function(
            inputs=[self.input_mat, self.output_mat],
            outputs=self.cost,
            updates=updates,
            allow_input_downcast=True)

        # Inspection function: same inputs, but no updates; returns the raw scan results plus the cost.
        self.update_thought_fun = theano.function(
            inputs=[self.input_mat, self.output_mat],
            outputs= ensure_list(self.time_thoughts) + ensure_list(self.note_thoughts) + [self.cost],
            allow_input_downcast=True)
Пример #12
0
    def setup_train(self):
        """Build the symbolic training graph and compile the training functions.

        Creates the symbolic inputs ``self.input_mat`` and ``self.output_mat``,
        runs ``self.time_model`` in a scan over the time axis, feeds its last
        layer (rotated so that notes become the scan axis) together with the
        previously chosen note values into ``self.pitch_model``, and defines
        ``self.cost`` as the negative sum of the masked log-likelihoods.

        Sets ``self.time_thoughts`` and ``self.note_thoughts`` to the raw scan
        results and compiles:

        * ``self.update_fun`` -- returns the cost and applies the AdaDelta
          updates for ``self.params``;
        * ``self.update_thought_fun`` -- returns both scan results followed by
          the cost, without applying any updates.

        Progress markers ("model-setup-train::Trace-N") are printed between
        the construction stages.
        """

        # dimensions: (batch, time, notes, input_data) with input_data as in architecture
        self.input_mat = T.btensor4()
        # dimensions: (batch, time, notes, onOrArtic) with 0:on, 1:artic
        self.output_mat = T.btensor4()

        # Smallest float32 increment above 1.0; added inside the log below so it never sees exactly zero.
        self.epsilon = np.spacing(np.float32(1.0))

        print "model-setup-train::Trace-1"


        # Scan step for the time model. With dropout enabled, the trailing
        # len(self.t_layer_sizes) entries of `other` are the dropout masks
        # (passed to scan as non_sequences) and the rest are the previous
        # hidden states; a None is prepended to the mask list.
        def step_time(in_data, *other):
            other = list(other)
            split = -len(self.t_layer_sizes) if self.dropout else len(other)
            hiddens = other[:split]
            masks = [None] + other[split:] if self.dropout else []
            new_states = self.time_model.forward(in_data, prev_hiddens=hiddens, dropout=masks)
            return new_states

        # Scan step for the pitch model; same argument layout as step_time,
        # with the mask count taken from self.p_layer_sizes.
        def step_note(in_data, *other):
            other = list(other)
            split = -len(self.p_layer_sizes) if self.dropout else len(other)
            hiddens = other[:split]
            masks = [None] + other[split:] if self.dropout else []
            new_states = self.pitch_model.forward(in_data, prev_hiddens=hiddens, dropout=masks)
            return new_states

        # We generate an output for each input, so it doesn't make sense to use the last output as an input.
        # Note that we assume the sentinel start value is already present
        # TEMP CHANGE: NO SENTINEL

        print "model-setup-train::Trace-2"

        input_slice = self.input_mat[:,0:-1]
        n_batch, n_time, n_note, n_ipn = input_slice.shape

        # time_inputs is a matrix (time, batch/note, input_per_note)
        time_inputs = input_slice.transpose((1,0,2,3)).reshape((n_time,n_batch*n_note,n_ipn))
        num_time_parallel = time_inputs.shape[1]

        # apply dropout
        if self.dropout > 0:
            time_masks = MultiDropout( [(num_time_parallel, shape) for shape in self.t_layer_sizes], self.dropout)
        else:
            time_masks = []

        print "model-setup-train::Trace-3"

        time_outputs_info = [initial_state_with_taps(layer, num_time_parallel) for layer in self.time_model.layers]
        time_result, _ = theano.scan(fn=step_time, sequences=[time_inputs], non_sequences=time_masks, outputs_info=time_outputs_info)

        print "model-setup-train::Trace-4"


        self.time_thoughts = time_result

        # Now time_result is a list of matrix [layer](time, batch/note, hidden_states) for each layer but we only care about
        # the hidden state of the last layer.
        # Transpose to be (note, batch/time, hidden_states)
        last_layer = get_last_layer(time_result)
        n_hidden = last_layer.shape[2]
        time_final = get_last_layer(time_result).reshape((n_time,n_batch,n_note,n_hidden)).transpose((2,1,0,3)).reshape((n_note,n_batch*n_time,n_hidden))

        # note_choices_inputs represents the last chosen note. Starts with [0,0], doesn't include last note.
        # In (note, batch/time, 2) format
        # Shape of start is thus (1, N, 2), concatenated with all but last element of output_mat transformed to (x, N, 2)
        start_note_values = T.alloc(0, 1, time_final.shape[1], 2 )
        correct_choices = self.output_mat[:,1:,0:-1,:].transpose((2,0,1,3)).reshape((n_note-1,n_batch*n_time,2))
        note_choices_inputs = T.concatenate([start_note_values, correct_choices], axis=0)

        print "model-setup-train::Trace-5"


        # Together, this and the output from the last LSTM goes to the new LSTM, but rotated, so that the batches in
        # one direction are the steps in the other, and vice versa.
        note_inputs = T.concatenate( [time_final, note_choices_inputs], axis=2 )
        num_timebatch = note_inputs.shape[1]

        # apply dropout
        if self.dropout > 0:
            pitch_masks = MultiDropout( [(num_timebatch, shape) for shape in self.p_layer_sizes], self.dropout)
        else:
            pitch_masks = []

        print "model-setup-train::Trace-6"


        note_outputs_info = [initial_state_with_taps(layer, num_timebatch) for layer in self.pitch_model.layers]
        note_result, _ = theano.scan(fn=step_note, sequences=[note_inputs], non_sequences=pitch_masks, outputs_info=note_outputs_info)

        self.note_thoughts = note_result

        # Now note_result is a list of matrix [layer](note, batch/time, onOrArticProb) for each layer but we only care about
        # the hidden state of the last layer.
        # Transpose to be (batch, time, note, onOrArticProb)
        note_final = get_last_layer(note_result).reshape((n_note,n_batch,n_time,2)).transpose(1,2,0,3)

        print "model-setup-train::Trace-7"


        # The cost of the entire procedure is the negative log likelihood of the events all happening.
        # For the purposes of training, if the outputted probability is P, then the likelihood of seeing a 1 is P, and
        # the likelihood of seeing 0 is (1-P). So the likelihood is (1-P)(1-x) + Px = 2Px - P - x + 1
        # Since they are all binary decisions, and are all probabilities given all previous decisions, we can just
        # multiply the likelihoods, or, since we are logging them, add the logs.

        # Note that we mask out the articulations for those notes that aren't played, because it doesn't matter
        # whether or not those are articulated.
        # The padright is there because self.output_mat[:,:,:,0] -> 3D tensor with (b,x,y), but we need a 4D tensor with
        # (b,x,y,1) instead
        active_notes = T.shape_padright(self.output_mat[:,1:,:,0])
        mask = T.concatenate([T.ones_like(active_notes),active_notes], axis=3)

        loglikelihoods = mask * T.log( 2*note_final*self.output_mat[:,1:] - note_final - self.output_mat[:,1:] + 1 + self.epsilon )

        print "model-setup-train::Trace-8"

        self.cost = T.neg(T.sum(loglikelihoods))

        print "model-setup-train::Trace-9"

        updates, _, _, _, _ = create_optimization_updates(self.cost, self.params, method="adadelta")

        print "model-setup-train::Trace-10"

        # Training step: returns the cost and applies the AdaDelta updates.
        self.update_fun = theano.function(
            inputs=[self.input_mat, self.output_mat],
            outputs=self.cost,
            updates=updates,
            allow_input_downcast=True)


        # Inspection function: same inputs, but no updates; returns the raw scan results plus the cost.
        self.update_thought_fun = theano.function(
            inputs=[self.input_mat, self.output_mat],
            outputs= ensure_list(self.time_thoughts) + ensure_list(self.note_thoughts) + [self.cost],
            allow_input_downcast=True)
    def __init__(self,
                 word_size,
                 vocabulary_size,
                 stack_size,
                 hidden_size,
                 hidden_price_size,
                 price_stack_size,
                 output_vocabulary,
                 index2word,
                 word2index,
                 index2category,
                 category2index,
                 memory_sparsity = 0.0001,
                 rho = 0.95,
                 verbose=False,
                 theano_mode = "FAST_RUN"):
        

        self.index2word = index2word
        self.word2index = word2index
        self.index2category = index2category
        self.category2index = category2index

        self.memory_sparsity= theano.shared(np.float64(memory_sparsity), name="memory_sparsity")
        self.theano_mode = theano_mode
        self.word_size = word_size
        self.vocabulary_size = theano.shared(np.int32(vocabulary_size), name="vocabulary_size")
        self.stack_size = stack_size
        self.hidden_size = hidden_size
        self.output_vocabulary = output_vocabulary
        
        ### CREATE THE CELLS:
        
        model = theano_lstm.StackedCells(word_size, layers=[hidden_size] * stack_size, celltype=theano_lstm.LSTM, activation=T.tanh)
        # add a softmax layer at the end (non-recurrent)
        
        # special end token:
        model.layers.append(theano_lstm.Layer(hidden_size, output_vocabulary + 1, to_softmax))
        
        # add an embedding layer at the beginning (non-recurrent):
        model.layers = [theano_lstm.Embedding(vocabulary_size + output_vocabulary + 1, word_size),
                        theano_lstm.GatedInput(word_size, hidden_size, T.nnet.sigmoid)] + model.layers
        self.model = model
        
        model2 = theano_lstm.StackedCells(hidden_size, layers=[hidden_price_size] * (price_stack_size - 1) + [1], celltype=theano_lstm.Layer, activation=T.tanh)
        
        # price is a linear function of its inputs:
        model2.layers[-1].activation = T.exp
        
        self.price_model = model2
        
        
        ### CONSTRUCT THE PREDICTION / WIRING:
        
        def step(word_id, *prev_hiddens):
            if prev_hiddens[-1].ndim > 1:
                top_level_activ = prev_hiddens[-1][:, self.hidden_size:]
            else:
                top_level_activ = prev_hiddens[-1][self.hidden_size:]
            
            new_state = model.forward(word_id, [None, top_level_activ] + list(prev_hiddens), [])
            # all outputs should be returned, except embeddings, and the first gates
            return new_state[1:]
        
        def pred_step(word_id, *prev_hiddens):
            if prev_hiddens[-1].ndim > 1:
                top_level_activ = prev_hiddens[-1][:, self.hidden_size:]
            else:
                top_level_activ = prev_hiddens[-1][self.hidden_size:]
            new_state = model.forward(word_id, [None, top_level_activ] + list(prev_hiddens), [])
            # all outputs should be returned, except embeddings, and the first gates
            return [T.cast(new_state[-1].argmax() + self.vocabulary_size, dtype='int32')] + new_state[2:-1]
        
        def predict_sequence(x, lengths, return_all=False, return_memory=False):
            if x.ndim > 1:
                outputs_info = [None] + [dict(initial=T.repeat(T.shape_padleft(layer.initial_hidden_state), x.shape[0], axis=0), taps=[-1]) for layer in model.layers if hasattr(layer, 'initial_hidden_state')]
            else:
                outputs_info = [None] + [dict(initial=layer.initial_hidden_state, taps=[-1]) for layer in model.layers if hasattr(layer, 'initial_hidden_state')]
            outputs_info = outputs_info + [None]
            result, updates = theano.scan(step,
                                          sequences = [x.T if x.ndim > 1 else x],
                                          outputs_info = outputs_info)

            if return_all:
                return result
            else:
                res = result[-1].dimshuffle(1, 0, 2) if x.ndim > 1 else result[-1]
                
                price_preds = self.price_model.forward(
                                self.model.layers[-2].postprocess_activation(
                                    result[-2][lengths, T.arange(0, lengths.shape[0])]
                                ), None, []
                            )[-1][:,0] if x.ndim > 1 else \
                            self.price_model.forward(
                                self.model.layers[-2].postprocess_activation(
                                    result[-2][-1]
                            ), None, [])[-1][0]
                # gate values can be obtained by asking for them from the stacked cells
                if return_memory:
                    return result[0], res, price_preds
                else:
                    return res, price_preds
        
        
        # every sequence is a series of indices
        # for words:
        input_sentences    = T.imatrix()
        
        # some sequences are shorter than others, so we'll note where they
        # end in a zero-indexed fashion
        sequence_lengths  = T.ivector()
        sequence_starts   = T.ivector()
        # the labels are integers in the range of dictionary
        
        self.input_sentences = input_sentences
        self.sequence_lengths = sequence_lengths
        self.sequence_starts = sequence_starts
        
        self.prices = T.vector()
        
        memory_usage, self.predictions, self.price_predictions = predict_sequence(input_sentences, self.sequence_starts, return_memory=True)
        
        self.error = (
            theano_lstm.masked_loss(
                self.predictions,
                input_sentences[:,1:] - self.vocabulary_size,
                sequence_lengths,
                sequence_starts).mean() +
            (memory_usage.sum() * self.memory_sparsity) / input_sentences.shape[0] +
            ((self.price_predictions - self.prices)**2).mean()
        )
        
        self.memory_fun = theano.function([input_sentences], memory_usage,
                                           allow_input_downcast=True,
                                           mode=self.theano_mode)
        
        self.price_predict_fun = theano.function([input_sentences, sequence_starts],
                                           self.price_predictions,
                                           allow_input_downcast=True,
                                           mode=self.theano_mode)
        
        self.predict_fun = theano.function([input_sentences],
                                           self.predictions,
                                           allow_input_downcast=True,
                                           mode=self.theano_mode)
        self.error_fun = theano.function([input_sentences, sequence_lengths, sequence_starts, self.prices],
                                         self.error,
                                         allow_input_downcast=True,
                                         mode=self.theano_mode)
        
        self.input_sentence = T.ivector()
        
        prep_result = predict_sequence(self.input_sentence, None, return_all=True)
        
        pred_outputs_info = [dict(initial=self.input_sentence[-1], taps=[-1])] + [dict(initial=prep_hidden[-1], taps=[-1]) for prep_hidden in prep_result[1:-1]]
        
        prediction_steps = T.iscalar()
        pred_result, _ = theano.scan(pred_step,
                                     n_steps = prediction_steps,
                                     outputs_info = pred_outputs_info)
        
        self.reconstruct_fun = theano.function([self.input_sentence, prediction_steps],
                                               pred_result[0],
                                               allow_input_downcast=True,
                                               mode=self.theano_mode)
        self.input_labels = theano.function([input_sentences],
                                            input_sentences[:,1:] - self.vocabulary_size,
                                            mode=self.theano_mode)
        
        if verbose:
            print("created prediction & error functions")
        updates, gsums, xsums, lr, max_norm = theano_lstm.create_optimization_updates(self.error, model.params + model2.params, max_norm=None, rho=rho, method="adadelta")
        self.lr = lr
        if verbose:
            print("took the gradient")
        
        self.gsums = gsums
        self.xsums = xsums
        
        self.update_fun = theano.function([input_sentences, sequence_lengths, sequence_starts, self.prices],
                                          outputs=None,
                                          updates=updates,
                                          mode=self.theano_mode)
        if verbose:
            print("created the gradient descent function")
    def setup_train(self):
        """Build the symbolic training graph and compile ``self.update_fun``.

        Two ``theano.scan`` passes are chained.  ``self.time_model`` is
        scanned along the time axis of the input; the output of its last
        layer is then concatenated with the previously chosen notes and
        scanned along the note axis by ``self.pitch_model``.  The cost is
        ``self.loss_func`` applied to the shifted target and the pitch-pass
        output, and it is minimised with adadelta updates over
        ``self.params``.

        Side effects: sets ``self.input_mat``, ``self.output_mat``,
        ``self.time_thoughts``, ``self.note_thoughts``, ``self.cost`` and
        ``self.update_fun``, and prints progress to stdout.
        """
        print('{:25}'.format("Setup Train"), end='', flush=True)

        self.input_mat = T.btensor4()
        self.output_mat = T.btensor4()

        # A single predicate decides both whether masks are generated and
        # whether the step functions expect them, so the two cannot disagree.
        use_dropout = self.dropout > 0

        def make_step(model, layer_sizes):
            """Return the scan step that advances ``model`` by one position."""
            n_masks = len(layer_sizes) if use_dropout else 0

            def step(in_data, *other):
                # scan passes the previous hidden states first and the
                # non-sequences (here: the dropout masks) last.
                other = list(other)
                split = len(other) - n_masks
                hiddens = other[:split]
                # NOTE(review): the leading None presumably means "no mask
                # for the raw input" -- confirm against the model's forward().
                masks = [None] + other[split:] if use_dropout else []
                return model.forward(in_data,
                                     prev_hiddens=hiddens,
                                     dropout=masks)

            return step

        def get_dropout(layers, num_time_parallel=1):
            """Return dropout masks for ``layers``, or ``[]`` without dropout."""
            if not use_dropout:
                return []
            return theano_lstm.MultiDropout(
                [(num_time_parallel, shape) for shape in layers],
                self.dropout)

        # TIME PASS
        # Drop the final time step: it has no following step to predict.
        input_slice = self.input_mat[:, 0:-1]
        n_batch, n_time, n_note, n_ipn = input_slice.shape
        # Put time first and fold batch and note together so that scan
        # iterates over time with every (batch, note) pair in parallel.
        time_inputs = input_slice.transpose((1, 0, 2, 3)).reshape(
            (n_time, n_batch * n_note, n_ipn))

        time_masks = get_dropout(self.t_layer_sizes, time_inputs.shape[1])
        time_outputs_info = [
            initial_state_with_taps(layer, time_inputs.shape[1])
            for layer in self.time_model.layers
        ]
        time_result, _ = theano.scan(
            fn=make_step(self.time_model, self.t_layer_sizes),
            sequences=[time_inputs],
            non_sequences=time_masks,
            outputs_info=time_outputs_info)
        self.time_thoughts = time_result

        last_layer = get_last_layer(time_result)
        n_hidden = last_layer.shape[2]
        # Unfold (batch, note), then put note first and fold batch and time
        # together so that the second scan iterates over notes.
        time_final = last_layer.reshape(
            (n_time, n_batch, n_note, n_hidden)).transpose(
                (2, 1, 0, 3)).reshape((n_note, n_batch * n_time, n_hidden))

        # PITCH PASS
        # Each note is fed the target of the note before it; the first note
        # gets an all-zero row instead.
        start_note_values = T.alloc(np.array(0, dtype=np.int8), 1,
                                    time_final.shape[1], self.output_size)
        correct_choices = self.output_mat[:, 1:, 0:-1, :].transpose(
            (2, 0, 1, 3)).reshape(
                (n_note - 1, n_batch * n_time, self.output_size))
        note_choices_inputs = T.concatenate(
            [start_note_values, correct_choices], axis=0)

        note_inputs = T.concatenate([time_final, note_choices_inputs], axis=2)

        note_masks = get_dropout(self.p_layer_sizes, note_inputs.shape[1])
        note_outputs_info = [
            initial_state_with_taps(layer, note_inputs.shape[1])
            for layer in self.pitch_model.layers
        ]
        note_result, _ = theano.scan(
            fn=make_step(self.pitch_model, self.p_layer_sizes),
            sequences=[note_inputs],
            non_sequences=note_masks,
            outputs_info=note_outputs_info)

        self.note_thoughts = note_result

        # Back to (batch, time, note, output) to line up with the targets.
        note_final = get_last_layer(note_result).reshape(
            (n_note, n_batch, n_time, self.output_size)).transpose(1, 2, 0, 3)

        self.cost = self.loss_func(self.output_mat[:, 1:], note_final)

        updates, _, _, _, _ = create_optimization_updates(self.cost,
                                                          self.params,
                                                          method="adadelta")
        self.update_fun = theano.function(
            inputs=[self.input_mat, self.output_mat],
            outputs=self.cost,
            updates=updates,
            allow_input_downcast=True)

        print("Done")
Пример #15
0
    def __init__(self,
            hidden_size,
            internal_features,
            intermediate_size,
            vocab_size,
            num_answers,
            tensor=True,
            method="sgd"):
        """Create the model parameters and compile its Theano functions.

        Args:
            hidden_size: second argument given to each ``Embedding``; the
                projection parameters use ``3 * hidden_size`` as a dimension.
            internal_features: middle dimension of the optional tensors
                ``q_form_U`` / ``q_form_V``.
            intermediate_size: leading dimension of the tensors, the bias and
                the projection matrix.
            vocab_size: first argument given to each ``Embedding``.
            num_answers: number of (answer, target) input pairs accepted by
                ``error_fun`` and ``update_gradient``.
            tensor: when true, ``q_form_U`` and ``q_form_V`` are created and
                added to ``self.params``.
            method: optimisation method forwarded to
                ``create_optimization_updates``.

        Compiled functions stored on the instance: ``score_triplet``,
        ``error_fun``, ``update_gradient`` (adds the current gradients to
        ``self.gradient_caches``) and ``apply_gradient`` (applies the
        optimiser updates and resets the caches to zero).
        """
        self.text_embedding = Embedding(vocab_size, hidden_size)
        self.question_embedding = Embedding(vocab_size, hidden_size)
        self.answer_embedding = Embedding(vocab_size, hidden_size)
        self.params = (self.text_embedding.params
                       + self.question_embedding.params
                       + self.answer_embedding.params)
        self.tensor = tensor
        if tensor:
            tensor_dims = (intermediate_size, internal_features, 3 * hidden_size)
            self.q_form_U = create_shared("question_answer_tensor", *tensor_dims)
            self.q_form_V = create_shared("question_answer_tensor", *tensor_dims)
            self.params.extend([self.q_form_U, self.q_form_V])

        # affine parameters
        self.bias = create_shared("bias", intermediate_size)
        self.projection_mat = create_shared(
            "projection_mat", intermediate_size, 3 * hidden_size)
        # NOTE(review): scoring_mat is not added to self.params in this
        # method, so the updates built below never change it -- confirm that
        # this is intended.
        self.scoring_mat = create_shared("scoring_mat", 1, intermediate_size)

        self.params += [self.bias, self.projection_mat]

        # triplet scoring function
        sentence = T.ivector()
        question = T.ivector()
        answer = T.ivector()
        self.score_triplet = theano.function(
            [sentence, question, answer],
            self.get_score(sentence, question, answer),
            allow_input_downcast=True)

        # error function: inputs are the sentence, the question, and then
        # answer_0, target_0, answer_1, target_1, ...
        answers = [T.ivector() for _ in range(num_answers)]
        targets = [T.fscalar() for _ in range(num_answers)]
        answer_targets = [var for pair in zip(answers, targets) for var in pair]
        error_inputs = [sentence, question] + answer_targets

        error = self.get_error(sentence, question, *answer_targets)
        self.error_fun = theano.function(
            error_inputs, error, allow_input_downcast=True)

        gparams = T.grad(error, self.params, disconnected_inputs='ignore')

        # one zero-initialised accumulator per parameter
        self.gradient_caches = [
            theano.shared(param.get_value(True, True) * 0.0,
                          borrow=True,
                          name=param.name + "_grad")
            for param in self.params]

        accumulate = OrderedDict(
            (cache, cache + grad)
            for cache, grad in zip(self.gradient_caches, gparams))

        self.update_gradient = theano.function(
            error_inputs,
            error,
            updates=accumulate,
            allow_input_downcast=True)

        # training step: the optimiser reads the accumulated gradients
        # instead of differentiating a cost
        true_updates, self.gsums, self.xsums, self.lr, _ = create_optimization_updates(
            None,
            self.params,
            method=method,
            gradients=self.gradient_caches)

        # applying the step also clears the accumulators
        for cache in self.gradient_caches:
            true_updates[cache] = T.zeros_like(cache)

        self.apply_gradient = theano.function(
            inputs=[], outputs=[], updates=true_updates)
    def __init__(self, hidden_size, vocab_size, num_answers):
        """Create the model parameters and compile its Theano functions.

        Args:
            hidden_size: second argument given to ``Embedding``; also the
                last two dimensions of the ``q_form`` tensor.
            vocab_size: first argument given to ``Embedding``.
            num_answers: number of (answer, target) input pairs accepted by
                ``error_fun`` and ``update_gradient``.

        Compiled functions stored on the instance: ``score_triplet``,
        ``error_fun``, ``update_gradient`` (adds the current gradients to
        ``self.gradient_caches``) and ``apply_gradient`` (applies the "sgd"
        updates and resets the caches to zero).
        """
        self.embedding = Embedding(vocab_size, hidden_size)
        self.q_form = create_shared("tensor", 1, hidden_size, hidden_size)
        self.params = self.embedding.params + [self.q_form]

        # triplet scoring function
        sentence = T.ivector()
        question = T.ivector()
        answer = T.ivector()
        self.score_triplet = theano.function(
            [sentence, question, answer],
            self.get_score(sentence, question, answer),
            allow_input_downcast=True)

        # error function: inputs are the sentence, the question, and then
        # answer_0, target_0, answer_1, target_1, ...
        answers = [T.ivector() for _ in range(num_answers)]
        targets = [T.fscalar() for _ in range(num_answers)]
        answer_targets = [var for pair in zip(answers, targets) for var in pair]
        error_inputs = [sentence, question] + answer_targets

        error = self.get_error(sentence, question, *answer_targets)
        self.error_fun = theano.function(
            error_inputs, error, allow_input_downcast=True)

        gparams = T.grad(error, self.params)

        # one zero-initialised accumulator per parameter
        self.gradient_caches = [
            theano.shared(param.get_value(True, True) * 0.0,
                          borrow=True,
                          name=param.name + "_grad")
            for param in self.params]

        accumulate = OrderedDict(
            (cache, cache + grad)
            for cache, grad in zip(self.gradient_caches, gparams))

        self.update_gradient = theano.function(
            error_inputs,
            error,
            updates=accumulate,
            allow_input_downcast=True)

        # training step: the optimiser reads the accumulated gradients
        # instead of differentiating a cost
        true_updates, _, _, self.lr, _ = create_optimization_updates(
            None,
            self.params,
            method="sgd",
            gradients=self.gradient_caches)

        # applying the step also clears the accumulators
        for cache in self.gradient_caches:
            true_updates[cache] = T.zeros_like(cache)

        self.apply_gradient = theano.function(
            inputs=[], outputs=[], updates=true_updates)