Example #1

import os

import numpy as np
import scipy.optimize
import theano
import theano.tensor as T

#: ConvolutionalLayer, HiddenLayer and OutputLayer are assumed to be the
#  project's own layer classes (their module is not shown here), e.g.:
#from layers import ConvolutionalLayer, HiddenLayer, OutputLayer
class NeuralNet:

    def __init__(self, input_shape, filter_shapes, strides, n_hidden, n_out):
        '''
        Initialize a NeuralNet

        @param input_shape: tuple or list of length 4, (batch size, num input feature maps,
                             image height, image width)
        @param filter_shapes: list of 2 four-element lists (one per conv layer): (number of filters,
                              num input feature maps, filter height, filter width)
        @param strides: list of size 2, stride values for the two convolutional layers
        @param n_hidden: int, number of neurons in the all-to-all connected hidden layer
        @param n_out: int, number of nodes in the output layer
        '''

        #create theano variables corresponding to input_batch (x) and output of the network (y)
        x = T.ftensor4('x')
        y = T.fmatrix('y')

        #first hidden layer is convolutional:
        self.layer_hidden_conv1 = ConvolutionalLayer(x, filter_shapes[0], input_shape, strides[0])

        #second convolutional hidden layer: the size of input depends on the size of output from first layer
        #it is defined as (num_batches, num_input_feature_maps, height_of_input_maps, width_of_input_maps)
        second_conv_input_shape = [input_shape[0], filter_shapes[0][0], self.layer_hidden_conv1.feature_map_size,
                                   self.layer_hidden_conv1.feature_map_size]
        self.layer_hidden_conv2 = ConvolutionalLayer(self.layer_hidden_conv1.output, filter_shapes[1],
                                                     image_shape=second_conv_input_shape, stride=2)

        #output from the convolutional layer is 4D, but a normal hidden layer expects 2D input. Because of the
        # all-to-all connections, the 3rd hidden layer does not care which feature map or position an input comes from
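        #  (flatten(2) keeps the first dimension and collapses the rest: (batch, maps, h, w) -> (batch, maps*h*w))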
        flattened_input = self.layer_hidden_conv2.output.flatten(2)

        #create third hidden layer
        self.layer_hidden3 = HiddenLayer(flattened_input, self.layer_hidden_conv2.fan_out, n_hidden)

        #create output layer
        self.layer_output = OutputLayer(self.layer_hidden3.output, n_hidden, n_out)

        #define the ensemble of parameters of the whole network
        self.params = self.layer_hidden_conv1.params + self.layer_hidden_conv2.params \
            + self.layer_hidden3.params + self.layer_output.params

        #discount factor
        self.gamma = 0.95

        #: define regularization terms; for some reason we only take into account the weights, not the biases
        #  linear regularization term, useful for having many weights zero
        self.l1 = abs(self.layer_hidden_conv1.W).sum() \
            + abs(self.layer_hidden_conv2.W).sum() \
            + abs(self.layer_hidden3.W).sum() \
            + abs(self.layer_output.W).sum()

        #: square regularization term, useful for forcing small weights
        self.l2_sqr = (self.layer_hidden_conv1.W ** 2).sum() \
            + (self.layer_hidden_conv2.W ** 2).sum() \
            + (self.layer_hidden3.W ** 2).sum() \
            + (self.layer_output.W ** 2).sum()

        #: define the cost function (regularization coefficients are set to 0.0, i.e. disabled here)
        cost = 0.0 * self.l1 + 0.0 * self.l2_sqr + self.layer_output.errors(y)

        #: define gradient calculation
        grads = T.grad(cost, self.params)

        #: Define how much we need to change the parameter values
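        #  (vanilla SGD: param <- param - learning_rate * d(cost)/d(param))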
        learning_rate = 0.0001
        updates = []
        for param_i, gparam_i in zip(self.params, grads):
            updates.append((param_i, param_i - learning_rate * gparam_i))

        #: we need another set of theano variables (other than x and y) to use in train and predict functions
        temp_x = T.ftensor4('temp_x')
        temp_y = T.fmatrix('temp_y')

        #: define the training operation as applying the updates calculated given temp_x and temp_y
        self.train_model = theano.function(inputs=[temp_x, temp_y],
                                           outputs=[cost, self.params[0][0]],
                                           updates=updates,
                                           givens={
                                               x: temp_x,
                                               y: temp_y})

        self.predict_rewards = theano.function(
            inputs=[temp_x],
            outputs=[self.layer_output.output],
            givens={
                x: temp_x
            })


        self.predict_rewards_and_cost = theano.function(
            inputs=[temp_x, temp_y],
            outputs=[self.layer_output.output, cost],
            givens={
                x: temp_x,
                y: temp_y
            })

    def train(self, minibatch):
        """
        Train function that transforms (prestate, action, reward, poststate) into (input, expected_output) for the neural net
        and trains the network
        @param minibatch: array of dictionaries, each dictionary contains
        one transition (prestate,action,reward,poststate)
        """

        #: we have a new, better estimation for the Q-val of the action we chose, it is the sum of the reward
        #  received on transition and the maximum of future rewards. Q-s for other actions remain the same.
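        #  In symbols: new target Q(s, a) = reward + gamma * max_a' Q(s', a').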
        for i, transition in enumerate(minibatch):
            estimated_Q = self.predict_rewards([transition['prestate']])[0][0]

            #: line prints out the output of the network, uncomment it if you want to verify that different
            #  inputs give different outputs (c.f. wiki Basic tests/Issue #10)
            #print "estimated q", estimated_Q

            estimated_Q[transition['action']] = transition['reward'] + self.gamma \
                                                * np.max(self.predict_rewards([transition['poststate']]))
            #: knowing what estimated_Q looks like, we can train the model
            cost, first_filter = self.train_model([transition['prestate']], [estimated_Q])

            #: next line prints out the weight values in the first line of the first 8x8 filter in first conv layer,
            #  uncomment it if you want to make sure the weight values do indeed change as the result of learning
            #  (c.f. wiki Basic tests/Issue #7)
            #print "first line of filter applied to first img of first layer is:  \n", first_filter[0][0]

    def predict_best_action(self, state):
        """
        Predict_best_action returns the action with the highest Q-value
        @param state: 4D array, input (game state) for which we want to know the best action
        """
        predicted_values_for_actions = self.predict_rewards(state)[0][0]
        #print "predicted best action", predicted_values_for_actions
        return np.argmax(predicted_values_for_actions)
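
#: A minimal usage sketch (the concrete shapes are illustrative assumptions,
#  mirroring the commented-out np.zeros((32, 4, 84, 84)) / np.zeros((32, 4))
#  placeholders further down in this file; they are not fixed by the class):
#
#    net = NeuralNet(input_shape=(32, 4, 84, 84),
#                    filter_shapes=[[16, 4, 8, 8], [32, 16, 4, 4]],
#                    strides=[4, 2],
#                    n_hidden=256,
#                    n_out=4)
#    minibatch = [{'prestate': pre, 'action': 0, 'reward': 1.0,
#                  'poststate': post}]  # pre/post: (4, 84, 84) float32 arrays
#    net.train(minibatch)
#    best_action = net.predict_best_action([pre])


#: Variant 2: the same network with a tunable learning rate (a Theano scalar
#  input plus a Nelder-Mead line search) and a train() vectorized over the
#  whole minibatch, with terminal-state handling.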
class NeuralNet:
    def __init__(self, input_shape, filter_shapes, strides, n_hidden, n_out):
        '''
        Initialize a NeuralNet

        @param input_shape: tuple or list of length 4, (batch size, num input feature maps,
                             image height, image width)
        @param filter_shapes: list of 2 four-element lists (one per conv layer): (number of filters,
                              num input feature maps, filter height, filter width)
        @param strides: list of size 2, stride values for the two convolutional layers
        @param n_hidden: int, number of neurons in the all-to-all connected hidden layer
        @param n_out: int, number of nodes in the output layer
        '''

        #create theano variables corresponding to input_batch (x) and output of the network (y)
        x = T.ftensor4('x')
        y = T.fmatrix('y')

        #first hidden layer is convolutional:
        self.layer_hidden_conv1 = ConvolutionalLayer(x, filter_shapes[0],
                                                     input_shape, strides[0])

        #second convolutional hidden layer: the size of input depends on the size of output from first layer
        #it is defined as (num_batches, num_input_feature_maps, height_of_input_maps, width_of_input_maps)
        second_conv_input_shape = [
            input_shape[0], filter_shapes[0][0],
            self.layer_hidden_conv1.feature_map_size,
            self.layer_hidden_conv1.feature_map_size
        ]
        self.layer_hidden_conv2 = ConvolutionalLayer(
            self.layer_hidden_conv1.output,
            filter_shapes[1],
            image_shape=second_conv_input_shape,
            stride=2)  # note: hardcodes stride=2 and ignores strides[1]

        #output from the convolutional layer is 4D, but a normal hidden layer expects 2D input. Because of the
        # all-to-all connections, the 3rd hidden layer does not care which feature map or position an input comes from
        flattened_input = self.layer_hidden_conv2.output.flatten(2)

        #create third hidden layer
        self.layer_hidden3 = HiddenLayer(flattened_input,
                                         self.layer_hidden_conv2.fan_out,
                                         n_hidden)

        #create output layer
        self.layer_output = OutputLayer(self.layer_hidden3.output, n_hidden,
                                        n_out)

        #define the ensemble of parameters of the whole network
        self.params = self.layer_hidden_conv1.params + self.layer_hidden_conv2.params \
            + self.layer_hidden3.params + self.layer_output.params

        #discount factor
        self.gamma = 0.95

        #: define regularization terms; for some reason we only take into account the weights, not the biases
        #  linear regularization term, useful for having many weights zero
        self.l1 = abs(self.layer_hidden_conv1.W).sum() \
            + abs(self.layer_hidden_conv2.W).sum() \
            + abs(self.layer_hidden3.W).sum() \
            + abs(self.layer_output.W).sum()

        #: square regularization term, useful for forcing small weights
        self.l2_sqr = (self.layer_hidden_conv1.W ** 2).sum() \
            + (self.layer_hidden_conv2.W ** 2).sum() \
            + (self.layer_hidden3.W ** 2).sum() \
            + (self.layer_output.W ** 2).sum()

        #: define the cost function (regularization coefficients are set to 0.0, i.e. disabled here)
        self.cost = 0.0 * self.l1 + 0.0 * self.l2_sqr + self.layer_output.errors(y)
        self.cost_function = theano.function([x, y], [self.cost])

        #: define gradient calculation
        self.grads = T.grad(self.cost, self.params)

        #: Define how much we need to change the parameter values
        self.learning_rate = T.scalar('lr')
        self.updates = []
        for param_i, gparam_i in zip(self.params, self.grads):
            self.updates.append(
                (param_i, param_i - self.learning_rate * gparam_i))
        self.x = x
        self.y = y

        #: we need another set of theano variables (other than x and y) to use in train and predict functions
        temp_x = T.ftensor4('temp_x')
        temp_y = T.fmatrix('temp_y')

        #: define the training operation as applying the updates calculated given temp_x and temp_y
        self.train_model = theano.function(
            inputs=[temp_x, temp_y,
                    theano.Param(self.learning_rate, default=0.00001)],
            outputs=[self.cost],
            updates=self.updates,
            givens={x: temp_x, y: temp_y},
            name='train_model')

        #: clone of the cost with the SGD updates substituted for the parameters:
        #  it evaluates the cost *after* one hypothetical update step at a given
        #  learning rate, which is what the learning-rate line search minimizes
        self.cost_clone = theano.clone(self.cost, replace=self.updates)
        self.line_function = theano.function([x, y, self.learning_rate],
                                             [self.cost_clone])

        self.predict_rewards = theano.function(
            inputs=[temp_x],
            outputs=[self.layer_output.output],
            givens={x: temp_x},
            name='predict_rewards')

        self.predict_rewards_and_cost = theano.function(
            inputs=[temp_x, temp_y],
            outputs=[self.layer_output.output, self.cost],
            givens={
                x: temp_x,
                y: temp_y
            },
            name='predict_rewards_and_cost')

    #: initial learning rate and the history of line-searched learning rates
    #  (note: these are class attributes, shared across instances)
    actual_learning_rate = 1e-5
    learning_rates = []

    def optimal_learning_rate(self, prestates, new_estimated_Q, lr):
        """
        Line-search the learning rate: minimize the post-update cost (line_function)
        with Nelder-Mead, starting from the given lr, and record the result
        (clipped from below at 1e-6) in self.learning_rates.
        """
        objective = lambda rate: self.line_function(
            np.array(prestates), new_estimated_Q, float(rate))[0]
        res = scipy.optimize.minimize(objective, lr,
                                      method='Nelder-Mead',
                                      options={'xtol': 1e-1})
        print 'optimization result'
        print res
        self.learning_rates.append(max(1e-6, float(res.x)))

    def train(self, minibatch):
        """
        Train function that transforms (prestate, action, reward, poststate) into (input, expected_output) for the neural net
        and trains the network
        @param minibatch: array of dictionaries, each dictionary contains
        one transition (prestate,action,reward,poststate)
        """
        prestates = [t['prestate'] for t in minibatch]
        initial_estimated_Q = self.predict_rewards(prestates)[0]
        new_estimated_Q = initial_estimated_Q.copy()
        poststates = [t['poststate'] for t in minibatch]
        post_eQ = [
            self.predict_rewards([s])[0] if s is not None else None
            for s in poststates
        ]
        actions = [t['action'] for t in minibatch]
        game_end_ps = [t['game_end'] for t in minibatch]
        rewards = np.array([t['reward'] for t in minibatch])
        for row, (peQ, action, reward, game_end) in enumerate(
                zip(post_eQ, actions, rewards, game_end_ps)):
            new_estimated_Q[row, action] = \
                reward + (0 if game_end else self.gamma * np.max(peQ))
        initial_cost = self.cost_function(prestates, new_estimated_Q)
        optimal_learning_rate = lambda: self.optimal_learning_rate(
            prestates, new_estimated_Q,
            self.learning_rates[-1] if self.learning_rates else self.actual_learning_rate)
        if (len(self.learning_rates) % 50) == 0:
            print 'computing optimal learning rate'
            optimal_learning_rate()
        else:
            self.learning_rates.append(self.learning_rates[-1])
        self.train_model(np.array(prestates), new_estimated_Q,
                         self.learning_rates[-1])
        final_cost = self.cost_function(prestates, new_estimated_Q)
        final_estimated_Q = self.predict_rewards(prestates)[0]
        print 'initial_cost', initial_cost, 'final_cost', final_cost
        print 'current rewards', (final_estimated_Q -
                                  final_estimated_Q.min(axis=0)).mean(axis=0)
        print 'current rewards absolute'
        for r, a, s in sorted(
                zip(rewards, actions, map(list, final_estimated_Q))):
            print r, a, s
        if final_cost > initial_cost:
            print 'overstepped; computing current optimal learning rate'
            optimal_learning_rate()
        #: drop into the debugger when the sentinel file exists (manual debugging hook)
        if os.path.exists('/var/tmp/stop'):
            import pdb
            pdb.set_trace()

    def predict_best_action(self, state):
        """
        Predict_best_action returns the action with the highest Q-value
        @param state: 4D array, input (game state) for which we want to know the best action
        """
        predicted_values_for_actions = self.predict_rewards(state)[0][0]
        return np.argmax(predicted_values_for_actions)
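
#: Variant 3: a stripped-down double-precision (dtensor4/dmatrix) version with
#  a fixed learning rate of 0.01 and a batched train() without terminal-state
#  handling.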
class NeuralNet:

    def __init__(self, input_shape, filter_shapes, strides, n_hidden, n_out):

        x = T.dtensor4('x')
        y = T.dmatrix('y')

        self.layer_hidden_conv1 = ConvolutionalLayer(x, filter_shapes[0], input_shape, strides[0])


        second_conv_input_shape = [input_shape[0], filter_shapes[0][0],
                                   self.layer_hidden_conv1.feature_map_size,
                                   self.layer_hidden_conv1.feature_map_size]
        self.layer_hidden_conv2 = ConvolutionalLayer(self.layer_hidden_conv1.output, filter_shapes[1],
                                                     image_shape=second_conv_input_shape, stride=2)

        flattened_input = self.layer_hidden_conv2.output.flatten(2)

        self.layer_hidden3 = HiddenLayer(flattened_input, self.layer_hidden_conv2.fan_out, n_hidden)
        self.layer_output = OutputLayer(self.layer_hidden3.output, n_hidden, n_out)
        self.params = self.layer_hidden_conv1.params + self.layer_hidden_conv2.params \
                    + self.layer_hidden3.params + self.layer_output.params

        self.gamma = 0.95

        self.L1 = abs(self.layer_hidden_conv1.W).sum() \
                + abs(self.layer_hidden_conv2.W).sum() \
                + abs(self.layer_hidden3.W).sum()  \
                + abs(self.layer_output.W).sum()

        # square of L2 norm ; one regularization option is to enforce
        # square of L2 norm to be small
        self.L2_sqr = (self.layer_hidden_conv1.W ** 2).sum() \
                    + (self.layer_hidden_conv2.W ** 2).sum() \
                    + (self.layer_hidden3.W ** 2).sum() \
                    + (self.layer_output.W ** 2).sum()



        cost = 0.0 * self.L1 + 0.0 * self.L2_sqr + self.layer_output.errors(y)

        grads = T.grad(cost, self.params)

        # Define how much we need to change the parameter values
        learning_rate = 0.01
        updates = []
        for param_i, gparam_i in zip(self.params, grads):
            updates.append((param_i, param_i - learning_rate * gparam_i))

        temp1 = T.dtensor4('temp1')
        temp2 = T.dmatrix('temp2')


        self.train_model = theano.function(inputs=[temp1, temp2], outputs=[cost],
            updates=updates,
            givens={
                x: temp1,
                y: temp2})

        #self.shared_q = theano.shared(np.zeros((32,4)))
        #self.shared_s = theano.shared(np.zeros((32,4,84,84)))
        #self.train_model_shared = theano.function(inputs=[], outputs=[cost],
        #    updates=updates,
        #    givens={
        #        x: self.shared_s,
        #        y: self.shared_q
        #    })


        self.predict_rewards = theano.function(
            inputs=[temp1],
            outputs=[self.layer_output.output],
            givens={
                x: temp1
            })

        self.predict_rewards_and_cost = theano.function(
            inputs=[temp1, temp2],
            outputs=[self.layer_output.output, cost],
            givens={
                x: temp1,
                y: temp2
            })



    def train(self, minibatch):
        states = []
        expected_Qs = []
        states1 = [element['prestate'] for element in minibatch]
        states2 = [element['poststate'] for element in minibatch]
        current_predicted_rewards = self.predict_rewards(states1)[0]

        predicted_future_rewards = self.predict_rewards(states2)[0]
        for i, transition in enumerate(minibatch):
            rewards = current_predicted_rewards[i]
            rewards[transition['action']] = transition['reward'] + self.gamma*np.max(predicted_future_rewards[i])
            states.append(transition['prestate'])
            expected_Qs.append(rewards)

        #self.shared_s = theano.shared(states)
        #self.shared_q = theano.shared(expected_Qs)
        #print "expected", expected_Qs[0]
        #print "expected", self.shared_q.eval()[0]
        #print self.predict_rewards_and_cost(self.shared_s.eval(),self.shared_q.eval())[0][0]

        #return self.train_model_shared()
        self.train_model(states, expected_Qs)
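
#: Variant 4: the same double-precision network as variant 3, reformatted.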
class NeuralNet:
    def __init__(self, input_shape, filter_shapes, strides, n_hidden, n_out):

        x = T.dtensor4('x')
        y = T.dmatrix('y')

        self.layer_hidden_conv1 = ConvolutionalLayer(x, filter_shapes[0],
                                                     input_shape, strides[0])

        second_conv_input_shape = [
            input_shape[0], filter_shapes[0][0],
            self.layer_hidden_conv1.feature_map_size,
            self.layer_hidden_conv1.feature_map_size
        ]
        self.layer_hidden_conv2 = ConvolutionalLayer(
            self.layer_hidden_conv1.output,
            filter_shapes[1],
            image_shape=second_conv_input_shape,
            stride=2)

        flattened_input = self.layer_hidden_conv2.output.flatten(2)

        self.layer_hidden3 = HiddenLayer(flattened_input,
                                         self.layer_hidden_conv2.fan_out,
                                         n_hidden)
        self.layer_output = OutputLayer(self.layer_hidden3.output, n_hidden,
                                        n_out)
        self.params = self.layer_hidden_conv1.params + self.layer_hidden_conv2.params \
                    + self.layer_hidden3.params + self.layer_output.params

        self.gamma = 0.95

        self.L1 = abs(self.layer_hidden_conv1.W).sum() \
                + abs(self.layer_hidden_conv2.W).sum() \
                + abs(self.layer_hidden3.W).sum()  \
                + abs(self.layer_output.W).sum()

        # square of L2 norm ; one regularization option is to enforce
        # square of L2 norm to be small
        self.L2_sqr = (self.layer_hidden_conv1.W ** 2).sum() \
                    + (self.layer_hidden_conv2.W ** 2).sum() \
                    + (self.layer_hidden3.W ** 2).sum() \
                    + (self.layer_output.W ** 2).sum()

        cost = 0.0 * self.L1 + 0.0 * self.L2_sqr + self.layer_output.errors(y)

        grads = T.grad(cost, self.params)

        # Define how much we need to change the parameter values
        learning_rate = 0.01
        updates = []
        for param_i, gparam_i in zip(self.params, grads):
            updates.append((param_i, param_i - learning_rate * gparam_i))

        temp1 = T.dtensor4('temp1')
        temp2 = T.dmatrix('temp2')

        self.train_model = theano.function(inputs=[temp1, temp2],
                                           outputs=[cost],
                                           updates=updates,
                                           givens={
                                               x: temp1,
                                               y: temp2
                                           })

        #self.shared_q = theano.shared(np.zeros((32,4)))
        #self.shared_s = theano.shared(np.zeros((32,4,84,84)))
        #self.train_model_shared = theano.function(inputs=[], outputs=[cost],
        #    updates=updates,
        #    givens={
        #        x: self.shared_s,
        #        y: self.shared_q
        #    })

        self.predict_rewards = theano.function(
            inputs=[temp1],
            outputs=[self.layer_output.output],
            givens={x: temp1})

        self.predict_rewards_and_cost = theano.function(
            inputs=[temp1, temp2],
            outputs=[self.layer_output.output, cost],
            givens={
                x: temp1,
                y: temp2
            })

    def train(self, minibatch):
        states = []
        expected_Qs = []
        states1 = [element['prestate'] for element in minibatch]
        states2 = [element['poststate'] for element in minibatch]
        current_predicted_rewards = self.predict_rewards(states1)[0]

        predicted_future_rewards = self.predict_rewards(states2)[0]
        for i, transition in enumerate(minibatch):
            rewards = current_predicted_rewards[i]
            rewards[transition['action']] = transition['reward'] + \
                self.gamma * np.max(predicted_future_rewards[i])
            states.append(transition['prestate'])
            expected_Qs.append(rewards)

        #self.shared_s = theano.shared(states)
        #self.shared_q = theano.shared(expected_Qs)
        #print "expected", expected_Qs[0]
        #print "expected", self.shared_q.eval()[0]
        #print self.predict_rewards_and_cost(self.shared_s.eval(),self.shared_q.eval())[0][0]

        #return self.train_model_shared()
        self.train_model(states, expected_Qs)
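
#: Variant 5: the float32 network of variant 1 again, with @profile hooks for
#  line-by-line profiling of train() and predict_best_action().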
class NeuralNet:
    def __init__(self, input_shape, filter_shapes, strides, n_hidden, n_out):
        '''
        Initialize a NeuralNet

        @param input_shape: tuple or list of length 4, (batch size, num input feature maps,
                             image height, image width)
        @param filter_shapes: list of 2 four-element lists (one per conv layer): (number of filters,
                              num input feature maps, filter height, filter width)
        @param strides: list of size 2, stride values for the two convolutional layers
        @param n_hidden: int, number of neurons in the all-to-all connected hidden layer
        @param n_out: int, number of nodes in the output layer
        '''

        #create theano variables corresponding to input_batch (x) and output of the network (y)
        x = T.ftensor4('x')
        y = T.fmatrix('y')

        #first hidden layer is convolutional:
        self.layer_hidden_conv1 = ConvolutionalLayer(x, filter_shapes[0],
                                                     input_shape, strides[0])

        #second convolutional hidden layer: the size of input depends on the size of output from first layer
        #it is defined as (num_batches, num_input_feature_maps, height_of_input_maps, width_of_input_maps)
        second_conv_input_shape = [
            input_shape[0], filter_shapes[0][0],
            self.layer_hidden_conv1.feature_map_size,
            self.layer_hidden_conv1.feature_map_size
        ]
        self.layer_hidden_conv2 = ConvolutionalLayer(
            self.layer_hidden_conv1.output,
            filter_shapes[1],
            image_shape=second_conv_input_shape,
            stride=2)

        #output from the convolutional layer is 4D, but a normal hidden layer expects 2D input. Because of the
        # all-to-all connections, the 3rd hidden layer does not care which feature map or position an input comes from
        flattened_input = self.layer_hidden_conv2.output.flatten(2)

        #create third hidden layer
        self.layer_hidden3 = HiddenLayer(flattened_input,
                                         self.layer_hidden_conv2.fan_out,
                                         n_hidden)

        #create output layer
        self.layer_output = OutputLayer(self.layer_hidden3.output, n_hidden,
                                        n_out)

        #define the ensemble of parameters of the whole network
        self.params = self.layer_hidden_conv1.params + self.layer_hidden_conv2.params \
            + self.layer_hidden3.params + self.layer_output.params

        #discount factor
        self.gamma = 0.95

        #: define regularization terms; for some reason we only take into account the weights, not the biases
        #  linear regularization term, useful for having many weights zero
        self.l1 = abs(self.layer_hidden_conv1.W).sum() \
            + abs(self.layer_hidden_conv2.W).sum() \
            + abs(self.layer_hidden3.W).sum() \
            + abs(self.layer_output.W).sum()

        #: square regularization term, useful for forcing small weights
        self.l2_sqr = (self.layer_hidden_conv1.W ** 2).sum() \
            + (self.layer_hidden_conv2.W ** 2).sum() \
            + (self.layer_hidden3.W ** 2).sum() \
            + (self.layer_output.W ** 2).sum()

        #: define the cost function (regularization coefficients are set to 0.0, i.e. disabled here)
        cost = 0.0 * self.l1 + 0.0 * self.l2_sqr + self.layer_output.errors(y)

        #: define gradient calculation
        grads = T.grad(cost, self.params)

        #: Define how much we need to change the parameter values
        learning_rate = 0.0001
        updates = []
        for param_i, gparam_i in zip(self.params, grads):
            updates.append((param_i, param_i - learning_rate * gparam_i))

        #: we need another set of theano variables (other than x and y) to use in train and predict functions
        temp_x = T.ftensor4('temp_x')
        temp_y = T.fmatrix('temp_y')

        #: define the training operation as applying the updates calculated given temp_x and temp_y
        self.train_model = theano.function(inputs=[temp_x, temp_y],
                                           outputs=[cost],
                                           updates=updates,
                                           givens={
                                               x: temp_x,
                                               y: temp_y
                                           })

        self.predict_rewards = theano.function(
            inputs=[temp_x],
            outputs=[self.layer_output.output],
            givens={x: temp_x})

        self.predict_rewards_and_cost = theano.function(
            inputs=[temp_x, temp_y],
            outputs=[self.layer_output.output, cost],
            givens={
                x: temp_x,
                y: temp_y
            })

    @profile  # provided at runtime by kernprof (line_profiler)
    def train(self, minibatch):
        """
        Train function that transforms (prestate, action, reward, poststate) into (input, expected_output) for the neural net
        and trains the network
        @param minibatch: array of dictionaries, each dictionary contains
        one transition (prestate,action,reward,poststate)
        """

        #: we have a new, better estimation for the Q-val of the action we chose, it is the sum of the reward
        #  received on transition and the maximum of future rewards. Q-s for other actions remain the same.
        for i, transition in enumerate(minibatch):
            estimated_Q = self.predict_rewards([transition['prestate']])[0][0]
            estimated_Q[transition['action']] = transition['reward'] + self.gamma \
                                                * np.max(self.predict_rewards([transition['poststate']]))
            #: knowing what estimated_Q looks like, we can train the model
            self.train_model([transition['prestate']], [estimated_Q])

    @profile
    def predict_best_action(self, state):
        """
        Predict_best_action returns the action with the highest Q-value
        @param state: 4D array, input (game state) for which we want to know the best action
        """
        predicted_values_for_actions = self.predict_rewards(state)[0][0]
        return np.argmax(predicted_values_for_actions)