Example No. 1
class DeepQ:
    def __init__(self, environment, inputs):
        self.environment = environment
        self.state_size = inputs
        self.nr_actions = environment.action_space.n
        self.memory = Memory(30000)
        self.discountFactor = 0.975
        self.predictionModels = []
   
    def initImaginationNetworks(self):
        for t in xrange(self.nr_actions):
            self.predictionModels.insert(t, self.createModel(self.state_size, self.state_size, [self.state_size, self.state_size, self.state_size], "relu", 0.01))

    def initRewardNetwork(self):
        self.rewardModel = self.createModel(self.state_size, 1, [self.state_size, self.state_size, self.state_size], "relu", 0.01)

    def createModel(self, inputs, outputs, hiddenLayers, activationType, learningRate):
        model = Sequential()
        if len(hiddenLayers) == 0: 
            model.add(Dense(outputs, input_shape=(inputs,), init='lecun_uniform'))
            model.add(Activation("linear"))
        else :
            model.add(Dense(hiddenLayers[0], input_shape=(inputs,), init='lecun_uniform'))
            
            if (activationType == "LeakyReLU") :
                model.add(LeakyReLU(alpha=0.01))
            else :
                model.add(Activation(activationType))
            
            for index in range(1, len(hiddenLayers)-1):
                layerSize = hiddenLayers[index]
                model.add(Dense(layerSize, init='lecun_uniform'))
                if (activationType == "LeakyReLU") :
                    model.add(LeakyReLU(alpha=0.01))
                else :
                    model.add(Activation(activationType))
            model.add(Dense(outputs, init='lecun_uniform'))
            model.add(Activation("linear"))
        optimizer = optimizers.RMSprop(lr=learningRate, rho=0.9, epsilon=1e-06)
        model.compile(loss="mse", optimizer=optimizer)
        return model

    def backupNetwork(self, model, backup):
        # copy the weights of `model` into `backup`, layer by layer
        weightMatrix = []
        for layer in model.layers:
            weights = layer.get_weights()
            weightMatrix.append(weights)
        i = 0
        for layer in backup.layers:
            weights = weightMatrix[i]
            layer.set_weights(weights)
            i += 1

    def getStatePrediction(self, state, action):
        predicted = self.predictionModels[action].predict(state.reshape(1,len(state)))
        return predicted[0]

    def getPredictedStates(self, state):
        predictedStates = []
        for a in xrange(self.nr_actions):
            predictedStates.insert(a, self.getStatePrediction(state, a))
        return predictedStates

    def getStateValuePrediction(self, state):
        predictedReward = self.rewardModel.predict(state.reshape(1,len(state)))
        return predictedReward[0][0]

    def getPredictedActionValues(self, state):
        predictedActionValues = []
        for a in xrange(self.nr_actions):
            predictedActionValues.insert(a, self.getStateValuePrediction(self.getStatePrediction(state, a)))
        return predictedActionValues

    def getMaxValue(self, array):
        return np.max(array)

    def getMaxIndex(self, array):
        return np.argmax(array)

    def getTarget(self, state, reward, isFinal):
        if isFinal:
            return reward
        else:
            predictedActionValues = self.getPredictedActionValues(state)
            # return reward + self.discountFactor * (sum(predictedActionValues)/len(predictedActionValues))
            return reward + self.discountFactor * np.max(predictedActionValues)


    def printStatePredictionTree(self, state):
        root = Tree()
        # first layer
        predicted1 = self.getPredictedStates(state)
        root.data = state
        root.left = Tree()
        root.left.data = predicted1[0]
        root.right = Tree()
        root.right.data = predicted1[1]

        # second layer
        predicted2left = self.getPredictedStates(predicted1[0])
        root.left.left = Tree()
        root.left.left.data = predicted2left[0]
        root.left.right = Tree()
        root.left.right.data = predicted2left[1]
        predicted2right = self.getPredictedStates(predicted1[1])
        root.right.left = Tree()
        root.right.left.data = predicted2right[0]
        root.right.right = Tree()
        root.right.right.data = predicted2right[1]

        print ""
        print "\t\t\t\t\t\t\t\t\t\t",root.data
        print "\t\t\t\t",root.left.data,"\t\t\t\t\t\t\t",root.right.data
        print root.left.left.data,"\t",root.left.right.data,"\t",root.right.left.data,"\t",root.right.right.data

    def printStateValueTree(self, state):
        root = Tree()
        # first layer
        predicted1 = self.getPredictedStates(state)
        root.data = state
        root.left = Tree()
        root.left.data = predicted1[0]
        root.right = Tree()
        root.right.data = predicted1[1]

        # second layer
        predicted2left = self.getPredictedStates(predicted1[0])
        root.left.left = Tree()
        root.left.left.data = predicted2left[0]
        root.left.right = Tree()
        root.left.right.data = predicted2left[1]
        predicted2right = self.getPredictedStates(predicted1[1])
        root.right.left = Tree()
        root.right.left.data = predicted2right[0]
        root.right.right = Tree()
        root.right.right.data = predicted2right[1]

        print ""
        print "\t\t\t\t\t\t\t\t\t\t",self.getStateValuePrediction(root.data)
        print "\t\t\t\t",self.getStateValuePrediction(root.left.data),"\t\t\t\t\t\t\t\t\t\t\t",self.getStateValuePrediction(root.right.data)
        print self.getStateValuePrediction(root.left.left.data),"\t\t\t\t\t",self.getStateValuePrediction(root.left.right.data),"\t\t\t\t\t",self.getStateValuePrediction(root.right.left.data),"\t\t\t\t\t",self.getStateValuePrediction(root.right.right.data)

    # select the action with the highest Q value
    def selectAction(self, state, explorationRate):
        rand = random.random()
        if rand < explorationRate :
            action = np.random.randint(0, self.nr_actions)
        else :
            action = self.getMaxIndex(self.getPredictedActionValues(state))
        return action

    def selectActionStepsForward(self, state, depth):
        # two-step lookahead: value of each imagined next state, plus the best
        # value reachable one imagined step further
        predicted1 = self.getPredictedStates(state)
        leftMax = self.getStateValuePrediction(predicted1[0])
        rightMax = self.getStateValuePrediction(predicted1[1])

        predicted2left = self.getPredictedActionValues(predicted1[0])
        leftMax = max(leftMax, np.max(predicted2left))

        predicted2right = self.getPredictedActionValues(predicted1[1])
        rightMax = max(rightMax, np.max(predicted2right))

        if rightMax > leftMax:
            return 1
        else:
            return 0


    def addMemory(self, state, action, reward, newState, isFinal):
        self.memory.addMemory(state, action, reward, newState, isFinal)

    def trainStatePredictionOnLastState(self):
        X_batch = np.empty((0,self.state_size), dtype = np.float64)
        Y_batch = np.empty((0,self.state_size), dtype = np.float64)

        lastMemory = self.memory.getLastMemory()
        isFinal = lastMemory['isFinal']
        state = lastMemory['state']
        action = lastMemory['action']
        reward = lastMemory['reward']
        newState = lastMemory['newState']

        X_batch = np.append(X_batch, [state], axis=0)
        Y_batch = np.append(Y_batch, [newState], axis=0)

        self.predictionModels[action].fit(X_batch, Y_batch, batch_size = len(X_batch), verbose = 0)


    def trainStatePreditions(self, miniBatchSize): 
        X_batches = []
        Y_batches = []
        for t in xrange(self.nr_actions):
            X_batches.append(np.empty((0,self.state_size), dtype = np.float64))
            Y_batches.append(np.empty((0,self.state_size), dtype = np.float64))
        miniBatch = self.memory.getMiniBatch(miniBatchSize)
        for sample in miniBatch:
            isFinal = sample['isFinal']
            state = sample['state']
            action = sample['action']
            reward = sample['reward']
            newState = sample['newState']

            inputValues = state.copy()
            targetValues = newState.copy()

            X_batches[action] = np.append(X_batches[action], np.array([inputValues]), axis=0)
            Y_batches[action] = np.append(Y_batches[action], np.array([targetValues]), axis=0)

        for a in xrange(self.nr_actions):
            if len(X_batches[a]) > 0:
                self.predictionModels[a].fit(X_batches[a].reshape(len(X_batches[a]), self.state_size), Y_batches[a], batch_size = len(X_batches[a]), verbose = 0)

    def trainRewardModel(self, miniBatchSize): 
        miniBatch = self.memory.getMiniBatch(miniBatchSize)
        X_batch = np.empty((0,self.state_size), dtype = np.float64)
        Y_batch = np.empty((0,1), dtype = np.float64)
        for sample in miniBatch:
            isFinal = sample['isFinal']
            state = sample['state']
            action = sample['action']
            reward = sample['reward']
            newState = sample['newState']

            inputValues = newState.copy()
            targetValue = [self.getTarget(newState, reward, isFinal)]

            X_batch = np.append(X_batch, np.array([inputValues]), axis=0)
            Y_batch = np.append(Y_batch, [targetValue], axis=0)
        self.rewardModel.fit(X_batch, Y_batch, batch_size = len(miniBatch), verbose = 0)
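
Example No. 1 learns one next-state ("imagination") model per action plus a separate state-value model, and bootstraps its training target as reward + gamma * max over actions of the value of the imagined next state. Below is a minimal, self-contained sketch of that target computation; predict_next_state and predict_state_value are toy stand-ins for the predictionModels and rewardModel networks above, not the real models.

# Sketch only: bootstrapped target r + gamma * max_a V(f_a(s'))
import numpy as np

gamma = 0.975
nr_actions = 2

def predict_next_state(state, action):
    # stand-in for predictionModels[action].predict: toy deterministic dynamics
    return state + (action - 0.5) * 0.1

def predict_state_value(state):
    # stand-in for rewardModel.predict: toy value, closer to the origin is better
    return float(-np.sum(np.abs(state)))

def bootstrapped_target(state, reward, is_final):
    if is_final:
        return reward
    imagined = [predict_next_state(state, a) for a in range(nr_actions)]
    values = [predict_state_value(s) for s in imagined]
    return reward + gamma * max(values)

print(bootstrapped_target(np.array([0.1, -0.2, 0.0, 0.3]), reward=1.0, is_final=False))
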
Example No. 2
class APLDDPGAgent(AbstractAgent):

    name = "apl_ddpg"

    def __init__(self, env, iter=200000, *args, **kwargs):
        # create the actor model
        # create the critic model
        self.env = env
        self.action_dim = sum(
            sum(1 for i in row if i) for row in self.env.action_space.sample())
        self.observation = env.reset()
        self.state_dim = self.observation.shape
        print ">>>>>>>>>>>>>>>>>>>>>state dim " + str(self.state_dim)
        self.nn_action_dim = 6  # limit ddpg network output to 3 DOF
        self.noise = OUProcess(self.nn_action_dim,
                               mu=OU_MEAN,
                               theta=OU_THETA,
                               sigma=EPSILON_RANGE[0])

    def fit(self, *args, **kwargs):

        MEM_SZ = MEM_SIZE_FCL

        sess = K.get_session()
        K.set_learning_phase(1)

        self.actor = ActorNetwork(sess,
                                  self.state_dim,
                                  self.nn_action_dim,
                                  BATCH_SIZE,
                                  TAU,
                                  LRA,
                                  convolutional=CONVOLUTIONAL,
                                  output_activation=ACTION_ACTIVATION)
        self.critic = CriticNetwork(sess,
                                    self.state_dim,
                                    self.nn_action_dim,
                                    BATCH_SIZE,
                                    TAU,
                                    LRC,
                                    convolutional=CONVOLUTIONAL)

        self.memory = Memory(MEM_SZ)

        self.actor.target_model.summary()
        self.critic.target_model.summary()

        if LOAD_WEIGHTS:
            self.actor.model.load_weights(LOAD_WEIGHTS_PREFIX +
                                          "actor_model_" +
                                          LOAD_WEIGHTS_EPISODE + ".h5")
            self.critic.model.load_weights(LOAD_WEIGHTS_PREFIX +
                                           "critic_model_" +
                                           LOAD_WEIGHTS_EPISODE + ".h5")
            self.actor.target_model.load_weights(LOAD_WEIGHTS_PREFIX +
                                                 "actor_target_model_" +
                                                 LOAD_WEIGHTS_EPISODE + ".h5")
            self.critic.target_model.load_weights(LOAD_WEIGHTS_PREFIX +
                                                  "critic_target_model_" +
                                                  LOAD_WEIGHTS_EPISODE + ".h5")
            print("Weights Loaded!")

        #====================================================
        #Initialize noise processes
        #self.noise_procs = []
        #for i in range(NUM_NOISE_PROCS):
        #    self.noise_procs.append(OUProcess(OU_MEAN, OU_THETA, OU_STD_DEV))

        #====================================================

        PRE_LEARNING_EPISODES = STARTING_EPISODE + PRE_LEARNING_EPS
        steps = STARTING_EPISODE * EPISODE_LENGTH
        start_time = time.time()
        last_ep_time = time.time()
        if MAKE_PLOT:
            reward_graph = Grapher()

        for ep in range(STARTING_EPISODE, EPISODES):

            #reset noise processes
            #for ou in self.noise_procs:
            #    ou.reset()

            self.noise.reset()

            #start time counter
            if (ep == PRE_LEARNING_EPISODES):
                start_time = time.time()

            print("Episode: " + str(ep) + "  Frames: " +
                  str(ep * EPISODE_LENGTH) + "  Uptime: " + str(
                      (time.time() - start_time) / 3600.0) +
                  " hrs    ===========")

            state = self.env.reset()

            play_only = (ep % 10 == 0)

            total_reward = 0

            if play_only or ALREADY_TRAINED:
                for step in range(TEST_EPISODE_LENGTH):

                    #print ">>>>>>>>>>>>>", state.shape
                    #img = np.array([np.subtract(img, 128)], dtype=np.float32) #zero center
                    #img = np.multiply(img, 1.0/128.0) #scale [-1,1]
                    #img = np.transpose(state, (1,2,0))

                    #img = np.array(state)
                    #img = np.transpose(img, (1,2,0))

                    #print ">>>>>>>>>>>>>", state.shape

                    state = np.reshape(state, state.shape + (1, ))

                    action, control_action = self.selectAction(
                        state, can_be_random=False, use_target=True)

                    nstate, reward, done, info = self.env.step(control_action)
                    total_reward += reward
                    state = nstate
            else:
                for step in range(EPISODE_LENGTH):

                    # ACT ==============================
                    epsilon = (float(steps) / float(EPSILON_STEPS)) * (
                        EPSILON_RANGE[1] - EPSILON_RANGE[0]) + EPSILON_RANGE[0]

                    state = np.reshape(state, state.shape + (1, ))

                    action, control_action = self.selectAction(state,
                                                               epsilon=epsilon)
                    new_state, reward, done, info = self.env.step(
                        control_action)
                    done = done or (step >= EPISODE_LENGTH)
                    self.memory.addMemory(state, action, reward, new_state,
                                          done)
                    state = new_state

                    # LEARN ============================
                    if ep > PRE_LEARNING_EPISODES:
                        batch, idxs = self.memory.getMiniBatch(BATCH_SIZE)
                        self.learnFromBatch(batch)

                    if done:
                        break
                    # CLEANUP ==========================
                    steps += 1

            #we need to consider the episodes without noise to actually tell how the system is doing
            if play_only and MAKE_PLOT:
                reward_graph.addSample(total_reward)
                reward_graph.displayPlot()

            #calculate fph on total frames
            total_frames = (ep - PRE_LEARNING_EPISODES) * EPISODE_LENGTH
            elapsed = time.time() - start_time
            fps = total_frames / elapsed
            fph = fps * 3600.0

            #re-calculate fps on this episode, so it updates quickly
            fps = EPISODE_LENGTH / (time.time() - last_ep_time)
            last_ep_time = time.time()
            print("fps: " + str(fps) + "  fph: " + str(fph) + "\n")

            #save plot and weights
            if (ep > 0 and ep % EPISODE_SAVE_FREQUENCY
                    == 0) and not ALREADY_TRAINED:

                #plot
                if MAKE_PLOT:
                    reward_graph.savePlot(SAVE_WEIGHTS_PREFIX + "graph_" +
                                          str(ep) + ".jpg")

                #weights
                self.actor.model.save_weights(SAVE_WEIGHTS_PREFIX +
                                              "actor_model_" + str(ep) + ".h5",
                                              overwrite=True)
                self.actor.target_model.save_weights(
                    SAVE_WEIGHTS_PREFIX + "actor_target_model_" + str(ep) +
                    ".h5",
                    overwrite=True)
                self.critic.model.save_weights(
                    SAVE_WEIGHTS_PREFIX + "critic_model_" + str(ep) + ".h5",
                    overwrite=True)
                self.critic.target_model.save_weights(
                    SAVE_WEIGHTS_PREFIX + "critic_target_model_" + str(ep) +
                    ".h5",
                    overwrite=True)

                #network structures (although I don't think I ever actually use these)
                with open(
                        SAVE_WEIGHTS_PREFIX + "actor_model_" + str(ep) +
                        ".json", "w") as outfile:
                    json.dump(self.actor.model.to_json(), outfile)
                with open(
                        SAVE_WEIGHTS_PREFIX + "actor_target_model_" + str(ep) +
                        ".json", "w") as outfile:
                    json.dump(self.actor.target_model.to_json(), outfile)
                with open(
                        SAVE_WEIGHTS_PREFIX + "critic_model_" + str(ep) +
                        ".json", "w") as outfile:
                    json.dump(self.critic.model.to_json(), outfile)
                with open(
                        SAVE_WEIGHTS_PREFIX + "critic_target_model_" +
                        str(ep) + ".json", "w") as outfile:
                    json.dump(self.critic.target_model.to_json(), outfile)

    def learnFromBatch(self, miniBatch):

        dones = np.asarray([sample['isFinal'] for sample in miniBatch])
        states = np.asarray([sample['state'] for sample in miniBatch])
        actions = np.asarray([sample['action'] for sample in miniBatch])
        new_states = np.asarray([sample['newState'] for sample in miniBatch])
        Y_batch = np.asarray([sample['reward'] for sample in miniBatch])

        new_states = np.reshape(new_states, new_states.shape + (1, ))

        target_q_values = self.critic.target_model.predict(
            [new_states,
             self.actor.target_model.predict(new_states)])

        for i in range(len(miniBatch)):
            if not dones[i]:
                Y_batch[i] = Y_batch[i] + GAMMA * target_q_values[i]

        self.critic.model.train_on_batch([states, actions], Y_batch)

        #additional operations to train actor
        temp_actions = self.actor.model.predict(states)
        grads = self.critic.gradients(states, temp_actions)
        self.actor.train(states, grads)

        #update target networks
        self.actor.target_train()
        self.critic.target_train()

    ''' This is wrong I think
    def OU(x, mu, theta, sigma):
        return theta * (mu - x) + sigma * np.random.randn(1)
    '''

    def clip(self, x, minx, maxx):
        return max(minx, min(maxx, x))

    def selectAction(self,
                     state,
                     can_be_random=True,
                     use_target=False,
                     epsilon=1.0,
                     permutation_num=0):
        state = np.array([state])  #add dimension to make a "batch" of 1

        if use_target:
            actions = self.actor.target_model.predict(state)
        else:
            actions = self.actor.model.predict(state)

        actions = np.squeeze(actions)

        #print control_actions

        #print("+++++++++++")
        #print(actions)

        if can_be_random:
            self.noise.sigma = epsilon
            noise = self.noise.noise()
            #print noise

            i = 0
            for idx, a in enumerate(actions):
                actions[i] = actions[i] + noise[i]
                actions[i] = self.clip(
                    actions[i], -3.14,
                    3.14)  #need to assign to actions[i], not just a.
                i += 1

            #get noise
            #noise = []
            #iterate over all noise procs for non-coop, or a single agent's procs for co-op
            #for n in range(permutation_num*ACTIONS_PER_AGENT, permutation_num*ACTIONS_PER_AGENT + self.action_dim):
            #    ou = self.noise_procs[n]
            #    noise.append(ou.step())

#            for idx, a in enumerate(actions):
#                ou = self.noise_procs[0]
#                noise = ou.step()
#                a = a + epsilon*noise
#                #print epsilon * noise
#                actions[i] = self.clip(a, -3.14, 3.14) #need to assign to actions[i], not just a.
#                i += 1
#
#print(actions)

        #fill in zeros for all non-learned outputs
        control_actions = np.pad(actions, (0, self.action_dim - len(actions)),
                                 'constant')
        #print actions
        #print control_actions

        return actions, control_actions

    #Constructs an image from state vector
    def constructImageRepresentation(self, state):
        img = np.empty([IMAGE_SIDE_LENGTH, IMAGE_SIDE_LENGTH], dtype=np.uint8)
        img.fill(128)

        color = 255
        delta_color = int(math.floor(128 / NUM_TARGETS))
        for j in range(NUM_TARGETS):
            tar = [state[2 * j], state[2 * j + 1]]
            cv2.circle(img, (int(
                tar[0] * IMAGE_SIDE_LENGTH), int(tar[1] * IMAGE_SIDE_LENGTH)),
                       5, 0, -1)
            cv2.circle(img, (int(
                tar[0] * IMAGE_SIDE_LENGTH), int(tar[1] * IMAGE_SIDE_LENGTH)),
                       4, color, -1)
            color -= delta_color

        color = 0
        for j in range(NUM_AGENTS):
            offset = 2 * NUM_TARGETS
            agent = [state[offset + 2 * j], state[offset + 2 * j + 1]]
            #draw blank agent, no thrust display
            cv2.rectangle(img, (int(agent[0] * IMAGE_SIDE_LENGTH) - 4,
                                int(agent[1] * IMAGE_SIDE_LENGTH) - 1),
                          (int(agent[0] * IMAGE_SIDE_LENGTH) + 4,
                           int(agent[1] * IMAGE_SIDE_LENGTH) + 1), color, -1)
            cv2.rectangle(img, (int(agent[0] * IMAGE_SIDE_LENGTH) - 1,
                                int(agent[1] * IMAGE_SIDE_LENGTH) - 4),
                          (int(agent[0] * IMAGE_SIDE_LENGTH) + 1,
                           int(agent[1] * IMAGE_SIDE_LENGTH) + 4), color, -1)
            #first agent ia 0 since we control it, others are same color
            color = 64
        '''
        cv2.namedWindow('perm_image',cv2.WINDOW_NORMAL)
        cv2.resizeWindow('perm_image', 600,600)
        cv2.imshow('perm_image', img)
        cv2.waitKey(1)
        '''

        img = np.array([np.subtract(img, 128)], dtype=np.float32)  #zero center
        img = np.multiply(img, 1.0 / 128.0)  #scale [-1,1]
        img = np.transpose(img, (1, 2, 0))

        return img

    #for co-op case, get an arrangement of the state vector for each agent.
    def getStatePermutations(self, state):
        perms = []
        for i in range(NUM_AGENTS):

            if CONVOLUTIONAL and not DRAW_STATE:
                perms.append(state)
            else:
                pstate = []

                #copy over target data
                for j in range(NUM_TARGETS * 2):
                    pstate.append(state[j])

                #copy agent data, rotated
                for j in range(NUM_AGENTS * 2):
                    rot_j = (j +
                             (i * 2)) % (NUM_AGENTS * 2) + (NUM_TARGETS * 2)
                    pstate.append(state[rot_j])

                if DRAW_STATE:
                    perms.append(self.constructImageRepresentation(pstate))
                else:
                    perms.append(np.asarray(pstate, dtype=np.float32))

        return perms
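
Example No. 2 draws exploration noise from an OUProcess that is referenced but never shown. The sketch below assumes an interface similar to the one used above (mu, theta, sigma, reset(), noise()); the real OUProcess implementation may differ.

# Minimal Ornstein-Uhlenbeck noise sketch (assumed interface, not the actual OUProcess)
import numpy as np

class SimpleOUNoise:
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.size, self.mu, self.theta, self.sigma = size, mu, theta, sigma
        self.reset()

    def reset(self):
        # start every episode from the long-run mean
        self.state = np.ones(self.size) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): temporally correlated exploration noise
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.size)
        self.state = self.state + dx
        return self.state

ou = SimpleOUNoise(size=6, sigma=0.3)
print(ou.noise())
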
Example No. 3
class dqn:
    def __init__(self, learning_rate, minibatch_size, gamma, state_space,
                 action_space):
        self.state_space = state_space
        self.action_space = action_space
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.minibatch_size = minibatch_size
        self.replay_memory_size = 10000
        self.experience_buffer = Memory(self.replay_memory_size)

        self.init_network()

        tf.summary.FileWriter("logs/", self.sess.graph)

        self.sess.run(tf.global_variables_initializer())

        self.current_loss = 0.0

    def init_network(self):
        tf.reset_default_graph()
        '''
            Create eval q networks
        '''
        # inputs is the observation
        self.s = tf.placeholder(dtype=tf.float32,
                                shape=[None, self.state_space],
                                name="observation")

        # fully connected layer 1
        w_fc1 = tf.Variable(
            tf.truncated_normal([self.state_space, 1024], stddev=0.01))
        b_fc1 = tf.Variable(tf.zeros([1024]))
        layer1 = tf.nn.relu(tf.matmul(self.s, w_fc1) + b_fc1)

        # fully connected layer 2 (output layer)
        w_out = tf.Variable(
            tf.truncated_normal([1024, self.action_space], stddev=0.01))
        b_out = tf.Variable(tf.zeros([self.action_space]))
        self.Qout = tf.matmul(layer1, w_out) + b_out

        # placeholder for the target Q-values (built from the next state in replay_experience)
        self.Qout_next = tf.placeholder(tf.float32, [None, self.action_space])
        '''
            Loss Function
        '''

        self.loss = tf.reduce_mean(tf.square(self.Qout_next - self.Qout))

        optimizer = tf.train.RMSPropOptimizer(self.learning_rate)

        self.trainer = optimizer.minimize(self.loss)

        self.saver = tf.train.Saver()

        self.sess = tf.Session()

    def get_Q_values(self, state):
        return self.sess.run(self.Qout, feed_dict={self.s: [state]})[0]

    def store_experience(self, state, action, reward, nextState, done):
        self.experience_buffer.addMemory(state, action, reward, nextState,
                                         done)

    def replay_experience(self):
        if self.experience_buffer.getCurrentSize() > self.minibatch_size:
            state__miniBatch = []
            qout_miniBatch = []

            size = min(self.experience_buffer.getCurrentSize(),
                       self.minibatch_size)
            miniBatch = self.experience_buffer.getMiniBatch(size)

            for sample in miniBatch:
                done = sample['isFinal']
                state = sample['state']
                action = sample['action']
                reward = sample['reward']
                newState = sample['newState']

                qValues = self.get_Q_values(state)

                if done:
                    qValues[action] = reward
                else:
                    qValues[action] = reward + self.gamma * np.max(
                        self.get_Q_values(newState))

                state__miniBatch.append(state)
                qout_miniBatch.append(qValues)

            #train
            self.sess.run(self.trainer,
                          feed_dict={
                              self.s: state__miniBatch,
                              self.Qout_next: qout_miniBatch
                          })

            self.current_loss = self.sess.run(self.loss,
                                              feed_dict={
                                                  self.s: state__miniBatch,
                                                  self.Qout_next:
                                                  qout_miniBatch
                                              })

    def load_model(self, model_path=None):
        if model_path:
            # load from saved file
            self.saver.restore(self.sess, model_path)
        else:
            # load from checkpoint
            checkpoint = tf.train.get_checkpoint_state(
                '/home/kin/python/q_learning/checkpoint')
            if checkpoint and checkpoint.model_checkpoint_path:
                self.saver.restore(self.sess, checkpoint.model_checkpoint_path)

    def save_model(self):
        self.saver.save(self.sess,
                        '/home/kin/python/q_learning/saved_model.ckpt')
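
In replay_experience above, the network is regressed toward a target vector that copies the current Q-values and overwrites only the taken action's entry with the TD target reward + gamma * max(Q(s')). A numpy-only sketch of that construction, with q_of as a hypothetical stand-in for get_Q_values:

# Sketch of the per-sample target used in replay_experience (q_of is a stand-in)
import numpy as np

gamma = 0.99

def q_of(state):
    # stand-in for self.get_Q_values(state): returns one Q-value per action
    return np.array([0.2, -0.1, 0.4])

def build_target(state, action, reward, next_state, done):
    target = q_of(state).copy()
    if done:
        target[action] = reward
    else:
        target[action] = reward + gamma * np.max(q_of(next_state))
    return target

print(build_target(state=None, action=1, reward=1.0, next_state=None, done=False))
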
Example No. 4
class DeepQ:
    def __init__(self, inputs, outputs, memorySize, discountFactor, learningRate, learnStart):
        self.input_size = inputs
        self.output_size = outputs
        self.memory = Memory(memorySize)
        self.discountFactor = discountFactor
        self.learnStart = learnStart
        self.learningRate = learningRate
    
    def initNetwork(self, hiddenLayers):
        model = self.createModel(self.input_size, self.output_size, hiddenLayers, "relu", self.learningRate)
        self.model = model
        targetModel = self.createModel(self.input_size, self.output_size, hiddenLayers, "relu", self.learningRate)
        self.targetModel = targetModel

    def createModel(self, inputs, outputs, hiddenLayers, activationType, learningRate):
        bias = True
        dropout = 0
        regularizationFactor = 0.01
        model = tf.keras.models.Sequential()
        if len(hiddenLayers) == 0:
            model.add(tf.keras.layers.Dense(self.output_size, input_shape =(self.input_size,), kernel_initializer = 'lecun_uniform', use_bias = bias))
            model.add(tf.keras.layers.Activation("linear"))
        else:
            model.add(tf.keras.layers.Dense(hiddenLayers[0], input_shape=(self.input_size,), kernel_initializer = 'lecun_uniform', kernel_regularizer = tf.keras.regularizers.l2(l = regularizationFactor), use_bias = bias))
            if (activationType == "LeakyReLU"):
                model.add(tf.keras.layers.LeakyReLU(alpha = 0.01))
            else:
                model.add(tf.keras.layers.Activation(activationType))

            for index in range(1, len(hiddenLayers)-1):
                layerSize = hiddenLayers[index]
                model.add(tf.keras.layers.Dense(layerSize, kernel_initializer= 'lecun_uniform', kernel_regularizer = tf.keras.regularizers.l2(l = regularizationFactor), use_bias = bias))
                if dropout > 0:
                    model.add(tf.keras.layers.Dropout(dropout))
                if (activationType == "LeakyReLU"):
                    model.add(tf.keras.layers.LeakyReLU(alpha = 0.01))
                else:
                    model.add(tf.keras.layers.Activation(activationType))
            model.add(tf.keras.layers.Dense(self.output_size, kernel_initializer='lecun_uniform', use_bias=bias))
            model.add(tf.keras.layers.Activation("linear"))
        optimizer = tf.keras.optimizers.RMSprop(lr=learningRate, rho = 0.9, epsilon = 1e-06)
        model.compile(loss="mse", optimizer = optimizer)
        return model

    def printNetwork(self):
        i = 0
        for layer in self.model.layers:
            weight = layer.get_weights()
            print("layer ",i," : ",weight)
            i+=1
    
    # copy current network to backup (target) model
    def backupNetwork(self, model, backup):
        weightMatrix = []
        for layer in model.layers:
            weights = layer.get_weights()
            weightMatrix.append(weights)
        i = 0
        for layer in backup.layers:
            weights = weightMatrix[i]
            layer.set_weights(weights)
            i+=1
    
    def updateTargetNetwork(self):
        self.backupNetwork(self.model, self.targetModel)
    
    def getQValues(self, state):
        predicted = self.model.predict(state.reshape(1, len(state)))
        return predicted[0]
    
    def getTargetQValues(self, state):
        predicted = self.targetModel.predict(state.reshape(1, len(state)))
        return predicted[0]
    
    def getMaxQ(self, qValues):
        return np.max(qValues)
    
    def getMaxIndex(self, qValues):
        return np.argmax(qValues)
    
    # calculate the target function
    def calculateTarget(self, qValuesNewState, reward, isFinal):
        if isFinal:
            return reward
        else:
            return reward + self.discountFactor  * self.getMaxQ(qValuesNewState)
        
    #select the action with the highest Q value
    def selectAction(self, qValues, explorationRate):
        rand = random.random()
        if rand  < explorationRate: 
            action = np.random.randint(0 , self.output_size)
        else:
            action = self.getMaxIndex(qValues)
        return action

    def selectionActionByProbability(self, qValues, bias):
        qValueSum = 0 
        shiftBy = 0 
        for value in qValues:
            if value + shiftBy < 0:
                shiftBy = - (value + shiftBy)
            shiftBy += 1e-06
        
        for value in qValues:
            qValueSum += (value + shiftBy) ** bias
    
        probabilitySum = 0
        qValueProbabilities = []
        for value in qValues:
            probability = ((value + shiftBy) ** bias) / float(qValueSum)
            qValueProbabilities.append(probability + probabilitySum)
            probabilitySum += probability
        qValueProbabilities[len(qValueProbabilities) - 1] = 1 
        rand = random.random()
        i = 0 
        for value in qValueProbabilities:
            if(rand<=value):
                return i 
            i+=1
        
    def addMemory(self, state, action, reward, newState, isFinal):
        self.memory.addMemory(state, action, reward, newState, isFinal)
    
    def learnOnLastState(self):
        if self.memory.getCurrentSize() >=1:
            return self.memory.getMemory(self.memory.getCurrentSize() - 1)
            
    def learnOnMiniBatch(self, miniBatchSize):
        if self.memory.getCurrentSize() > self.learnStart :
            miniBatch = self.memory.getMiniBatch(miniBatchSize)
            X_batch = np.empty((0,self.input_size), dtype = np.float64)
            Y_batch = np.empty((0,self.output_size), dtype = np.float64)
            for sample in miniBatch:
                isFinal = sample['isFinal']
                state = sample['state']
                action = sample['action']
                reward = sample['reward']
                newState = sample['newState']

                qValues = self.getQValues(state)
                qValuesNewState = self.getTargetQValues(newState)
                targetValue = self.calculateTarget(qValuesNewState, reward, isFinal)

                X_batch = np.append(X_batch, np.array([state.copy()]), axis=0)
                Y_sample = qValues.copy()
                Y_sample[action] = targetValue
                Y_batch = np.append(Y_batch, np.array([Y_sample]), axis=0)
                if isFinal:
                    X_batch = np.append(X_batch, np.array([newState.copy()]), axis=0)
                    Y_batch = np.append(Y_batch, np.array([[reward]*self.output_size]), axis=0)
            self.model.fit(X_batch, Y_batch, batch_size = len(miniBatch), epochs=1, verbose = 0)

    def saveModel(self):
        model_json = self.model.to_json()
        with open("model.json","w") as json_file: 
            json_file.write(model_json)
        self.model.save_weights("model.h5")
        
        target_model_json = self.targetModel.to_json()
        with open("target_model.json","w") as json_file: 
            json_file.write(target_model_json)
        self.model.save_weights("target_model.h5")

    def loadModel(self):
        json_file = open("model.json","r")
        loaded_model_json = json_file.read()
        json_file.close()
        loaded_model = tf.keras.models.model_from_json(loaded_model_json)
        loaded_model.load_weights("model.h5")
        self.model = loaded_model
        json_file = open("target_model.json","r")
        loaded_model_json = json_file.read()
        json_file.close()
        loaded_model = tf.keras.models.model_from_json(loaded_model_json)
        loaded_model.load_weights("target_model.h5")
        self.targetModel = loaded_model
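
selectionActionByProbability above is roulette-wheel selection: shift the Q-values so they are all positive, raise them to a bias power, normalize, and sample an action from the resulting distribution. A compact numpy sketch of the same idea (not the exact cumulative-sum loop used in the class):

# Roulette-wheel action selection over Q-values (illustrative sketch)
import numpy as np

def select_action_by_probability(q_values, bias):
    q = np.asarray(q_values, dtype=np.float64)
    shifted = q - q.min() + 1e-6          # make every value strictly positive
    weights = shifted ** bias             # larger bias -> greedier selection
    probs = weights / weights.sum()
    return int(np.random.choice(len(q), p=probs))

print(select_action_by_probability([0.5, -0.2, 0.1], bias=2.0))
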
Example No. 5
class DeepQ:
    def __init__(self, outputs, memorySize, discountFactor, learningRate,
                 learnStart):
        """
        Parameters:
            - outputs: output size
            - memorySize: size of the memory that will store each state
            - discountFactor: the discount factor (gamma)
            - learningRate: learning rate
            - learnStart: number of steps to collect before learning starts (e.g. 128)
        """
        self.output_size = outputs
        self.memory = Memory(memorySize)
        self.discountFactor = discountFactor
        self.learnStart = learnStart
        self.learningRate = learningRate

        self.model = self.createModel(True)
        self.targetModel = self.createModel()  #To add stability to training

    def createModel(self, record=False):
        model = Sequential()
        #normalize the image to avoid saturation and make the gradients work better
        model.add(
            Lambda(lambda x: x / 127.5 - 1.0, input_shape=utils.INPUT_SHAPE)
        )  #x / 127.5 - 1.0 maps [0, 255] pixels to [-1, 1] (from the Udacity self-driving-car course)
        #32 8x8 convolution kernels with 4x4 stride and activation function ReLU
        model.add(
            Conv2D(32,
                   8,
                   strides=4,
                   activation="relu",
                   kernel_initializer='lecun_uniform'))
        model.add(
            Conv2D(64,
                   4,
                   strides=2,
                   activation="relu",
                   kernel_initializer='lecun_uniform'))
        model.add(
            Conv2D(64,
                   3,
                   strides=1,
                   activation="relu",
                   kernel_initializer='lecun_uniform'))
        model.add(Flatten())
        model.add(
            Dense(512, activation="relu", kernel_initializer='lecun_uniform'))
        model.add(Dense(
            3, activation="linear"))  # 3 outputs for the 3 different actions

        optimizer = optimizers.RMSprop(lr=self.learningRate,
                                       rho=0.9,
                                       epsilon=1e-06)  # RMSprop settings from deepq.py (unused below)
        model.compile(loss="mean_squared_error",
                      optimizer=optimizers.Adam(self.learningRate))
        #Alternative: model.compile(loss="mean_squared_error", optimizer=optimizer)

        if record:
            timeStamp = time.time()
            path = os.path.dirname(
                os.path.realpath(__file__))  #get python file path
            self.tensorboard = TensorBoard(
                log_dir="{}/logs/{}".format(path, timeStamp))
            print("Run `tensorboard --logdir={}/logs/{}` to see CNN status".
                  format(path, timeStamp))
            model.summary()

        return model

    #In order to have a stable training session we must back up a target network so that we can use it to provide a consistent policy at training time
    def backupNetwork(self, model, backup):
        weightMatrix = []
        for layer in model.layers:
            weights = layer.get_weights()
            weightMatrix.append(weights)
        i = 0
        for layer in backup.layers:
            weights = weightMatrix[i]
            layer.set_weights(weights)
            i += 1

    def updateTargetNetwork(self):
        self.backupNetwork(self.model, self.targetModel)
        print("Taget model updated")

    #train the network to approximate the Bellman target `r + gamma * max_a' Q(s', a')`
    #use miniBatch / Experience Replay
    def learn(self, size):
        #X = numpy list of arrays of input data
        #Y = numpy list of arrays of target data
        # Batch size = samples per gradient update
        # Do not learn until we've got self.learnStart samples
        if self.memory.getCurrentSize() > self.learnStart:
            # learn in batches of 128
            batch = self.memory.getMiniBatch(size)
            X_batch = np.empty((0, utils.INPUT_SHAPE[0], utils.INPUT_SHAPE[1],
                                utils.INPUT_SHAPE[2]),
                               dtype=np.float64)
            Y_batch = np.empty((0, self.output_size), dtype=np.float64)
            for sample in batch:
                state = sample['state']
                qValues = self.getQValues(state)  #model predicted Q(s,a)
                qTargetValues = self.getTargetQValues(
                    sample['newState'])  #model predicted Q'(s',a')
                targetValue = self.calculateTarget(
                    qTargetValues, sample['reward'],
                    sample['isFinal'])  #est. bellman equation

                X_batch = np.append(
                    X_batch, np.array(state.copy()), axis=0
                )  #input states with corresponding actions and rewards for training
                # We are teaching the network to predict to the discounted reward of taking the optimal action at state s
                Y_sample = qValues.copy()
                Y_sample[0][sample['action']] = targetValue
                # Every action should be Q(s,a) except for the action taken so that the error on the other action stays 0
                Y_batch = np.append(Y_batch, np.array(Y_sample), axis=0)
                # X provides the state to feed into the network to calc error based on Y

                #Not sure why this exists: it re-appends the terminal transition with the raw reward
                if sample["isFinal"]:
                    X_batch = np.append(X_batch,
                                        np.array(sample['newState'].copy()),
                                        axis=0)  #Why use new state?
                    #instead of appending discounted reward from bellman equation use final reward
                    Y_batch = np.append(Y_batch,
                                        np.full((1, 3), sample['reward']),
                                        axis=0)  # 3 = number of output neurons

            history = self.model.fit(X_batch,
                                     Y_batch,
                                     batch_size=len(batch),
                                     epochs=1,
                                     verbose=0,
                                     callbacks=[self.tensorboard])
            print("Loss: " + str(history.history['loss']))
            #monitor progress via tensorboard --logdir=logs/hal

    # predict Q values for all the actions
    def getQValues(self, state):
        predicted = self.model.predict(state)
        return predicted

    def getTargetQValues(self, state):
        predicted = self.targetModel.predict(state)
        return predicted

    def saveModel(self, filepath):
        self.model.save(filepath)

    def loadModel(self, filepath):
        self.model = load_model(filepath)

    def loadWeights(self, filepath):
        self.model.set_weights(load_model(filepath).get_weights())

    def getMaxQ(self, qValues):
        return np.max(qValues)

    def getMaxIndex(self, qValues):
        return np.argmax(qValues)

    # calculate the target function
    def calculateTarget(self, qValuesNewState, reward, isFinal):
        """
        Target = reward(s,a) + gamma * max(Q(s'))
        Bellman equation
        """
        if isFinal:
            return reward
        else:
            return reward + self.discountFactor * self.getMaxQ(qValuesNewState)
            #`self.discountFactor * self.getMaxQ(qValuesNewState)` is an approximation but will improve as the network is trained

    # select the action with the highest Q value
    def selectAction(self, qValues, explorationRate):  #rate from 0-1
        rand = random.random()
        if rand < explorationRate:
            action = np.random.randint(0, self.output_size)
        else:
            action = self.getMaxIndex(qValues)
        return action

    def addMemory(self, state, action, reward, newState, isFinal):
        self.memory.addMemory(state, action, reward, newState, isFinal)
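
Example No. 5 normalizes input frames with the Lambda layer x / 127.5 - 1.0. A quick standalone check of what that mapping does to pixel values:

# x / 127.5 - 1.0 maps uint8 pixels in [0, 255] to floats in [-1, 1]
import numpy as np

pixels = np.array([0.0, 64.0, 127.5, 255.0], dtype=np.float32)
print(pixels / 127.5 - 1.0)   # -> [-1.0, -0.498..., 0.0, 1.0]
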
Example No. 6
class DeepQ:
    def __init__(self, environment, inputs):
        self.input_size = inputs
        self.output_size = environment.action_space.n
        self.memory = Memory(2000)
        self.discountFactor = 0.975
        self.learnStart = 36
        self.models = [None] * 5
   
    def initNetwork(self, hiddenLayers):
        model = self.createModel(self.input_size, self.output_size, hiddenLayers, "relu", 0.01)
        self.models[0] = model

        model2 = self.createModel(self.input_size, self.output_size, hiddenLayers, "relu", 0.01)
        self.models[1] = model2

        model3 = self.createModel(self.input_size, self.output_size, hiddenLayers, "relu", 0.01)
        self.models[2] = model3


    def createModel(self, inputs, outputs, hiddenLayers, activationType, learningRate):
        model = Sequential()
        if len(hiddenLayers) == 0: 
            model.add(Dense(self.output_size, input_shape=(self.input_size,), init='lecun_uniform'))
            model.add(Activation("linear"))
        else :
            model.add(Dense(hiddenLayers[0], input_shape=(self.input_size,), init='lecun_uniform'))
            
            if (activationType == "LeakyReLU") :
                model.add(LeakyReLU(alpha=0.01))
            else :
                model.add(Activation(activationType))
            
            for index in range(1, len(hiddenLayers)-1):
                layerSize = hiddenLayers[index]
                model.add(Dense(layerSize, init='lecun_uniform'))
                if (activationType == "LeakyReLU") :
                    model.add(LeakyReLU(alpha=0.01))
                else :
                    model.add(Activation(activationType))
            model.add(Dense(self.output_size, init='lecun_uniform'))
            model.add(Activation("linear"))
        optimizer = optimizers.RMSprop(lr=learningRate, rho=0.9, epsilon=1e-06)
        model.compile(loss="mse", optimizer=optimizer)
        return model

    def backupNetwork(self, model, backup):
        # copy the weights of `model` into `backup`, layer by layer
        weightMatrix = []
        for layer in model.layers:
            weights = layer.get_weights()
            weightMatrix.append(weights)
        i = 0
        for layer in backup.layers:
            weights = weightMatrix[i]
            layer.set_weights(weights)
            i += 1

    # predict Q values for all the actions
    def getQValues(self, state, modelNr=0):
        predicted = self.models[modelNr].predict(state.reshape(1,len(state)))
        return predicted[0]

    def getMaxQ(self, qValues=None, state=None):
        if (qValues is None):
            qValues = self.getQValues(state)
        return np.max(qValues)

    def getMaxIndex(self, qValues=None, state=None):
        if (qValues is None):
            qValues = self.getQValues(state)
        return np.argmax(qValues)

    # calculate the target function
    def calculateTarget(self, qValuesNewState, reward, isFinal):
        if isFinal:
            return reward
        else : 
            return reward + self.discountFactor * self.getMaxQ(qValuesNewState)

    # select the action with the highest Q value
    def selectAction(self, qValues, explorationRate):
        rand = random.random()
        if rand < explorationRate :
            action = np.random.randint(0, self.output_size)
        else :
            action = self.getMaxIndex(qValues)
        return action

    def selectActionMostConfident(self, qValues, qValues2, explorationRate):
        rand = random.random()
        if rand < explorationRate :
            action = np.random.randint(0, self.output_size)
        else :
            maxQ1 = self.getMaxQ(qValues)
            maxQ2 = self.getMaxQ(qValues2)
            if (abs(maxQ1) > abs(maxQ2)):
                action = self.getMaxIndex(qValues)
            else :
                action = self.getMaxIndex(qValues2)
        return action

    def selectActionAverage(self, qValues, qValues2, explorationRate):
        rand = random.random()
        if rand < explorationRate :
            action = np.random.randint(0, self.output_size)
        else :
            avgQValues = []
            for i in range(0, len(qValues)):
                value1 = qValues[i]
                value2 = qValues2[i]
                avg = (value1 + value2) / 2.0
                avgQValues.append(avg)
            action = self.getMaxIndex(avgQValues)
        return action

    def selectActionAdded(self, qValues, qValues2, explorationRate):
        rand = random.random()
        if rand < explorationRate :
            action = np.random.randint(0, self.output_size)
        else :
            addedQValues = qValues + qValues2
            action = self.getMaxIndex(addedQValues)
        return action

    def selectActionMostPreferred(self, qValues, qValues2, qValues3, explorationRate):
        rand = random.random()
        if rand < explorationRate :
            action = np.random.randint(0, self.output_size)
        else :
            action1 = self.getMaxIndex(qValues)
            action2 = self.getMaxIndex(qValues2)
            action3 = self.getMaxIndex(qValues3)
            actionsChosen = [0, 0]
            actionsChosen[action1] += 1
            actionsChosen[action2] += 1
            actionsChosen[action3] += 1
            if (actionsChosen[0] > actionsChosen[1]):
                action = 0
            else :
                action = 1
        return action

    def selectActionByProbability(self, qValues, bias):
        qValueSum = 0
        shiftBy = 0
        for value in qValues:
            if value + shiftBy < 0:
                shiftBy = - (value + shiftBy)
        shiftBy += 1e-06

        for value in qValues:
            qValueSum += (value + shiftBy) ** bias

        probabilitySum = 0
        qValueProbabilities = []
        for value in qValues:
            probability = ((value + shiftBy) ** bias) / float(qValueSum)
            qValueProbabilities.append(probability + probabilitySum)
            probabilitySum += probability
        qValueProbabilities[len(qValueProbabilities) - 1] = 1

        rand = random.random()
        i = 0
        for value in qValueProbabilities:
            if (rand <= value):
                return i
            i += 1

    def addMemory(self, state, action, reward, newState, isFinal):
        self.memory.addMemory(state, action, reward, newState, isFinal)

    def learnOnLastState(self):
        if self.memory.getCurrentSize() >= 1:
            return self.memory.getMemory(self.memory.getCurrentSize() - 1)

    def learnOnMiniBatch(self, miniBatchSize, modelNr=0): 
        if self.memory.getCurrentSize() > self.learnStart :
            miniBatch = self.memory.getMiniBatch(miniBatchSize)
            X_batch = np.empty((0,self.input_size), dtype = np.float64)
            Y_batch = np.empty((0,self.output_size), dtype = np.float64)
            for sample in miniBatch:
                isFinal = sample['isFinal']
                state = sample['state']
                action = sample['action']
                reward = sample['reward']
                newState = sample['newState']

                qValues = self.getQValues(state)
                qValuesNewState = self.getQValues(newState)
                targetValue = self.calculateTarget(qValuesNewState, reward, isFinal)

                X_batch = np.append(X_batch, np.array([state]), axis=0)
                Y_sample = qValues.copy()
                Y_sample[action] = targetValue
                Y_batch = np.append(Y_batch, np.array([Y_sample]), axis=0)
            self.models[modelNr].fit(X_batch, Y_batch, batch_size = 1, verbose = 0)
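
Example No. 6 combines the outputs of several Q-networks (most confident, average, sum, majority vote), hard-coded for two or three networks and two actions. Below is a hedged numpy sketch that generalizes the average / sum / vote combinations to any number of Q-vectors and actions; it is illustrative only and not part of the class above.

# Ensemble combination of Q-value vectors (illustrative sketch)
import numpy as np

def combine_average(q_list):
    return int(np.argmax(np.mean(q_list, axis=0)))

def combine_added(q_list):
    return int(np.argmax(np.sum(q_list, axis=0)))

def combine_vote(q_list):
    # count how often each action is the argmax across networks
    votes = np.bincount([int(np.argmax(q)) for q in q_list],
                        minlength=len(q_list[0]))
    return int(np.argmax(votes))

qs = [np.array([0.1, 0.4]), np.array([0.3, 0.2]), np.array([0.0, 0.5])]
print(combine_average(qs), combine_added(qs), combine_vote(qs))
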
Example No. 7
class DeepQ:
    def __init__(self, size_state, nr_actions, memorySize, discountFactor, learningRate, learnStart):
        self.input_size = size_state
        self.output_size = nr_actions
        self.memory = Memory(memorySize)
        self.discountFactor = discountFactor
        self.learnStart = learnStart
        self.learningRate = learningRate
   
    def initNetworks(self, hiddenLayers):
        model = self.createModel(self.input_size, self.output_size, hiddenLayers, "relu", self.learningRate)
        self.model = model

        targetModel = self.createModel(self.input_size, self.output_size, hiddenLayers, "relu", self.learningRate)
        self.targetModel = targetModel

    def createRegularizedModel(self, inputs, outputs, hiddenLayers, activationType, learningRate):
        bias = True
        dropout = 0
        regularizationFactor = 0.01
        model = Sequential()
        if len(hiddenLayers) == 0: 
            model.add(Dense(self.output_size, input_shape=(self.input_size,), init='lecun_uniform', bias=bias))
            model.add(Activation("linear"))
        else :
            if regularizationFactor > 0:
                model.add(Dense(hiddenLayers[0], input_shape=(self.input_size,), init='lecun_uniform', W_regularizer=l2(regularizationFactor),  bias=bias))
            else:
                model.add(Dense(hiddenLayers[0], input_shape=(self.input_size,), init='lecun_uniform', bias=bias))

            if (activationType == "LeakyReLU") :
                model.add(LeakyReLU(alpha=0.01))
            else :
                model.add(Activation(activationType))
            
            for index in range(1, len(hiddenLayers)-1):
                layerSize = hiddenLayers[index]
                if regularizationFactor > 0:
                    model.add(Dense(layerSize, init='lecun_uniform', W_regularizer=l2(regularizationFactor), bias=bias))
                else:
                    model.add(Dense(layerSize, init='lecun_uniform', bias=bias))
                if (activationType == "LeakyReLU") :
                    model.add(LeakyReLU(alpha=0.01))
                else :
                    model.add(Activation(activationType))
                if dropout > 0:
                    model.add(Dropout(dropout))
            model.add(Dense(self.output_size, init='lecun_uniform', bias=bias))
            model.add(Activation("linear"))
        optimizer = optimizers.RMSprop(lr=learningRate, rho=0.9, epsilon=1e-06)
        model.compile(loss="mse", optimizer=optimizer)
        return model

    def createModel(self, inputs, outputs, hiddenLayers, activationType, learningRate):
        model = Sequential()
        if len(hiddenLayers) == 0: 
            model.add(Dense(self.output_size, input_shape=(self.input_size,), init='lecun_uniform'))
            model.add(Activation("linear"))
        else :
            model.add(Dense(hiddenLayers[0], input_shape=(self.input_size,), init='lecun_uniform'))
            if (activationType == "LeakyReLU") :
                model.add(LeakyReLU(alpha=0.01))
            else :
                model.add(Activation(activationType))
            
            for index in range(1, len(hiddenLayers)-1):
                layerSize = hiddenLayers[index]
                model.add(Dense(layerSize, init='lecun_uniform'))
                if (activationType == "LeakyReLU") :
                    model.add(LeakyReLU(alpha=0.01))
                else :
                    model.add(Activation(activationType))
            model.add(Dense(self.output_size, init='lecun_uniform'))
            model.add(Activation("linear"))
        optimizer = optimizers.RMSprop(lr=learningRate, rho=0.9, epsilon=1e-06)
        model.compile(loss="mse", optimizer=optimizer)
        return model

    def printNetwork(self):
        i = 0
        for layer in self.model.layers:
            weights = layer.get_weights()
            print "layer ",i,": ",weights
            i += 1


    def backupNetwork(self, model, backup):
        weightMatrix = []
        for layer in model.layers:
            weights = layer.get_weights()
            weightMatrix.append(weights)
        i = 0
        for layer in backup.layers:
            weights = weightMatrix[i]
            layer.set_weights(weights)
            i += 1

    def updateTargetNetwork(self):
        self.backupNetwork(self.model, self.targetModel)

    # predict Q values for all the actions
    def getQValues(self, state):
        predicted = self.model.predict(state.reshape(1,len(state)))
        return predicted[0]

    def getTargetQValues(self, state):
        predicted = self.targetModel.predict(state.reshape(1,len(state)))
        return predicted[0]

    def getMaxQ(self, qValues):
        return np.max(qValues)

    def getMaxIndex(self, qValues):
        return np.argmax(qValues)

    # calculate the target function
    def calculateTarget(self, qValuesNewState, reward, isFinal):
        if isFinal:
            return reward
        else : 
            return reward + self.discountFactor * self.getMaxQ(qValuesNewState)

    # select the action with the highest Q value
    def selectAction(self, qValues, explorationRate):
        rand = random.random()
        if rand < explorationRate :
            action = np.random.randint(0, self.output_size)
        else :
            action = self.getMaxIndex(qValues)
        return action

    def selectActionByProbability(self, qValues, bias):
        qValueSum = 0
        shiftBy = 0
        for value in qValues:
            if value + shiftBy < 0:
                shiftBy = - (value + shiftBy)
        shiftBy += 1e-06

        for value in qValues:
            qValueSum += (value + shiftBy) ** bias

        probabilitySum = 0
        qValueProbabilities = []
        for value in qValues:
            probability = ((value + shiftBy) ** bias) / float(qValueSum)
            qValueProbabilities.append(probability + probabilitySum)
            probabilitySum += probability
        qValueProbabilities[len(qValueProbabilities) - 1] = 1

        rand = random.random()
        i = 0
        for value in qValueProbabilities:
            if (rand <= value):
                return i
            i += 1

    def addMemory(self, state, action, reward, newState, isFinal):
        self.memory.addMemory(state, action, reward, newState, isFinal)

    def learnOnLastState(self):
        if self.memory.getCurrentSize() >= 1:
            return self.memory.getMemory(self.memory.getCurrentSize() - 1)

    def learnOnMiniBatch(self, miniBatchSize, useTargetNetwork=True):
        if self.memory.getCurrentSize() > self.learnStart :
            miniBatch = self.memory.getMiniBatch(miniBatchSize)
            X_batch = np.empty((0,self.input_size), dtype = np.float64)
            Y_batch = np.empty((0,self.output_size), dtype = np.float64)
            for sample in miniBatch:
                isFinal = sample['isFinal']
                state = sample['state']
                action = sample['action']
                reward = sample['reward']
                newState = sample['newState']

                qValues = self.getQValues(state)
                if useTargetNetwork:
                    qValuesNewState = self.getTargetQValues(newState)
                else :
                    qValuesNewState = self.getQValues(newState)
                targetValue = self.calculateTarget(qValuesNewState, reward, isFinal)

                X_batch = np.append(X_batch, np.array([state.copy()]), axis=0)
                Y_sample = qValues.copy()
                Y_sample[action] = targetValue
                Y_batch = np.append(Y_batch, np.array([Y_sample]), axis=0)
                # if isFinal:
                #     X_batch = np.append(X_batch, np.array([newState.copy()]), axis=0)
                #     Y_batch = np.append(Y_batch, np.array([[reward]*self.output_size]), axis=0)
            self.model.fit(X_batch, Y_batch, batch_size = len(miniBatch), nb_epoch=1, verbose = 0)
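
All seven examples depend on a Memory replay buffer (addMemory, getCurrentSize, getMemory, getLastMemory, getMiniBatch) that is never shown. The sketch below assumes it is a plain ring buffer with uniform random sampling; the real Memory class may be implemented differently (for example, with prioritized sampling).

# Assumed Memory interface: ring buffer with uniform random minibatch sampling
import random
from collections import deque

class SimpleMemory:
    def __init__(self, size):
        self.buffer = deque(maxlen=size)

    def addMemory(self, state, action, reward, newState, isFinal):
        self.buffer.append({'state': state, 'action': action, 'reward': reward,
                            'newState': newState, 'isFinal': isFinal})

    def getCurrentSize(self):
        return len(self.buffer)

    def getMemory(self, index):
        return self.buffer[index]

    def getLastMemory(self):
        return self.buffer[-1]

    def getMiniBatch(self, size):
        return random.sample(list(self.buffer), min(size, len(self.buffer)))
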