Example #1
    def test_replay_memory(self):

        parser = create_parser()
        params = parser.parse_args()

        replay_mem1 = ReplayMemory(params.replay_capacity, params.batch_size, 84, 84, "test", 10, False, './output')

        env = AtariGymEnvironment(display=False, game="Breakout-v0")
        s1, r1, d1 = env.new_game()
        s2, r2, d2 = env.act(0)

        replay_mem1.add(0, r2, s2, d2)
        replay_mem1.add(0, r2, s2, d2)
        replay_mem1.add(0, r2, s2, d2)

        print(replay_mem1.counter)
        print(replay_mem1.current)

        replay_mem1.save_memory()
        replay_mem2 = ReplayMemory(params.replay_capacity, params.batch_size, 84, 84, "test", 10, True, './output')

        print(replay_mem2.counter)
        print(replay_mem2.current)

        assert replay_mem2.counter == replay_mem1.counter
        assert replay_mem2.current == replay_mem1.current

        print(replay_mem1.num_examples())
        print(replay_mem2.num_examples())
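
For context, a minimal sketch of the kind of circular-buffer replay memory this test exercises is shown below. The `counter`/`current` attributes, `add`, and `num_examples` names come from the example above; the simplified constructor and the internals are illustrative assumptions, not the actual ReplayMemory implementation.

import numpy as np

class SketchReplayMemory:
    """Illustrative circular buffer; not the ReplayMemory used in the test."""

    def __init__(self, capacity, batch_size, height=84, width=84):
        self.capacity = capacity
        self.batch_size = batch_size
        self.actions = np.zeros(capacity, dtype=np.int32)
        self.rewards = np.zeros(capacity, dtype=np.float32)
        self.screens = np.zeros((capacity, height, width), dtype=np.uint8)
        self.terminals = np.zeros(capacity, dtype=bool)
        self.counter = 0  # total number of transitions ever added
        self.current = 0  # next write position in the ring buffer

    def add(self, action, reward, screen, terminal):
        self.actions[self.current] = action
        self.rewards[self.current] = reward
        self.screens[self.current] = screen
        self.terminals[self.current] = terminal
        self.counter += 1
        self.current = (self.current + 1) % self.capacity

    def num_examples(self):
        return min(self.counter, self.capacity)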
Example #2
    def test_get_minibatch(self):
        replay_memory = ReplayMemory(None,
                                     self.use_gpu_replay_mem,
                                     self.max_replay_memory,
                                     self.train_batch_size,
                                     self.screen_history,
                                     self.screen_width,
                                     self.screen_height,
                                     self.minibatch_random,
                                     self.screen_order)

        for i in range(255):
            screen = np.zeros((self.screen_height, self.screen_width))
            screen.fill(i + 1)
            replay_memory.add(i + 1, 10 * (i + 1), screen, False)

            if i > self.train_batch_size + self.screen_history:
                prestates, actions, rewards, poststates, terminals = replay_memory.get_minibatch()
                for b in range(self.train_batch_size - 1):
                    for h in range(self.screen_history - 1):
                        self.assertTrue(prestates[b + 1, 0, 0, h] < prestates[b, 0, 0, h])
                        self.assertTrue(prestates[b, 0, 0, h + 1] > prestates[b, 0, 0, h])
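
The assertions above depend on each sampled pre-state being a stack of screen_history consecutive frames; the memory here keeps the history index on the last axis (hence `prestates[b, 0, 0, h]`), presumably controlled by `screen_order`. Below is a simplified sketch of how such a minibatch can be assembled from flat per-frame arrays, using a `(batch, history, height, width)` layout for brevity; all names in it are illustrative assumptions.

import numpy as np

def sample_minibatch_sketch(screens, actions, rewards, terminals,
                            batch_size, history, rng=np.random):
    """Sample (prestates, actions, rewards, poststates, terminals); illustrative only."""
    n = screens.shape[0]
    idx = rng.randint(history, n - 1, size=batch_size)                    # index of the "current" frame
    prestates = np.stack([screens[i - history:i] for i in idx])           # frames t-H .. t-1
    poststates = np.stack([screens[i - history + 1:i + 1] for i in idx])  # frames t-H+1 .. t
    return prestates, actions[idx], rewards[idx], poststates[idx], terminals[idx]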
Example #3
class TestBinaryHeap(unittest.TestCase):
    def setUp(self):
        self.heap = BinaryHeap()
        self.replayMemory = ReplayMemory(10, 32, 4, 84, 84)

    def test_Add(self):
        totalNo = 10
        for i in range(totalNo):
            state = np.zeros((84, 84), dtype=int)
            state.fill(i)
            td = i
            
            addedIndex = self.replayMemory.add(0, 0, state, 0)
            self.heap.add(addedIndex, td)
            
        for i in range(totalNo):
            topItem = self.heap.getTop()
            self.assertEqual(totalNo - i - 1, topItem[0])
            self.heap.remove(0)
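
This test pairs the replay memory with a heap keyed by TD error, which is the bookkeeping behind prioritized experience replay. A minimal stand-in with the same `add`/`getTop` shape, built on `heapq`, is sketched below; it is purely illustrative and omits the removal-by-position and re-prioritization the real BinaryHeap supports.

import heapq

class SketchTDHeap:
    """Max-heap over (replay index, TD error) pairs; illustrative only."""

    def __init__(self):
        self._heap = []  # stores (-td_error, replay_index) so heapq behaves as a max-heap

    def add(self, replay_index, td_error):
        heapq.heappush(self._heap, (-td_error, replay_index))

    def getTop(self):
        neg_td, replay_index = self._heap[0]
        return replay_index, -neg_td  # (index of the stored transition, its TD error)

    def pop_top(self):
        heapq.heappop(self._heap)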
Example #4
class Agent(BaseModel):
    def __init__(self, config, environment, sess):
        self.sess = sess
        self.weight_dir = 'weight'
        self.env = environment
        #self.history = History(self.config)
        model_dir = './Model/a.model'
        self.memory = ReplayMemory(model_dir)
        self.max_step = 100000
        self.RB_number = 20
        self.num_vehicle = len(self.env.vehicles)
        self.action_all_with_power = np.zeros(
            [self.num_vehicle, 3, 2], dtype='int32'
        )  # actions taken by the V2V links, including the power selection
        self.action_all_with_power_training = np.zeros(
            [20, 3, 2], dtype='int32'
        )  # actions taken by the V2V links during training, including the power selection
        self.reward = []
        self.learning_rate = 0.01
        self.learning_rate_minimum = 0.0001
        self.learning_rate_decay = 0.96
        self.learning_rate_decay_step = 500000
        self.target_q_update_step = 100
        self.discount = 0.5
        self.double_q = True
        self.build_dqn()
        self.V2V_number = 3 * len(
            self.env.vehicles
        )  # every vehicle need to communicate with 3 neighbors
        self.training = True
        #self.actions_all = np.zeros([len(self.env.vehicles),3], dtype = 'int32')
    def merge_action(self, idx, action):
        self.action_all_with_power[idx[0], idx[1], 0] = action % self.RB_number
        self.action_all_with_power[idx[0], idx[1],
                                   1] = int(np.floor(action / self.RB_number))

    def get_state(self, idx):
        # ===============
        #  Get State from the environment
        # =============
        vehicle_number = len(self.env.vehicles)
        V2V_channel = (self.env.V2V_channels_with_fastfading[
            idx[0], self.env.vehicles[idx[0]].destinations[idx[1]], :] -
                       80) / 60  # is this taking an average here?
        V2I_channel = (self.env.V2I_channels_with_fastfading[idx[0], :] -
                       80) / 60
        V2V_interference = (-self.env.V2V_Interference_all[idx[0], idx[1], :] -
                            60) / 60  # first draft: keeping just these three quantities as the state is enough
        NeiSelection = np.zeros(self.RB_number)
        for i in range(3):
            for j in range(3):
                if self.training:
                    NeiSelection[self.action_all_with_power_training[
                        self.env.vehicles[idx[0]].neighbors[i], j, 0]] = 1
                else:
                    NeiSelection[self.action_all_with_power[
                        self.env.vehicles[idx[0]].neighbors[i], j, 0]] = 1

        for i in range(3):
            if i == idx[1]:
                continue
            if self.training:
                if self.action_all_with_power_training[idx[0], i, 0] >= 0:
                    NeiSelection[self.action_all_with_power_training[idx[0], i,
                                                                     0]] = 1
            else:
                if self.action_all_with_power[idx[0], i, 0] >= 0:
                    NeiSelection[self.action_all_with_power[idx[0], i, 0]] = 1
        time_remaining = np.asarray(
            [self.env.demand[idx[0], idx[1]] / self.env.demand_amount])
        load_remaining = np.asarray([
            self.env.individual_time_limit[idx[0], idx[1]] / self.env.V2V_limit
        ])
        #print('shapes', time_remaining.shape,load_remaining.shape)
        return np.concatenate(
            (V2I_channel, V2V_interference, V2V_channel, NeiSelection,
             time_remaining, load_remaining))  #,time_remaining))
        #return np.concatenate((V2I_channel, V2V_interference, V2V_channel, time_remaining, load_remaining))#,time_remaining))
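        # Dimensionality note (assuming 20 resource blocks, since RB_number = 20): the state is
        # 20 (V2I channel) + 20 (V2V interference) + 20 (V2V channel) + 20 (NeiSelection)
        # + 1 (time remaining) + 1 (load remaining) = 82, matching n_input = 82 in build_dqn().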
    def predict(self, s_t, step, test_ep=False):
        # ==========================
        #  Select actions
        # ==========================
        ep = 1 / (step / 1000000 + 1)
        if random.random() < ep and test_ep == False:  # epsilon balances exploration and exploitation
            action = np.random.randint(60)
        else:
            action = self.q_action.eval({self.s_t: [s_t]})[0]
        return action

    def observe(self, prestate: object, state: object, reward: object,
                action: object) -> object:
        # -----------
        # Collect Data for Training
        # ---------
        self.memory.add(
            prestate, state, reward, action
        )  # add the state and the action and the reward to the memory
        #print(self.step)
        if self.step > 0:
            if self.step % 50 == 0:
                #print('Training')
                self.q_learning_mini_batch()  # training a mini batch
                #self.save_weight_to_pkl()
            if self.step % self.target_q_update_step == self.target_q_update_step - 1:
                #print("Update Target Q network:")
                self.update_target_q_network()

    def train(self):
        # Dec 2: could not find where self.step is explicitly incremented (it is the loop variable below)
        num_game, self.update_count, ep_reward = 0, 0, 0.
        total_reward, self.total_loss, self.total_q = 0., 0., 0.
        # max_avg_ep_reward = 0
        # ep_reward, actions = [], []
        # mean_big = 0
        # number_big = 0
        # mean_not_big = 0
        # number_not_big = 0
        self.env.new_random_game(20)
        for self.step in range(0, 40000):  # needs more configuration; self.step advances automatically as the loop variable of range(0, 40000)
            if self.step == 0:  # initialize some variables
                num_game, self.update_count, ep_reward = 0, 0, 0.
                total_reward, self.total_loss, self.total_q = 0., 0., 0.
                ep_reward, actions = [], []

            # prediction
            # action = self.predict(self.history.get())
            if (self.step % 2000 == 1):
                self.env.new_random_game(20)
            print(self.step)
            # state_old = self.get_state([0,0])
            #print("state", state_old)
            self.training = True
            for k in range(1):
                for i in range(len(self.env.vehicles)):
                    for j in range(3):
                        state_old = self.get_state([i, j])
                        action = self.predict(state_old, self.step)
                        #self.merge_action([i,j], action)
                        self.action_all_with_power_training[
                            i, j, 0] = action % self.RB_number
                        self.action_all_with_power_training[i, j, 1] = int(
                            np.floor(action / self.RB_number))
                        reward_train = self.env.act_for_training(
                            self.action_all_with_power_training, [i, j])
                        state_new = self.get_state([i, j])
                        # oddly, the new state obtained here does not depend on this user's chosen action or transmit power?
                        self.observe(state_old, state_new, reward_train,
                                     action)

            if (self.step % 2000 == 0) and (self.step > 0):  # have not yet seen self.step being incremented explicitly (it is the loop variable)
                # testing
                self.training = False
                number_of_game = 10
                if (self.step % 10000 == 0) and (self.step > 0):
                    number_of_game = 50
                if (self.step == 38000):
                    number_of_game = 100
                V2I_Rate_list = np.zeros(number_of_game)
                Fail_percent_list = np.zeros(number_of_game)
                for game_idx in range(number_of_game):
                    self.env.new_random_game(self.num_vehicle)
                    test_sample = 200
                    Rate_list = []
                    print('test game idx:', game_idx)
                    for k in range(test_sample):
                        action_temp = self.action_all_with_power.copy()
                        for i in range(len(self.env.vehicles)):
                            self.action_all_with_power[i, :, 0] = -1
                            sorted_idx = np.argsort(
                                self.env.individual_time_limit[i, :])
                            for j in sorted_idx:
                                state_old = self.get_state([i, j])
                                action = self.predict(state_old, self.step,
                                                      True)
                                self.merge_action([i, j], action)
                            if i % (len(self.env.vehicles) / 10) == 1:
                                action_temp = self.action_all_with_power.copy()
                                reward, percent = self.env.act_asyn(
                                    action_temp)  #self.action_all)
                                Rate_list.append(np.sum(reward))
                        #print("actions", self.action_all_with_power)
                    V2I_Rate_list[game_idx] = np.mean(np.asarray(Rate_list))
                    Fail_percent_list[game_idx] = percent
                    #print("action is", self.action_all_with_power)
                    print('failure probability is, ', percent)
                    #print('action is that', action_temp[0,:])
                self.save_weight_to_pkl()
                print('The number of vehicle is ', len(self.env.vehicles))
                print('Mean of the V2I rate is that ', np.mean(V2I_Rate_list))
                print('Mean of Fail percent is that ',
                      np.mean(Fail_percent_list))
                #print('Test Reward is ', np.mean(test_result))

    def q_learning_mini_batch(self) -> object:

        # Training the DQN model
        # ------
        #s_t, action,reward, s_t_plus_1, terminal = self.memory.sample()
        s_t, s_t_plus_1, action, reward = self.memory.sample()
        #print()
        #print('samples:', s_t[0:10], s_t_plus_1[0:10], action[0:10], reward[0:10])
        t = time.time()
        if self.double_q:  #double Q learning
            pred_action = self.q_action.eval({self.s_t: s_t_plus_1})
            q_t_plus_1_with_pred_action = self.target_q_with_idx.eval({
                self.target_s_t:
                s_t_plus_1,
                self.target_q_idx:
                [[idx, pred_a] for idx, pred_a in enumerate(pred_action)]
            })
            target_q_t = self.discount * q_t_plus_1_with_pred_action + reward
        else:
            q_t_plus_1 = self.target_q.eval({self.target_s_t: s_t_plus_1})
            max_q_t_plus_1 = np.max(q_t_plus_1, axis=1)
            target_q_t = self.discount * max_q_t_plus_1 + reward
        _, q_t, loss, w = self.sess.run(
            [self.optim, self.q, self.loss, self.w], {
                self.target_q_t: target_q_t,
                self.action: action,
                self.s_t: s_t,
                self.learning_rate_step: self.step
            })  # training the network

        print('loss is ', loss)
        self.total_loss += loss
        self.total_q += q_t.mean()
        self.update_count += 1

    def build_dqn(self):
        # --- Building the DQN -------
        self.w = {}
        self.t_w = {}

        initializer = tf.truncated_normal_initializer(0, 0.02)
        activation_fn = tf.nn.relu
        n_hidden_1 = 500
        n_hidden_2 = 250
        n_hidden_3 = 120
        n_input = 82
        n_output = 60

        def encoder(x):
            weights = {
                'encoder_h1':
                tf.Variable(
                    tf.truncated_normal([n_input, n_hidden_1], stddev=0.1)),
                'encoder_h2':
                tf.Variable(
                    tf.truncated_normal([n_hidden_1, n_hidden_2], stddev=0.1)),
                'encoder_h3':
                tf.Variable(
                    tf.truncated_normal([n_hidden_2, n_hidden_3], stddev=0.1)),
                'encoder_h4':
                tf.Variable(
                    tf.truncated_normal([n_hidden_3, n_output], stddev=0.1)),
                'encoder_b1':
                tf.Variable(tf.truncated_normal([n_hidden_1], stddev=0.1)),
                'encoder_b2':
                tf.Variable(tf.truncated_normal([n_hidden_2], stddev=0.1)),
                'encoder_b3':
                tf.Variable(tf.truncated_normal([n_hidden_3], stddev=0.1)),
                'encoder_b4':
                tf.Variable(tf.truncated_normal([n_output], stddev=0.1)),
            }
            layer_1 = tf.nn.relu(
                tf.add(tf.matmul(x, weights['encoder_h1']),
                       weights['encoder_b1']))
            layer_2 = tf.nn.relu(
                tf.add(tf.matmul(layer_1, weights['encoder_h2']),
                       weights['encoder_b2']))
            layer_3 = tf.nn.relu(
                tf.add(tf.matmul(layer_2, weights['encoder_h3']),
                       weights['encoder_b3']))
            layer_4 = tf.nn.relu(
                tf.add(tf.matmul(layer_3, weights['encoder_h4']),
                       weights['encoder_b4']))
            return layer_4, weights

        with tf.variable_scope('prediction'):
            self.s_t = tf.placeholder('float32', [None, n_input])
            self.q, self.w = encoder(self.s_t)
            self.q_action = tf.argmax(
                self.q, dimension=1)  # self.q_action is the index of the largest Q-value; why is [0] taken from it later?
        with tf.variable_scope('target'):
            self.target_s_t = tf.placeholder('float32', [None, n_input])
            self.target_q, self.target_w = encoder(self.target_s_t)
            self.target_q_idx = tf.placeholder('int32', [None, None],
                                               'output_idx')
            self.target_q_with_idx = tf.gather_nd(self.target_q,
                                                  self.target_q_idx)
        with tf.variable_scope('pred_to_target'):
            self.t_w_input = {}
            self.t_w_assign_op = {}
            for name in self.w.keys():
                print('name in self w keys', name)
                self.t_w_input[name] = tf.placeholder(
                    'float32',
                    self.target_w[name].get_shape().as_list(),
                    name=name)
                self.t_w_assign_op[name] = self.target_w[name].assign(
                    self.t_w_input[name])

        def clipped_error(x):
            try:
                return tf.select(
                    tf.abs(x) < 1.0, 0.5 * tf.square(x),
                    tf.abs(x) - 0.5)
            except:
                return tf.where(
                    tf.abs(x) < 1.0, 0.5 * tf.square(x),
                    tf.abs(x) - 0.5)

        with tf.variable_scope('optimizer'):
            self.target_q_t = tf.placeholder('float32',
                                             None,
                                             name='target_q_t')
            self.action = tf.placeholder('int32', None, name='action')
            action_one_hot = tf.one_hot(self.action,
                                        n_output,
                                        1.0,
                                        0.0,
                                        name='action_one_hot')
            q_acted = tf.reduce_sum(self.q * action_one_hot,
                                    reduction_indices=1,
                                    name='q_acted')
            self.delta = self.target_q_t - q_acted
            self.global_step = tf.Variable(0, trainable=False)
            self.loss = tf.reduce_mean(tf.square(self.delta), name='loss')
            self.learning_rate_step = tf.placeholder('int64',
                                                     None,
                                                     name='learning_rate_step')
            self.learning_rate_op = tf.maximum(
                self.learning_rate_minimum,
                tf.train.exponential_decay(self.learning_rate,
                                           self.learning_rate_step,
                                           self.learning_rate_decay_step,
                                           self.learning_rate_decay,
                                           staircase=True))
            self.optim = tf.train.RMSPropOptimizer(self.learning_rate_op,
                                                   momentum=0.95,
                                                   epsilon=0.01).minimize(
                                                       self.loss)

        tf.initialize_all_variables().run()
        self.update_target_q_network()

    def update_target_q_network(self):
        for name in self.w.keys():
            self.t_w_assign_op[name].eval(
                {self.t_w_input[name]: self.w[name].eval()})

    def save_weight_to_pkl(self):
        if not os.path.exists(self.weight_dir):
            os.makedirs(self.weight_dir)
        for name in self.w.keys():
            save_pkl(self.w[name].eval(),
                     os.path.join(self.weight_dir, "%s.pkl" % name))

    def load_weight_from_pkl(self):
        with tf.variable_scope('load_pred_from_pkl'):
            self.w_input = {}
            self.w_assign_op = {}
            for name in self.w.keys():
                self.w_input[name] = tf.placeholder('float32')
                self.w_assign_op[name] = self.w[name].assign(
                    self.w_input[name])
        for name in self.w.keys():
            self.w_assign_op[name].eval({
                self.w_input[name]:
                load_pkl(os.path.join(self.weight_dir, "%s.pkl" % name))
            })
        self.update_target_q_network()

    def play(self, n_step=100, n_episode=100, test_ep=None, render=False):
        number_of_game = 100
        V2I_Rate_list = np.zeros(number_of_game)
        Fail_percent_list = np.zeros(number_of_game)
        self.load_weight_from_pkl()
        self.training = False

        for game_idx in range(number_of_game):
            self.env.new_random_game(self.num_vehicle)
            test_sample = 200
            Rate_list = []
            print('test game idx:', game_idx)
            print('The number of vehicle is ', len(self.env.vehicles))
            time_left_list = []
            power_select_list_0 = []
            power_select_list_1 = []
            power_select_list_2 = []

            for k in range(test_sample):
                action_temp = self.action_all_with_power.copy()
                for i in range(len(self.env.vehicles)):
                    self.action_all_with_power[i, :, 0] = -1
                    sorted_idx = np.argsort(
                        self.env.individual_time_limit[i, :])
                    for j in sorted_idx:
                        state_old = self.get_state([i, j])
                        time_left_list.append(state_old[-1])
                        action = self.predict(state_old, 0, True)
                        '''
                        if state_old[-1] <=0:
                            continue
                        power_selection = int(np.floor(action/self.RB_number))
                        if power_selection == 0:
                            power_select_list_0.append(state_old[-1])

                        if power_selection == 1:
                            power_select_list_1.append(state_old[-1])
                        if power_selection == 2:
                            power_select_list_2.append(state_old[-1])
                        '''
                        self.merge_action([i, j], action)
                    if i % (len(self.env.vehicles) / 10) == 1:
                        action_temp = self.action_all_with_power.copy()
                        reward, percent = self.env.act_asyn(
                            action_temp)  # self.action_all)
                        Rate_list.append(np.sum(reward))
                # print("actions", self.action_all_with_power)
            '''
            number_0, bin_edges = np.histogram(power_select_list_0, bins = 10)

            number_1, bin_edges = np.histogram(power_select_list_1, bins = 10)

            number_2, bin_edges = np.histogram(power_select_list_2, bins = 10)


            p_0 = number_0 / (number_0 + number_1 + number_2)
            p_1 = number_1 / (number_0 + number_1 + number_2)
            p_2 = number_2 / (number_0 + number_1 + number_2)

            plt.plot(bin_edges[:-1]*0.1 + 0.01, p_0, 'b*-', label='Power Level 23 dB')
            plt.plot(bin_edges[:-1]*0.1 + 0.01, p_1, 'rs-', label='Power Level 10 dB')
            plt.plot(bin_edges[:-1]*0.1 + 0.01, p_2, 'go-', label='Power Level 5 dB')
            plt.xlim([0,0.12])
            plt.xlabel("Time left for V2V transmission (s)")
            plt.ylabel("Probability of power selection")
            plt.legend()
            plt.grid()
            plt.show()
            '''
            V2I_Rate_list[game_idx] = np.mean(np.asarray(Rate_list))
            Fail_percent_list[game_idx] = percent

            print('Mean of the V2I rate is that ',
                  np.mean(V2I_Rate_list[0:game_idx]))
            print('Mean of Fail percent is that ', percent,
                  np.mean(Fail_percent_list[0:game_idx]))
            # print('action is that', action_temp[0,:])

        print('The number of vehicle is ', len(self.env.vehicles))
        print('Mean of the V2I rate is that ', np.mean(V2I_Rate_list))
        print('Mean of Fail percent is that ', np.mean(Fail_percent_list))
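
As a side note, the target built in `q_learning_mini_batch` above is the double-DQN rule: the online network selects the greedy action for the next state and the target network evaluates it. A NumPy-only sketch of that computation follows; the function and argument names are illustrative, and, like the class above, it does not zero out targets at terminal states.

import numpy as np

def double_dqn_target_sketch(q_online_next, q_target_next, reward, discount=0.5):
    """q_online_next, q_target_next: (batch, n_actions) Q-values for the next states."""
    best_actions = np.argmax(q_online_next, axis=1)                        # chosen by the online net
    evaluated = q_target_next[np.arange(len(best_actions)), best_actions]  # scored by the target net
    return discount * evaluated + reward                                   # same form as target_q_t above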
Example #5
def train(params):
    
    # Load the Atari ROM and prepare the ALE environment
    atari = GymEnvironment(params.random_start_wait, params.show_game)

    # Initialize two Q-Value Networks one for training and one for target prediction
    dqn_train  = DeepQNetwork(
        params=params,
        num_actions=atari.num_actions,
        network_name="qnetwork-train",
        trainable=True
    )

    # Q-Network for predicting target Q-values
    dqn_target= DeepQNetwork(
        params=params,
        num_actions=atari.num_actions,
        network_name="qnetwork-target",
        trainable=False
    )
    
    # Initialize replay memory for storing experience to sample batches from
    replay_mem = ReplayMemory(params.replay_capacity, params.batch_size)

    # Small structure for storing the last four screens
    history = ScreenHistory(params)

    # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
    replay_mem_dump   = os.path.abspath(os.path.join(params.output_dir, "replay_memory.hdf5"))
    checkpoint_dir    = os.path.abspath(os.path.join(params.output_dir, "checkpoints"))
    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    train_step         = 0
    count_actions      = np.zeros(atari.num_actions)   # Count per action (only greedy)
    count_act_random   = 0  # Count of random actions
    count_act_greedy   = 0  # Count of greedy actions

    # Histories of qvalues and loss for running average
    qvalues_hist = collections.deque([0]*params.interval_summary,  maxlen=params.interval_summary)
    loss_hist    = collections.deque([10]*params.interval_summary, maxlen=params.interval_summary)

    # Time measurements
    dt_batch_gen    = collections.deque([0]*10, maxlen=10)
    dt_optimization = collections.deque([0]*10, maxlen=10)
    dt_train_total  = collections.deque([0]*10, maxlen=10)

    # Optionally load pre-initialized replay memory from disk
    if params.replay_mem_dump is not None and params.is_train:
        print("Loading pre-initialized replay memory from HDF5 file.")
        replay_mem.load(params.replay_mem_dump)


    # Initialize a new game and store the screens in the history
    reward, screen, is_terminal = atari.new_random_game()
    for _ in xrange(params.history_length):
        history.add(screen)

    # Initialize the TensorFlow session
    gpu_options = tf.GPUOptions(
       per_process_gpu_memory_fraction=0.4
    )

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        # Initialize the TensorFlow session
        init = tf.initialize_all_variables()
        sess.run(init)

        # Only save trainable variables and the global step to disk
        tf_vars_to_save = tf.trainable_variables() + [dqn_train.global_step]
        saver = tf.train.Saver(tf_vars_to_save, max_to_keep=40)

        if params.model_file is not None:
            # Load pre-trained model from disk
            saver.restore(sess, params.model_file)
            train_step, learning_rate = sess.run([dqn_train.global_step, dqn_train.learning_rate])
            print("Restarted training from model file. Step = %06i, Learning Rate = %.5f" % (train_step, learning_rate))


        # Initialize summary writer
        dqn_train.build_summary_writer(sess)

        # Initialize the target Q-Network fixed with the same weights
        update_target_network(sess, "qnetwork-train", "qnetwork-target")


        for step in xrange(params.num_steps):

            replay_mem_size = replay_mem.num_examples()
            if params.is_train and replay_mem_size < params.train_start and step % 1000 == 0:
                print("Initializing replay memory %i/%i" % (step, params.train_start))

            # Epsilon-greedy exploration: with probability epsilon choose a
            # random action, otherwise act greedily with the action having the
            # maximal Q-value. Note the minimum epsilon of 0.1.
            if params.is_train:
                epsilon = max(0.1, 1.0-float(train_step*params.train_freq) / float(params.epsilon_step))
            else:
                epsilon = 0.05


            ################################################################
            ####################### SELECT A MOVE ##########################
            ################################################################

            # Either choose a random action or predict the action using the Q-network
            do_random_action = (random.random() < epsilon)
            if do_random_action or (replay_mem_size < params.train_start and params.is_train):
                action_id = random.randrange(atari.num_actions)
                count_act_random += 1
            else:

                # Get the last screens from the history and perform
                # feed-forward through the network to compute Q-values
                feed_dict  = { dqn_train.pl_screens: history.get() }
                qvalues    = sess.run(dqn_train.qvalues, feed_dict=feed_dict)

                # Choose the best action based on the approximated Q-values
                qvalue_max = np.max(qvalues[0])
                action_id  = np.argmax(qvalues[0])

                count_act_greedy += 1
                count_actions[action_id] += 1
                qvalues_hist.append(qvalue_max)


            ################################################################
            ####################### PLAY THE MOVE ##########################
            ################################################################

            # Play the selected action (either random or predicted) on the Atari game
            # Note that the action is performed for k = 4 frames (frame skipping)
            cumulative_reward, screen, is_terminal = atari.act(action_id)

            # Perform reward clipping and add the example to the replay memory
            cumulative_reward = min(+1.0, max(-1.0, cumulative_reward))

            # Add the screen to short term history and replay memory
            history.add(screen)

            # Add experience to replay memory
            if params.is_train:
                replay_mem.add(action_id, cumulative_reward, screen, is_terminal)

            # Check if we are game over, and if yes, initialize a new game
            if is_terminal:
                reward, screen, is_terminal = atari.new_random_game()
                replay_mem.add(0, reward, screen, is_terminal)
                history.add(screen)


            ################################################################
            ###################### TRAINING MODEL ##########################
            ################################################################


            if params.is_train and step > params.train_start and step % params.train_freq == 0:

                t1 = time.time()

                # Prepare batch and train the network
                # TODO: set actions with terminal == 1 to reward = -1 ??
                screens_in, actions, rewards, screens_out, terminals = replay_mem.sample_batch()

                dt_batch_gen.append(time.time() - t1)
                t2 = time.time()

                # Compute the target rewards from the previously fixed network
                # Note that the forward run is performed on the output screens.
                qvalues_target = sess.run(
                    dqn_target.qvalues,
                    feed_dict={ dqn_target.pl_screens: screens_out }
                )

                # Inputs for trainable Q-network
                feed_dict = {
                    dqn_train.pl_screens   : screens_in,
                    dqn_train.pl_actions   : actions,
                    dqn_train.pl_rewards   : rewards,
                    dqn_train.pl_terminals : terminals,
                    dqn_train.pl_qtargets  : np.max(qvalues_target, axis=1),

                }

                # Actual training operation
                _, loss, train_step = sess.run([dqn_train.train_op,
                                                dqn_train.loss,
                                                dqn_train.global_step],
                                                feed_dict=feed_dict)

                t3 = time.time()
                dt_optimization.append(t3 - t2)
                dt_train_total.append(t3 - t1)

                # Running average of the loss
                loss_hist.append(loss)

                # Check whether the returned loss is NaN
                if np.isnan(loss):
                    print("[%s] Training failed with loss = NaN." %
                          datetime.now().strftime("%Y-%m-%d %H:%M"))


                # Once every n = 10000 frames update the Q-network for predicting targets
                if train_step % params.network_update_rate == 0:
                    print("[%s] Updating target network." % datetime.now().strftime("%Y-%m-%d %H:%M"))
                    update_target_network(sess, "qnetwork-train", "qnetwork-target")


                ################################################################
                ####################### MODEL EVALUATION #######################
                ################################################################

                if params.is_train and train_step % params.eval_frequency == 0:

                    eval_total_reward = 0
                    eval_num_episodes = 0
                    eval_num_rewards = 0
                    eval_episode_max_reward = 0
                    eval_episode_reward = 0
                    eval_actions = np.zeros(atari.num_actions)

                    # Initialize new game without random start moves
                    reward, screen, terminal = atari.new_game()
                    for _ in range(4):
                        history.add(screen)

                    for eval_step in range(params.eval_steps):

                        if random.random() < params.eval_epsilon:
                            # Random action
                            action_id = random.randrange(atari.num_actions)
                        else:
                            # Greedy action
                            # Get the last screens from the history and perform
                            # feed-forward through the network to compute Q-values
                            feed_dict_eval  = { dqn_train.pl_screens: history.get() }
                            qvalues = sess.run(dqn_train.qvalues, feed_dict=feed_dict_eval)

                            # Choose the best action based on the approximated Q-values
                            qvalue_max = np.max(qvalues[0])
                            action_id  = np.argmax(qvalues[0])

                        # Keep track of how many of each action is performed
                        eval_actions[action_id] += 1

                        # Perform the action
                        reward, screen, terminal = atari.act(action_id)
                        history.add(screen)

                        eval_episode_reward += reward
                        if reward > 0:
                            eval_num_rewards += 1

                        if terminal:
                            eval_total_reward += eval_episode_reward
                            eval_episode_max_reward = max(eval_episode_reward, eval_episode_max_reward)
                            eval_episode_reward = 0
                            eval_num_episodes += 1

                            reward, screen, terminal = atari.new_game()
                            for _ in range(4):
                                history.add(screen)

                    # Send statistics about the environment to TensorBoard
                    eval_update_ops = [
                        dqn_train.eval_rewards.assign(eval_total_reward),
                        dqn_train.eval_num_rewards.assign(eval_num_rewards),
                        dqn_train.eval_max_reward.assign(eval_episode_max_reward),
                        dqn_train.eval_num_episodes.assign(eval_num_episodes),
                        dqn_train.eval_actions.assign(eval_actions / np.sum(eval_actions))

                    ]
                    sess.run(eval_update_ops)
                    summaries = sess.run(dqn_train.eval_summary_op, feed_dict=feed_dict)
                    dqn_train.train_summary_writer.add_summary(summaries, train_step)

                    print("[%s] Evaluation Summary" % datetime.now().strftime("%Y-%m-%d %H:%M"))
                    print("  Total Reward: %i" % eval_total_reward)
                    print("  Max Reward per Episode: %i" % eval_episode_max_reward)
                    print("  Num Episodes: %i" % eval_num_episodes)
                    print("  Num Rewards: %i" % eval_num_rewards)


                ################################################################
                ###################### PRINTING / SAVING #######################
                ################################################################

                # Write a training summary to disk
                if params.is_train and train_step % params.interval_summary == 0:

                    avg_dt_batch_gen    = sum(dt_batch_gen)    / float(len(dt_batch_gen))
                    avg_dt_optimization = sum(dt_optimization) / float(len(dt_optimization))
                    avg_dt_total        = sum(dt_train_total)  / float(len(dt_train_total))
                    # print("Avg. Time Batch Preparation: %.3f seconds" % avg_dt_batch_gen)
                    # print("Avg. Time Train Operation:   %.3f seconds" % avg_dt_train_op)
                    # print("Avg. Time Total per Batch:   %.3f seconds (%.2f samples/second)" %
                    #       (avg_dt_total, (1.0/avg_dt_total)*params.batch_size))

                    # Send statistics about the environment to TensorBoard
                    update_game_stats_ops = [
                        dqn_train.avg_reward_per_game.assign(atari.avg_reward_per_episode()),
                        dqn_train.max_reward_per_game.assign(atari.max_reward_per_episode),
                        dqn_train.avg_moves_per_game.assign(atari.avg_steps_per_episode()),
                        dqn_train.total_reward_replay.assign(replay_mem.total_reward()),
                        dqn_train.num_games_played.assign(atari.episode_number),
                        dqn_train.actions_random.assign(count_act_random),
                        dqn_train.actions_greedy.assign(count_act_greedy),
                        dqn_train.runtime_batch.assign(avg_dt_batch_gen),
                        dqn_train.runtime_train.assign(avg_dt_optimization),
                        dqn_train.runtime_total.assign(avg_dt_total),
                        dqn_train.samples_per_second.assign((1.0/avg_dt_total)*params.batch_size)
                    ]
                    sess.run(update_game_stats_ops)

                    # Build and save summaries
                    summaries = sess.run(dqn_train.train_summary_op, feed_dict=feed_dict)
                    dqn_train.train_summary_writer.add_summary(summaries, train_step)

                    avg_qvalue = avg_loss = 0
                    for i in xrange(len(qvalues_hist)):
                        avg_qvalue += qvalues_hist[i]
                        avg_loss   += loss_hist[i]

                    avg_qvalue /= float(len(qvalues_hist))
                    avg_loss   /= float(len(loss_hist))

                    format_str = "[%s] Step %06i, ReplayMemory = %i, Epsilon = %.4f, "\
                                 "Episodes = %i, Avg.Reward = %.2f, Max.Reward = %.2f, Avg.QValue = %.4f, Avg.Loss = %.6f"
                    print(format_str % (datetime.now().strftime("%Y-%m-%d %H:%M"), train_step,
                                        replay_mem.num_examples(), epsilon, atari.episode_number,
                                        atari.avg_reward_per_episode(), atari.max_reward_per_episode,
                                        avg_qvalue, avg_loss))

                    # For debugging purposes, dump the batch to disk
                    #print("[%s] Writing batch images to file (debugging)" %
                    #      datetime.now().strftime("%Y-%m-%d %H:%M"))
                    #batch_output_dir = os.path.join(params.output_dir, "batches/%06i/" % train_step)
                    #replay_mem.write_batch_to_disk(batch_output_dir, screens_in, actions, rewards, screens_out)

                # Write model checkpoint to disk
                if params.is_train and train_step % params.interval_checkpoint == 0:
                    path = saver.save(sess, checkpoint_prefix, global_step=train_step)
                    print("[%s] Saving TensorFlow model checkpoint to disk." %
                          datetime.now().strftime("%Y-%m-%d %H:%M"))

                    # Dump the replay memory to disk
                    # TODO: fix this!
                    # print("[%s] Saving replay memory to disk." %
                    #       datetime.now().strftime("%Y-%m-%d %H:%M"))
                    # replay_mem.save(replay_mem_dump)

                    sum_actions = float(reduce(lambda x, y: x+y, count_actions))
                    action_str = ""
                    for action_id, action_count in enumerate(count_actions):
                        action_perc = action_count/sum_actions if not sum_actions == 0 else 0
                        action_str += "<%i, %s, %i, %.2f> " % \
                                      (action_id, atari.action_to_string(action_id),
                                       action_count, action_perc)

                    format_str = "[%s] Q-Network Actions Summary: NumRandom: %i, NumGreedy: %i, %s"
                    print(format_str % (datetime.now().strftime("%Y-%m-%d %H:%M"),
                                        count_act_random, count_act_greedy, action_str))

        print("Finished training Q-network.")
Example #6
def train(sess, environment, actor, critic, embeddings, history_length,
          ra_length, buffer_size, batch_size, discount_factor, nb_episodes,
          filename_summary):
    ''' Algorithm 3 in article. '''

    # Set up summary operators
    def build_summaries():
        episode_reward = tf.Variable(0.)
        tf.summary.scalar('reward', episode_reward)
        episode_max_Q = tf.Variable(0.)
        tf.summary.scalar('max_Q_value', episode_max_Q)
        critic_loss = tf.Variable(0.)
        tf.summary.scalar('critic_loss', critic_loss)

        summary_vars = [episode_reward, episode_max_Q, critic_loss]
        summary_ops = tf.summary.merge_all()
        return summary_ops, summary_vars

    summary_ops, summary_vars = build_summaries()
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(filename_summary, sess.graph)

    # '2: Initialize target network f′ and Q′'
    actor.init_target_network()
    critic.init_target_network()

    # '3: Initialize the capacity of replay memory D'
    replay_memory = ReplayMemory(buffer_size)  # Memory D in article
    replay = False

    start_time = time.time()
    for i_session in range(nb_episodes):  # '4: for session = 1, M do'
        session_reward = 0
        session_Q_value = 0
        session_critic_loss = 0

        # '5: Reset the item space I' is a no-op here because the item space never changes.

        states = environment.reset(
        )  # '6: Initialize state s_0 from previous sessions'

        if (i_session +
                1) % 10 == 0:  # Update average parameters every 10 episodes
            environment.groups = environment.get_groups()

        exploration_noise = OrnsteinUhlenbeckNoise(history_length *
                                                   embeddings.size())

        for t in range(nb_rounds):  # '7: for t = 1, T do'
            # '8: Stage 1: Transition Generating Stage'

            # '9: Select an action a_t = {a_t^1, ..., a_t^K} according to Algorithm 2'
            actions = actor.get_recommendation_list(
                ra_length,
                states.reshape(
                    1, -1),  # TODO + exploration_noise.get().reshape(1, -1),
                embeddings).reshape(ra_length, embeddings.size())

            # '10: Execute action a_t and observe the reward list {r_t^1, ..., r_t^K} for each item in a_t'
            rewards, next_states = environment.step(actions)

            # '19: Store transition (s_t, a_t, r_t, s_t+1) in D'
            replay_memory.add(
                states.reshape(history_length * embeddings.size()),
                actions.reshape(ra_length * embeddings.size()), [rewards],
                next_states.reshape(history_length * embeddings.size()))

            states = next_states  # '20: Set s_t = s_t+1'

            session_reward += rewards

            # '21: Stage 2: Parameter Updating Stage'
            if replay_memory.size() >= batch_size:  # Experience replay
                replay = True
                replay_Q_value, critic_loss = experience_replay(
                    replay_memory, batch_size, actor, critic, embeddings,
                    ra_length, history_length * embeddings.size(),
                    ra_length * embeddings.size(), discount_factor)
                session_Q_value += replay_Q_value
                session_critic_loss += critic_loss

            summary_str = sess.run(summary_ops,
                                   feed_dict={
                                       summary_vars[0]: session_reward,
                                       summary_vars[1]: session_Q_value,
                                       summary_vars[2]: session_critic_loss
                                   })

            writer.add_summary(summary_str, i_session)
            '''
            print(state_to_items(embeddings.embed(data['state'][0]), actor, ra_length, embeddings),
                  state_to_items(embeddings.embed(data['state'][0]), actor, ra_length, embeddings, True))
            '''

        str_loss = str('Loss=%0.4f' % session_critic_loss)
        print(('Episode %d/%d Reward=%d Time=%ds ' +
               (str_loss if replay else 'No replay')) %
              (i_session + 1, nb_episodes, session_reward,
               time.time() - start_time))
        start_time = time.time()

    writer.close()
    tf.train.Saver().save(sess, 'models.h5', write_meta_graph=False)
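
`OrnsteinUhlenbeckNoise` is referenced here but not defined. A common minimal implementation of that exploration process is sketched below; the class name and `get()` method mirror the call above, while the parameter values are illustrative defaults rather than the ones used in the article.

import numpy as np

class OrnsteinUhlenbeckNoise:
    """Temporally correlated exploration noise (illustrative sketch)."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.state = np.full(size, mu, dtype=np.float64)

    def get(self):
        # dx_t = theta * (mu - x_t) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.state.shape[0])
        self.state = self.state + dx
        return self.state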
Example #7
        # YOUR CODE HERE
        if np.random.random() <= epsilon:
            action = np.random.randint(0, env.action_space.n)
        else:
            action = np.argmax(model.predict(obs[np.newaxis, :]))

        # step environment
        next_obs, reward, done, info = env.step(action)
        if args.render:
            env.render()

        # TODO: Add current experience to replay memory

        # YOUR CODE HERE
        replay_memory.add(obs, action, reward, done, next_obs)

        # statistics
        episode_reward += reward
        episode_length += 1
        # if episode ended
        if done:
            # reset environment
            obs = env.reset()

            # statistics
            episode_num += 1
            rewards.append(episode_reward)
            lengths.append(episode_length)
            episode_reward = 0
            episode_length = 0
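
The snippet calls `replay_memory.add(obs, action, reward, done, next_obs)` without showing the buffer itself. A deque-based buffer with that five-argument `add` and a matching `sample` might look like the sketch below; it is an assumption for illustration, not the exercise's reference solution.

import random
from collections import deque

import numpy as np

class SketchReplayBuffer:
    """Illustrative uniform replay buffer matching the add() call above."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def add(self, obs, action, reward, done, next_obs):
        self.buffer.append((obs, action, reward, done, next_obs))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        obs, actions, rewards, dones, next_obs = (np.array(x) for x in zip(*batch))
        return obs, actions, rewards, dones, next_obs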
Example #8
class Actor:
    def __init__(self, actor_id, n_actors, device='cpu'):
        # params
        self.gamma = 0.99
        self.epsilon = 0.4**(1 + actor_id * 7 / (n_actors - 1))
        self.bootstrap_steps = 3
        self.alpha = 0.6
        self.priority_epsilon = 1e-6
        self.device = device
        self.actor_id = actor_id

        # path
        self.memory_path = os.path.join('./', 'logs', 'memory')
        self.net_path = os.path.join('./', 'logs', 'model', 'net.pt')
        self.target_net_path = os.path.join('./', 'logs', 'model',
                                            'target_net.pt')

        # memory
        self.memory_size = 50000
        self.batch_size = 32
        self.action_repeat = 4
        self.n_stacks = 4
        self.stack_count = self.n_stacks // self.action_repeat
        self.memory_save_interval = 1
        self.n_steps_memory = NStepMemory(self.bootstrap_steps, self.gamma)
        self.replay_memory = ReplayMemory(self.memory_size, self.batch_size,
                                          self.bootstrap_steps)

        # net
        self.net_load_interval = 5
        self.net = QNet(self.net_path).to(self.device)
        self.target_net = QNet(self.target_net_path).to(self.device)
        self.target_net.load_state_dict(self.net.state_dict())

        # env
        self.env = PongEnv(self.action_repeat, self.n_stacks)
        self.episode_reward = 0
        self.n_episodes = 0
        self.n_steps = 0
        self.memory_count = 0
        self.state = self.env.reset()

    def run(self):
        while True:
            self.step()

    def step(self):
        state = self.state
        action = self.select_action(state)
        next_state, reward, done, _ = self.env.step(action)
        self.episode_reward += reward
        self.n_steps += 1

        self.n_steps_memory.add(state[-self.action_repeat:], action, reward,
                                self.stack_count)
        if self.stack_count > 1:
            self.stack_count -= 1

        if self.n_steps > self.bootstrap_steps:
            state, action, reward, stack_count = self.n_steps_memory.get()
            self.replay_memory.add(state, action, reward, done, stack_count)
            self.memory_count += 1
        self.state = next_state.copy()

        if done:
            while self.n_steps_memory.size > 0:
                state, action, reward, stack_count = self.n_steps_memory.get()
                self.replay_memory.add(state, action, reward, done,
                                       stack_count)
                self.memory_count += 1
            self.reset()

    def select_action(self, state):
        if np.random.random() < self.epsilon:
            action = np.random.randint(6)
        else:
            state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            with torch.no_grad():
                q_val = self.net(state)
                action = q_val.argmax().item()
        return action

    def reset(self):
        if self.n_episodes % 1 == 0:
            print('episodes:', self.n_episodes, 'actor_id:', self.actor_id,
                  'return:', self.episode_reward)

        self.calc_priority()
        self.state = self.env.reset()
        self.episode_reward = 0
        self.n_episodes += 1
        self.n_steps = 0
        self.memory_count = 0
        self.stack_count = self.n_stacks // self.action_repeat

        # reset n_step memory
        self.n_steps_memory = NStepMemory(self.bootstrap_steps, self.gamma)

        # save replay memory
        if self.n_episodes % self.memory_save_interval == 0:
            self.replay_memory.save(self.memory_path, self.actor_id)
            self.replay_memory = ReplayMemory(self.memory_size,
                                              self.batch_size,
                                              self.bootstrap_steps)

        # load net
        if self.n_episodes % self.net_load_interval == 0:
            self.net.load()
            self.target_net.load()

    def calc_priority(self):
        last_index = self.replay_memory.size
        start_index = last_index - self.memory_count

        batch, index = self.replay_memory.indexing_sample(
            start_index, last_index, self.device)
        batch_size = batch['state'].shape[0]
        priority = np.zeros(batch_size, dtype=np.float32)

        mini_batch_size = 500
        for start_index in range(0, batch_size, mini_batch_size):
            last_index = min(start_index + mini_batch_size, batch_size)
            mini_batch = dict()
            for key in batch.keys():
                if key in ['reward', 'done']:
                    mini_batch[key] = batch[key][start_index:last_index]
                else:
                    mini_batch[key] = torch.tensor(
                        batch[key][start_index:last_index]).to(self.device)
            mini_batch['action'] = mini_batch['action'].view(-1, 1).long()

            with torch.no_grad():
                # q_value
                q_value = self.net(mini_batch['state']).gather(
                    1, mini_batch['action']).view(-1, 1).cpu().numpy()

                # target_q_value
                next_action = torch.argmax(self.net(mini_batch['next_state']),
                                           1).view(-1, 1)
                next_q_value = self.target_net(
                    mini_batch['next_state']).gather(
                        1, next_action).cpu().numpy()

            target_q_value = mini_batch['reward'] + (
                self.gamma**
                self.bootstrap_steps) * next_q_value * (1 - mini_batch['done'])
            delta = np.abs(q_value -
                           target_q_value).reshape(-1) + self.priority_epsilon
            delta = delta**self.alpha
            priority[start_index:last_index] = delta

        self.replay_memory.update_priority(index, priority)
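
The `NStepMemory` used above is not shown; its role is to turn per-step rewards into n-step discounted returns before they reach the replay memory. A simplified version with a reduced interface (no stack_count handling) is sketched below as an assumption:

from collections import deque

class SketchNStepMemory:
    """Builds n-step returns R_t = sum_k gamma**k * r_{t+k}; illustrative only."""

    def __init__(self, n_steps, gamma):
        self.n_steps, self.gamma = n_steps, gamma
        self.buffer = deque()

    @property
    def size(self):
        return len(self.buffer)

    def add(self, state, action, reward):
        self.buffer.append((state, action, reward))

    def get(self):
        # Pop the oldest transition and attach the discounted return accumulated so far.
        state, action, _ = self.buffer[0]
        n_step_return = sum(self.gamma ** k * r for k, (_, _, r) in enumerate(self.buffer))
        self.buffer.popleft()
        return state, action, n_step_return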
Example #9
class DQN(Agent):
    def __init__(self,
                 approximator,
                 policy,
                 mdp_info,
                 batch_size,
                 target_update_frequency,
                 initial_replay_size,
                 train_frequency,
                 max_replay_size,
                 fit_params=None,
                 approximator_params=None,
                 n_approximators=1,
                 history_length=1,
                 clip_reward=True,
                 max_no_op_actions=0,
                 no_op_action_value=0,
                 p_mask=2 / 3.,
                 dtype=np.float32,
                 weighted_update=False):
        self._fit_params = dict() if fit_params is None else fit_params

        self._batch_size = batch_size
        self._n_approximators = n_approximators
        self._clip_reward = clip_reward
        self._target_update_frequency = target_update_frequency // train_frequency
        self._max_no_op_actions = max_no_op_actions
        self._no_op_action_value = no_op_action_value
        self._p_mask = p_mask
        self.weighted_update = weighted_update
        self._replay_memory = ReplayMemory(mdp_info, initial_replay_size,
                                           max_replay_size, history_length,
                                           n_approximators, dtype)
        self._buffer = Buffer(history_length, dtype)

        self._n_updates = 0
        self._episode_steps = 0
        self._no_op_actions = None

        apprx_params_train = deepcopy(approximator_params)
        apprx_params_train['name'] = 'train'
        apprx_params_target = deepcopy(approximator_params)
        apprx_params_target['name'] = 'target'
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

        super(DQN, self).__init__(policy, mdp_info)

    def fit(self, dataset):

        mask = np.random.binomial(1,
                                  self._p_mask,
                                  size=(len(dataset), self._n_approximators))
        self._replay_memory.add(dataset, mask)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _, mask =\
                self._replay_memory.get(self._batch_size)

            q = np.array(self.approximator.predict(state))[0]
            q = q.reshape((self._n_approximators * self._batch_size, -1))
            q = q[np.arange(self._n_approximators * self._batch_size),
                  np.tile(action.ravel(), self._n_approximators)]
            q = q.reshape((self._n_approximators, self._batch_size)).T

            idxs = q.argsort()

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next = self._next_q(next_state, absorbing)
            q_next_ordered = np.sort(q_next)
            #order target values to match the source values
            for i in range(idxs.shape[0]):
                q_next[i, idxs[i]] = q_next_ordered[i]

            q = reward.reshape(self._batch_size,
                               1) + self.mdp_info.gamma * q_next
            self.approximator.fit(state,
                                  action,
                                  q,
                                  mask=mask,
                                  **self._fit_params)

            self._n_updates += 1

            if self._n_updates % self._target_update_frequency == 0:
                self._update_target()

    def _update_target(self):
        """
        Update the target network.

        """
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                `next_state`.

        Returns:
            Maximum action-value for each state in `next_state`.

        """
        q = np.array(self.target_approximator.predict(next_state))[0]
        for i in range(q.shape[1]):
            if absorbing[i]:
                q[:, i, :] *= 1. - absorbing[i]

        if not self.weighted_update:
            #find best actions
            best_actions = np.argmax(np.mean(q, axis=0), axis=1)
            max_q = np.zeros((q.shape[1], q.shape[0]))
            for i in range(q.shape[1]):
                max_q[i, :] = q[:, i, best_actions[i]]
            return max_q
        else:
            N = q.shape[0]
            num_actions = q.shape[2]
            batch_size = q.shape[1]
            probs = np.zeros((batch_size, num_actions))
            weights = 1 / N
            #calculate probability of being maximum
            for b in range(batch_size):
                for i in range(num_actions):
                    particles = q[:, b, i]
                    p = 0
                    for k in range(N):
                        p2 = 1
                        p_k = particles[k]
                        for j in range(num_actions):
                            if (j != i):
                                particles2 = q[:, b, j]
                                p3 = 0
                                for l in range(N):
                                    if particles2[l] <= p_k:
                                        p3 += weights
                                p2 *= p3
                        p += weights * p2
                    probs[b, i] = p
            max_q = np.zeros((batch_size, N))
            for i in range(batch_size):
                particles = np.zeros(N)
                for j in range(num_actions):
                    particles += q[:, i, j] * probs[i, j]
                max_q[i, :] = particles
            return max_q

    def draw_action(self, state):
        self._buffer.add(state)

        if self._episode_steps < self._no_op_actions:
            action = np.array([self._no_op_action_value])
            self.policy.update_epsilon(state)
        else:
            extended_state = self._buffer.get()

            action = super(DQN, self).draw_action(extended_state)

        self._episode_steps += 1
        return action

    def episode_start(self):
        if self._max_no_op_actions == 0:
            self._no_op_actions = 0
        else:
            self._no_op_actions = np.random.randint(
                self._buffer.size, self._max_no_op_actions + 1)
        self._episode_steps = 0
        self.policy.set_idx(np.random.randint(self._n_approximators))
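
The quadruple loop in the weighted_update branch of _next_q above estimates, for every state in the batch, the probability that each action is the maximiser, given N particles per action. As a sketch only (not part of the original code; the function name prob_of_max and the assumed shape (n_approximators, batch_size, n_actions) for q are mine), the same quantity can be computed with NumPy broadcasting:

import numpy as np

def prob_of_max(q):
    # q: (N, batch_size, num_actions) particle estimates, as in the loop above.
    # Returns probs of shape (batch_size, num_actions): the estimated
    # probability that each action attains the maximum value.
    N, B, A = q.shape
    qt = q.transpose(1, 2, 0)                    # (B, A, N)
    # M[b, i, k, j] = fraction of particles of action j that are <= the
    # k-th particle of action i (empirical CDF of action j at that particle).
    M = (qt[:, None, None, :, :] <= qt[:, :, :, None, None]).mean(axis=-1)
    idx = np.arange(A)
    M[:, idx, :, idx] = 1.0                      # exclude j == i from the product
    p2 = M.prod(axis=-1)                         # (B, A, N)
    return p2.mean(axis=-1)                      # (B, A)

The final loop of _next_q then mixes the particles of all actions with these probabilities, i.e. max_q[b] = sum_a q[:, b, a] * probs[b, a].
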
Example no. 10
0
class DQN_agent(Agent):
    """
    DQN agent implementation
    """
    def __init__(self,
                 image_params,
                 nb_action,
                 logger,
                 features=['health'],
                 variables=['ENNEMY'],
                 nb_dense=128,
                 optimizer_params={
                     'type': 'rmsprop',
                     'lr': 0.00002,
                     'clipvalue': 1
                 },
                 batch_size=64,
                 replay_memory={
                     'max_size': 10000,
                     'screen_shape': (84, 84)
                 },
                 decrease_eps=lambda epi: 0.05,
                 step_btw_train=64,
                 step_btw_save=2000,
                 depth=4,
                 episode_time=800,
                 frame_skip=4,
                 discount_factor=0.99):
        self.logger = logger
        self.batch_size = batch_size
        self.nb_action = nb_action
        self.replay_memory_p = replay_memory
        self.image_params = image_params
        self.nb_action = nb_action
        self.nb_dense = nb_dense
        self.optimizer_params = optimizer_params
        self.online_network = self.create_network(image_params, nb_dense,
                                                  nb_action, optimizer_params)
        self.target_network = self.online_network
        self.decrease_eps = decrease_eps
        self.step_btw_train = step_btw_train
        self.step_btw_save = step_btw_save

        self.features = features
        self.variables = variables
        self.image_size = replay_memory['screen_shape'][:2]
        self.depth = depth
        self.episode_time = episode_time
        self.frame_skip = frame_skip
        self.discount_factor = discount_factor

    def act_opt(self, eps, input_screen):
        """
        Choose action according to the eps-greedy policy using the network for inference
        Inputs : 
            eps : eps parameter for the eps-greedy policy
            goal : column vector encoding the goal for each timesteps and each measures
            screen : raw input from the game
            game_features : raw features from the game
        Returns an action coded by an integer
        """
        # eps-greedy policy used for exploration (if want full exploitation, just set eps to 0)
        if np.random.rand() < eps or input_screen.shape[-1] < self.depth:
            # not enough frames stacked yet, or exploration step: act randomly
            self.logger.info('input_screen shape is {}'.format(
                input_screen.shape))
            action = np.random.randint(0, self.nb_action)
            self.logger.info('random action : {}'.format(action))
        else:
            # use trained network to choose action
            #            print('using network')
            #            print('input dim : {}'.format(input_screen[None,:,:,:].shape))
            pred_q = self.online_network.predict(input_screen[None, :, :, :])
            self.logger.info('q values are : {}'.format(pred_q))
            action = np.argmax(pred_q)
            self.logger.info('opt action : {}'.format(action))
        return action

    def read_input_state(self,
                         screen,
                         last_states,
                         after=False,
                         MAX_RANGE=255.):
        """
        Use grey level image and specific image definition and stacked frames
        """
        screen_process = screen
        if len(screen.shape) == 3:
            if screen.shape[-1] != 3:
                screen = np.moveaxis(screen, 0, -1)
            screen_process = cv2.cvtColor(screen, cv2.COLOR_BGR2GRAY)
        input_screen = cv2.resize(screen_process, self.image_size)
        input_screen = input_screen / MAX_RANGE
        screen = np.stack(last_states[-(self.depth - 1):] + [input_screen],
                          axis=-1)
        if not after:
            last_states.append(input_screen)
            return screen
        else:
            return input_screen

    def train(self, map_id, experiment, nb_episodes):
        # variables
        nb_all_steps = 0
        self.list_reward_collected = []
        self.list_reward = []
        self.loss = []
        # create game from experiment
        experiment.start(map_id=map_id,
                         episode_time=self.episode_time,
                         log_events=False)

        # create replay memory
        self.replay_mem = ReplayMemory(self.replay_memory_p['max_size'],
                                       self.replay_memory_p['screen_shape'],
                                       type_network='DQN')

        # run the game
        for episode in range(nb_episodes):
            print('episode {}'.format(episode))
            self.logger.info('episode {}'.format(episode))

            if episode == 0:
                experiment.new_episode()
            else:
                experiment.reset()
                self.list_reward_collected.append(reward_collected)
                self.logger.info('steps elapsed in last episode is {}'.format(nb_step))
                print('reward collected is {}'.format(reward_collected))
                self.logger.info('last episode reward collected is {}'.format(
                    reward_collected))
            last_states = []
            reward_collected = 0
            nb_step = 0

            # decrease eps according to a fixed policy
            eps = self.decrease_eps(episode)
            self.logger.info('eps for episode {} is {}'.format(episode, eps))

            while not experiment.is_final():
                #                print(nb_step)
                # get screen and features from the game
                screen, variables, game_features = experiment.observe_state(
                    self.variables, self.features)

                # choose action
                input_screen = self.read_input_state(screen, last_states)
                action = self.act_opt(eps, input_screen)

                # make action and observe resulting state
                r, screen_next, variables_next, game_features_next = experiment.make_action(
                    action, self.variables, self.features, self.frame_skip)
                reward_collected += (self.discount_factor**nb_step) * r
                self.list_reward.append(r)
                if not experiment.is_final():
                    input_screen_next = self.read_input_state(
                        screen_next, last_states, True)
                else:
                    input_screen_next = None

                # save last processed screens / action in the replay memory
                self.replay_mem.add(screen1=last_states[-1],
                                    action=action,
                                    reward=r,
                                    is_final=experiment.is_final(),
                                    screen2=input_screen_next)

                # train network
                if nb_all_steps > self.depth - 1:
                    loss = self.train_network()
                    self.loss.append(loss)

                # periodically sync the target network with the online network
                if (nb_all_steps % self.step_btw_train
                        == 0) and nb_step > self.depth - 1:
                    print('updating target network')
                    self.logger.info('updating target network')
                    self.target_network = self.create_network(
                        self.image_params, self.nb_dense, self.nb_action,
                        self.optimizer_params)
                    weight = self.online_network.get_weights()
                    self.target_network.set_weights(weight)

                # count nb of step since start
                nb_step += 1
                nb_all_steps += 1

            # periodically save training stats and the collected rewards
            if (episode % self.step_btw_save == 0) and (episode > 0):
                print('saving params')
                self.logger.info('saving params')
                saving_stats(episode, experiment.stats, self.online_network,
                             'DQN_{}'.format(experiment.scenario))
                with open('DQN_list_reward_eps_{}'.format(nb_all_steps),
                          'wb') as fp:
                    pickle.dump(self.list_reward_collected, fp)

    def test(self, map_id, experiment, nb_episodes):
        """
        Test the trained bot
        """
        # variables
        nb_step = 0

        # create game from experiment
        experiment.start(map_id=map_id,
                         episode_time=self.episode_time,
                         log_events=False)

        for episode in range(nb_episodes):
            print('episode {}'.format(episode))
            if episode == 0:
                experiment.new_episode()
            else:
                experiment.reset()
            last_states = []

            while not experiment.is_final():
                #                print(nb_step)
                # get screen and features from the game
                screen, variables, game_features = experiment.observe_state(
                    self.variables, self.features)

                # decrease eps according to a fixed policy
                eps = self.decrease_eps(episode)

                # choose action
                input_screen = self.read_input_state(screen, last_states)
                #                print(input_screen.shape)
                action = self.act_opt(eps, input_screen)

                # make action and observe resulting state
                r, screen_next, variables_next, game_features_next = experiment.make_action(
                    action, self.variables, self.features, self.frame_skip)

                # count nb of step since start
                nb_step += 1

    def train_network(self):
        """
        Sample from the replay memory and trained the network with a simple batch on 
        these samples
        """
        batch = self.replay_mem.get_batch(self.batch_size, 3)
        input_screen1 = np.moveaxis(batch['screens1'], 1, -1)
        input_screen2 = np.moveaxis(batch['screens2'], 1, -1)
        reward = batch['rewards'][:, -1]
        isfinal = batch['isfinal'][:, -1]
        action = batch['actions'][:, -1]

        # compute target values
        q2 = np.max(self.target_network.predict(input_screen2), axis=1)
        #        print('q2 shape is {}'.format(q2.shape))
        target_q = self.online_network.predict(input_screen1)
        #        print('tq shape is {}'.format(target_q.shape))
        target_q[range(target_q.shape[0]),
                 action] = reward + self.discount_factor * (1 - isfinal) * q2

        # compute the gradient and update the weights
        loss = self.online_network.train_on_batch(input_screen1, target_q)

        return loss

    @staticmethod
    def create_network(image_params, nb_dense, nb_actions, optimizer_params):
        """
        create DQN network as described in paper from Mnih & al
        """
        # parse network inputs parameters
        screen_input_size, s1, s2, s3 = parse_image_params_dqn(image_params)

        # Define optimizer
        optimizer = get_optimizer(optimizer_params)

        # build network
        model = Sequential()
        model.add(
            Conv2D(s1['channel'], (s1['kernel'], s1['kernel']),
                   strides=(s1['stride'], s1['stride']),
                   input_shape=screen_input_size))  #84*84*4
        model.add(Activation('relu'))
        model.add(
            Conv2D(s2['channel'], (s2['kernel'], s2['kernel']),
                   strides=(s2['stride'], s2['stride'])))
        model.add(Activation('relu'))
        model.add(
            Conv2D(s3['channel'], (s3['kernel'], s3['kernel']),
                   strides=(s3['stride'], s3['stride'])))
        model.add(Activation('relu'))
        model.add(Flatten())
        model.add(Dense(nb_dense))
        model.add(Activation('relu'))
        model.add(Dense(nb_actions))

        # compile model
        model.compile(loss='mse', optimizer=optimizer)

        return model
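
The Bellman target built in train_network above only overwrites the Q-value of the action that was actually taken, which is what lets a plain MSE loss train the online network. A minimal self-contained sketch of that target construction, using toy NumPy arrays (all names and values below are illustrative, not taken from the original project):

import numpy as np

rng = np.random.default_rng(0)
batch_size, nb_actions, gamma = 4, 3, 0.99

q_online = rng.normal(size=(batch_size, nb_actions))        # online net output for s
q_target_next = rng.normal(size=(batch_size, nb_actions))   # target net output for s'
reward = rng.normal(size=batch_size)
isfinal = np.array([0., 0., 1., 0.])                        # 1 where the episode ended
action = np.array([0, 2, 1, 1])                             # actions taken in the batch

q2 = q_target_next.max(axis=1)
target_q = q_online.copy()
target_q[np.arange(batch_size), action] = reward + gamma * (1. - isfinal) * q2
# Fitting the online net on (state, target_q) with an MSE loss then only moves
# the Q-value of the taken action towards r + gamma * (1 - isfinal) * max_a' Q'(s', a').
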
Example no. 11
0
File: dqn.py Project: czgdp1807/wql
class DQN(Agent):
    """
    Deep Q-Network algorithm.
    "Human-Level Control Through Deep Reinforcement Learning".
    Mnih V. et al., 2015.
    """
    def __init__(self,
                 approximator,
                 policy,
                 mdp_info,
                 batch_size,
                 initial_replay_size,
                 max_replay_size,
                 approximator_params,
                 target_update_frequency,
                 fit_params=None,
                 n_approximators=1,
                 clip_reward=True):
        """
        Constructor.
        Args:
            approximator (object): the approximator to use to fit the
               Q-function;
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            approximator_params (dict): parameters of the approximator to
                build;
            target_update_frequency (int): the number of samples collected
                between each update of the target network;
            fit_params (dict, None): parameters of the fitting algorithm of the
                approximator;
            n_approximators (int, 1): the number of approximators to use in
                ``AverageDQN``;
            clip_reward (bool, True): whether to clip the reward or not.
        """
        self._fit_params = dict() if fit_params is None else fit_params

        self._batch_size = batch_size
        self._n_approximators = n_approximators
        self._clip_reward = clip_reward
        self._target_update_frequency = target_update_frequency

        self._replay_memory = ReplayMemory(initial_replay_size,
                                           max_replay_size)

        self._n_updates = 0

        apprx_params_train = deepcopy(approximator_params)
        apprx_params_train["name"] = "train"
        apprx_params_target = deepcopy(approximator_params)
        apprx_params_target["name"] = "target"
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             n_models=self._n_approximators,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        if self._n_approximators == 1:
            self.target_approximator.model.set_weights(
                self.approximator.model.get_weights())
        else:
            for i in range(self._n_approximators):
                self.target_approximator.model[i].set_weights(
                    self.approximator.model.get_weights())

        super().__init__(policy, mdp_info)

    def fit(self, dataset):
        mask = np.ones((len(dataset), self._n_approximators))
        self._replay_memory.add(dataset, mask)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _, mask =\
                self._replay_memory.get(self._batch_size)

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self.approximator.fit(state, action, q, **self._fit_params)

            self._n_updates += 1

            if self._n_updates % self._target_update_frequency == 0:
                self._update_target()

    def _update_target(self):
        """
        Update the target network.
        """
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.
        Returns:
            Maximum action-value for each state in ``next_state``.
        """
        q = self.target_approximator.predict(next_state)
        if np.any(absorbing):
            q *= 1 - absorbing.reshape(-1, 1)

        return np.max(q, axis=1)

    def draw_action(self, state):
        action = super(DQN, self).draw_action(np.array(state))

        return action
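
Compared with the first example above, which draws a Bernoulli mask so that each transition only updates a random subset of the bootstrapped heads, the fit() here passes an all-ones mask: every approximator sees every transition. A short illustrative sketch of the difference (NumPy only; the sizes are arbitrary):

import numpy as np

rng = np.random.default_rng(0)
n_transitions, n_approximators, p_mask = 6, 4, 2 / 3.

# Bootstrapped-style mask: transition t trains head h only where mask[t, h] == 1.
bootstrap_mask = rng.binomial(1, p_mask, size=(n_transitions, n_approximators))

# Plain DQN mask (this example): every head is trained on every transition.
plain_mask = np.ones((n_transitions, n_approximators))
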
Example no. 12
0
        q_values = online_q_values.eval(
            feed_dict={X_state: [history.get()]})[0]
        action = epsilon_greedy(q_values, step)

        # Online DQN plays
        obs, reward, done, info = env.step(action)
        next_state = preprocess_observation(obs)

        # Reward clipping
        reward = max(min_reward, min(max_reward, reward))

        # Update history
        history.add(next_state)

        # Let's memorize what happened
        replay_memory.add(next_state, reward, action, done)
        state = next_state
        current_rewards.append(reward)

        if args.test:
            continue

        # Compute statistics for tracking progress (not shown in the book)
        total_max_q += q_values.max()
        game_length += 1
        if done:
            mean_max_q = total_max_q / game_length
            total_max_q = 0.0
            game_length = 0

        if iteration < training_start or iteration % args.learn_iterations != 0:
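
The reward clipping in this fragment, max(min_reward, min(max_reward, reward)), is a plain scalar clamp. Assuming the usual DQN values min_reward = -1 and max_reward = 1 (an assumption; the fragment does not show them), it is equivalent to np.clip:

import numpy as np

min_reward, max_reward = -1.0, 1.0   # assumed values, as in standard DQN reward clipping
for reward in (-3.2, -0.4, 0.0, 0.7, 12.0):
    clamped = max(min_reward, min(max_reward, reward))
    assert clamped == np.clip(reward, min_reward, max_reward)
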
Example no. 13
0
class Agent(BaseModel):
    def __init__(self, config, environment, sess):
        super(Agent, self).__init__(config)

        # environment
        self.env = environment
        self.action_size = self.env.action_size

        # memory & history
        self.memory = ReplayMemory(self.config)
        self.history = History(self.config)

        # Session
        self.sess = sess

        self.build_dqn()

    def build_dqn(self):
        self.w = {}
        self.t_w = {}

        # build graph & ops
        initializer = tf.truncated_normal_initializer(0, 0.02)
        activation_fn = tf.nn.relu

        with tf.variable_scope('prediction'):
            self.s_t = tf.placeholder('float32', [None, self.screen_height, self.screen_width, self.history_length], name='s_t')

            self.l1, self.w['l1_w'], self.w['l1_b'] = conv2d(self.s_t, 32, [8, 8], [4, 4], initializer, activation_fn, name='l1')
            self.l2, self.w['l2_w'], self.w['l2_b'] = conv2d(self.l1, 64, [4, 4], [2, 2], initializer, activation_fn, name='l2')
            self.l3, self.w['l3_w'], self.w['l3_b'] = conv2d(self.l2, 64, [3, 3], [1, 1], initializer, activation_fn, name='l3')

            l3_shape = self.l3.get_shape().as_list()
            self.l3_flat = tf.reshape(self.l3, [-1, reduce(lambda x, y: x * y, l3_shape[1:])])

            self.l4, self.w['l4_w'], self.w['l4_b'] = linear(self.l3_flat, 512, activation_fn=activation_fn, name='l4')
            self.q, self.w['q_w'], self.w['q_b'] = linear(self.l4, self.action_size, name='q')

            self.q_action = tf.argmax(self.q, dimension=1)

        with tf.variable_scope('target'):
            self.target_s_t = tf.placeholder('float32', \
                [None, self.screen_height, self.screen_width, self.history_length], name='target_s_t')

            self.target_l1, self.t_w['l1_w'], self.t_w['l1_b'] = \
                conv2d(self.target_s_t, 32, [8, 8], [4, 4], initializer, activation_fn, name='t_l1')
            self.target_l2, self.t_w['l2_w'], self.t_w['l2_b'] = \
                conv2d(self.target_l1, 64, [4, 4], [2, 2], initializer, activation_fn, name='t_l2')
            self.target_l3, self.t_w['l3_w'], self.t_w['l3_b'] = \
                conv2d(self.target_l2, 64, [3, 3], [1, 1], initializer, activation_fn, name='t_l3')

            target_l3_shape = self.target_l3.get_shape().as_list()
            self.target_l3_flat = tf.reshape(self.target_l3, [-1, reduce(lambda x, y: x * y, target_l3_shape[1:])])

            self.target_l4, self.t_w['l4_w'], self.t_w['l4_b'] = \
                linear(self.target_l3_flat, 512, activation_fn=activation_fn, name='t_l4')
            self.target_q, self.t_w['q_w'], self.t_w['q_b'] = \
                linear(self.target_l4, self.action_size, name='t_q')

        with tf.variable_scope('optimizer'):
            self.target_q_t = tf.placeholder('float32', [None], name='target_q_t')
            self.action = tf.placeholder('int64', [None], name='action')

            action_one_hot = tf.one_hot(self.action, self.action_size, 1.0, 0.0, name='action_one_hot')
            q_acted = tf.reduce_sum(self.q * action_one_hot, reduction_indices=1, name='q_acted')

            self.delta = self.target_q_t - q_acted
            clipped_delta = tf.clip_by_value(self.delta, self.min_delta, self.max_delta, name='clipped_delta')

            self.loss = tf.reduce_mean(tf.square(clipped_delta), name='loss')
            self.optm = tf.train.AdamOptimizer(1e-4).minimize(self.loss)

        with tf.variable_scope("update_target"):
            self.t_w_input = {}
            self.t_w_assign_op = {}

            for name in self.w.keys():
                self.t_w_input[name] = tf.placeholder('float32', self.t_w[name].get_shape().as_list(), name=name)
                self.t_w_assign_op[name] = self.t_w[name].assign(self.t_w_input[name])

        with tf.variable_scope('summary'):
            scalar_summary_tags = ['episode_max_reward', 'episode_min_reward', 'episode_avg_reward', \
                                   'average_reward', 'average_loss', 'average_q']

            self.summary_placeholders = {}
            self.summary_ops = {}

            for tag in scalar_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder('float32', None, name=tag)
                self.summary_ops[tag] = tf.scalar_summary("%s/%s" % (self.env_name, tag), self.summary_placeholders[tag])

            self.writer = tf.train.SummaryWriter(tmpLogDir(), self.sess.graph)

        self.sess.run(tf.initialize_all_variables())
        self.update_target_q_network()

    def predict(self, s_t):
        if random.random() <  self.epsilon:
            action = random.randrange(self.env.action_size)
        else:
            action = self.sess.run(self.q_action, feed_dict={self.s_t: [s_t]})

        return action

    def observe(self, screen, reward, action, terminal):

        self.history.add(screen)
        self.memory.add(screen, reward, action, terminal)

        if self.step > self.learn_start:
            if self.step % self.train_frequency == self.train_frequency - 1:
                self.q_learning()

            if self.step % self.target_q_update_step == self.target_q_update_step - 1:
                self.update_target_q_network()

    def q_learning(self):
        if self.memory.total_count < self.history_length:
            return

        s_t, action, reward, s_t_plus1, terminal = self.memory.sample()

        t_q_plus_1 = self.sess.run(self.target_q, feed_dict={self.target_s_t: s_t_plus1})

        terminal = terminal + 0.
        max_q_t_plus_1 = np.max(t_q_plus_1, axis=1)
        target_q_t = (1. - terminal) * self.discount * max_q_t_plus_1 + reward

        _, q_t, loss = self.sess.run([self.optm, self.q, self.loss], feed_dict={
            self.target_q_t : target_q_t,
            self.action : action,
            self.s_t : s_t
        })

        self.total_loss += loss
        self.total_q += q_t.mean()
        self.update_count += 1

    def update_target_q_network(self):
        for name in self.w.keys():
            self.t_w_assign_op[name].eval({self.t_w_input[name] : self.w[name].eval(session=self.sess)}, session=self.sess)

    def train(self):
        num_game, update_count, ep_reward = 0, 0, 0.
        total_reward, self.total_loss, self.total_q = 0., 0., 0.
        ep_rewards = []

        screen, reward, terminal = self.env.new_game(bRandom=True)

        for _ in range(self.history_length):
            self.history.add(screen)

        for self.step in range(self.train_epoch):
            if self.step == self.learn_start:
                num_game, self.update_count, ep_reward = 0, 0, 0.
                total_reward, self.total_loss, self.total_q = 0., 0., 0.
                ep_rewards = []

            action = self.predict(self.history.get())

            screen, reward, terminal = self.env.act(action)

            self.observe(screen, reward, action, terminal)

            if terminal:
                screen, reward, terminal = self.env.new_game(bRandom=True)

                num_game += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0.
            else:
                ep_reward += reward

            total_reward += reward

            if self.step >= self.learn_start and \
                self.step % self.test_frequency == self.test_frequency - 1:

                avg_reward = total_reward / self.test_frequency
                avg_loss = self.total_loss / self.update_count
                avg_q = self.total_q / self.update_count

                try:
                    max_ep_reward = np.max(ep_rewards)
                    min_ep_reward = np.min(ep_rewards)
                    avg_ep_reward = np.mean(ep_rewards)
                except:
                    max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0

                print "ep_max_reward %.4f, ep_min_reward %.4f, ep_avg_reward %.4f, avg_reward %.4f, avg_loss %.4f, avg_q %.4f " % \
                      (max_ep_reward, min_ep_reward, avg_ep_reward, avg_reward, avg_loss, avg_q)

                self.inject_summary({
                    'episode_max_reward' : max_ep_reward,
                    'episode_min_reward' : min_ep_reward,
                    'episode_avg_reward' : avg_ep_reward,
                    'average_reward' : avg_reward,
                    'average_loss' : avg_loss,
                    'average_q' : avg_q
                }, self.step)

                num_game = 0
                total_reward = 0.
                self.total_loss = 0.
                self.total_q = 0.
                self.update_count = 0
                ep_rewards = []

    def inject_summary(self, tag_dict, step):
        summary_lists = self.sess.run([self.summary_ops[tag] for tag in tag_dict.keys()], \
            {self.summary_placeholders[tag]: value for tag, value in tag_dict.items()
        })

        for summary_str in summary_lists:
            self.writer.add_summary(summary_str, step)
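
Both this agent and the next one keep a rolling stack of the most recent frames as network input; the next example does the update in place with NumPy slicing. A tiny sketch of that rolling-history pattern (shapes are illustrative):

import numpy as np

hist_len, h, w = 4, 84, 84
history = np.zeros((hist_len, h, w), dtype=np.float32)

def push_frame(history, screen):
    # drop the oldest frame and append the newest one, in place
    history[:-1] = history[1:]
    history[-1] = screen

for t in range(6):
    push_frame(history, np.full((h, w), t, dtype=np.float32))
# history now holds frames 2, 3, 4, 5 (the four most recent ones)
assert [int(f[0, 0]) for f in history] == [2, 3, 4, 5]
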
Example no. 14
0
class Agent:
    def __init__(self, config, env, sess):
        self.sess = sess
        self.env = env
        self.env_name = config.env_name
        self.env_type = config.env_type
        self.cnn_format = config.cnn_format
        self.batch_size, self.hist_len,  self.screen_h, self.screen_w = \
                config.batch_size, config.hist_len, config.screen_h, config.screen_w
        self.train_frequency = config.train_frequency
        self.target_q_update_step = config.target_q_update_step
        self.max_step = config.max_step
        self.test_step = config.test_step
        self.learn_start = config.learn_start
        self.min_delta = config.min_delta
        self.max_delta = config.max_delta
        self.learning_rate_minimum = config.learning_rate_minimum
        self.learning_rate = config.learning_rate
        self.learning_rate_decay_step = config.learning_rate_decay_step
        self.learning_rate_decay = config.learning_rate_decay
        self.is_train = config.is_train
        self.display = config.display
        self.double_q = config.double_q
        self.dueling = config.dueling

        if self.is_train:
            self.memory = ReplayMemory(config)
        self.history = np.zeros([self.hist_len, self.screen_h, self.screen_w], dtype=np.float32)

        self.ep_end = config.ep_end
        self.ep_start = config.ep_start
        self.ep_end_t = config.ep_end_t
        self.min_reward = config.min_reward
        self.max_reward = config.max_reward
        self.discount = config.discount

        self.step_op = tf.Variable(0, trainable=False, name='step')
        self.checkpoint_dir = os.path.join('checkpoints/', config.model_dir)
        self.summary_log_path = os.path.join('logs/', config.model_dir)

        self.build_graph()

    def train(self):
        start_step = self.step_op.eval()

        num_game, self.update_count, ep_reward = 0, 0, 0.
        total_reward, self.total_loss, self.total_q = 0., 0., 0.
        max_avg_ep_reward = 0
        ep_rewards, actions = [], []
        screen, reward, action, term = self.env.newRandomGame()

        for i in xrange(self.hist_len):
            self.history[i] = screen

        for self.step in tqdm(xrange(start_step, self.max_step), ncols=70, initial=start_step):
            if self.step == self.learn_start:
                num_game, self.update_count, ep_reward = 0, 0, 0.
                total_reward, self.total_loss, self.total_q = 0., 0., 0.
                max_avg_ep_reward = 0
                ep_rewards, actions = [], []
                #new game? because we start learning from middle of a game episode.

            action = self.predict(self.history)
            screen, reward, term = self.env.act(action)
            self.observe(screen, reward, action, term)

            if term:
                screen, reward, action, term = self.env.newRandomGame()
                num_game += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0.0
            else:
                ep_reward += reward

            actions.append(action)
            total_reward += reward


            if self.step >= self.learn_start:
                if self.step % self.test_step == self.test_step - 1:
                    avg_reward = total_reward / self.test_step
                    avg_loss = self.total_loss / self.update_count  ##total_loss is updated in q_learn_mini_batch
                    avg_q = self.total_q / self.update_count  ##q is updated in q_learn_mini_batch
                    try:
                        max_ep_reward = np.max(ep_rewards)
                        min_ep_reward = np.min(ep_rewards)
                        avg_ep_reward = np.mean(ep_rewards)
                    except:
                        max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0
                    print '\navg_r: %.4f, avg_l: %.6f, avg_q: %3.6f, avg_ep_r: %.4f, max_ep_r: %.4f, min_ep_r: %.4f, # game: %d' \
                            % (avg_reward, avg_loss, avg_q, avg_ep_reward, max_ep_reward, min_ep_reward, num_game)
                    if max_avg_ep_reward * 0.9 <= avg_ep_reward:
                        self.step_op.assign(self.step + 1).eval()
                        self.save_model(self.step + 1)
                        self.memory.save()
                        #self.step_assign_op.eval({self.step_input: self.step + 1})
                        max_avg_ep_reward = max(max_avg_ep_reward, avg_ep_reward)
                    if self.step > 180:
                        #self.learning_rate_op.eval({self.learning_rate_step: self.step})
                        #inject summary
                        self.inject_summary({
                            'avg.reward': avg_reward,
                            'avg.loss': avg_loss,
                            'avg.q': avg_q,
                            'episode.max_reward': max_ep_reward,
                            'episode.min_reward': min_ep_reward,
                            'episode.avg_reward': avg_ep_reward,
                            'episode.num_of_game': num_game,
                            'training.learning_rate': self.learning_rate_op.eval({self.learning_rate_step: self.step}),
                        })
                    num_game, self.update_count, ep_reward = 0, 0, 0.
                    total_reward, self.total_loss, self.total_q = 0., 0., 0.
                    ep_rewards, actions = [], []

    def predict(self, s_t, test_ep=None):
        ep = test_ep or (self.ep_end + max(0., (self.ep_start - self.ep_end) * (self.ep_end_t - max(0., self.step - self.learn_start)) / self.ep_end_t))
        if random.random() < ep:
            action = random.randrange(self.env.action_size)
        else:
            action = self.q_action.eval({self.s_t: [s_t]})[0]
        return action

    def observe(self, screen, reward, action, term):
        #add to history, memory
        #q_learn, update_target_q
        reward = max(self.min_reward, min(self.max_reward, reward))
        self.history[:-1] = self.history[1:]
        self.history[-1] = screen
        self.memory.add(screen, reward, action, term)

        if self.step > self.learn_start:
            if self.step % self.train_frequency == 0:
                self.q_learning_mini_batch()
            if self.step % self.target_q_update_step == self.target_q_update_step - 1:
                self.update_target_q_network()


    def play(self, n_step=10000, n_episode=1):
        gym_dir = './video/%s-%s' % (self.env_name, time.strftime("%Y-%m-%d_%H:%M:%S", time.gmtime()))
        self.env.env.monitor.start(gym_dir)
        test_history = np.zeros([self.hist_len, self.screen_h, self.screen_w], dtype=np.float32)
        best_reward = 0
        for idx in xrange(n_episode):
            self.env.env.reset()
            screen, reward, action, term = self.env.newRandomGame()
            current_reward = 0
            for i in xrange(self.hist_len):
                test_history[i] = screen
            for s in xrange(n_step):
                #action = self.env.action_space_sample()
                action = self.predict(test_history, test_ep=0.05)
                screen, reward, term = self.env.act(action, is_training=False)
                test_history[:-1] = test_history[1:]
                test_history[-1] = screen
                current_reward += reward
                if self.display:
                    self.env.render()
                if term:
                    print 'step: %d' % s
                    break
            best_reward = max(best_reward, current_reward)
            print 'current_reward: %d, best_reward: %d' % (current_reward, best_reward)
        self.env.env.monitor.close()

    def createQNetwork(self, scope_name):
        init = tf.truncated_normal_initializer(0, 0.02)
        activation_fn = tf.nn.relu
        w = {}

        with tf.variable_scope(scope_name):
            if self.cnn_format == 'NHWC':
                s_t = tf.placeholder('float32', 
                        [None, self.screen_h, self.screen_w, self.hist_len], name='s_t')
            else:
                s_t = tf.placeholder('float32',
                        [None, self.hist_len, self.screen_h, self.screen_w], name='s_t')
            l1, w['l1_w'], w['l1_b'] = conv2d(s_t,
                    32, [8,8], [4,4], init, activation_fn, self.cnn_format, name='l1')
            l2, w['l2_w'], w['l2_b'] = conv2d(l1,
                    64, [4,4], [2,2], init, activation_fn, self.cnn_format, name='l2')
            l3, w['l3_w'], w['l3_b'] = conv2d(l2,
                    64, [3,3], [1,1], init, activation_fn, self.cnn_format, name='l3')

            shape = l3.get_shape().as_list()
            l3_flat = tf.reshape(l3, [-1, reduce(lambda x,y: x*y, shape[1:])])

            if self.dueling:
                value_hid, w['l4_w'], w['l4_b'] = linear(l3_flat, 512, activation_fn=activation_fn, name='value_hid')
                adv_hid, w['l4_adv_w'], w['l4_adv_b'] = linear(l3_flat, 512, activation_fn=activation_fn, name='adv_hid')
                value, w['val_w_out'], w['val_b_out'] = linear(value_hid, 1, name='value_out')
                advantage, w['adv_w_out'], w['adv_b_out'] = linear(adv_hid, self.env.action_size, name='adv_out')
                # Average Dueling
                q = value + (advantage - tf.reduce_mean(advantage, reduction_indices=1, keep_dims=True))
            else:
                l4, w['l4_w'], w['l4_b'] = linear(l3_flat, 512, activation_fn=activation_fn, name='l4')
                q, w['q_w'], w['q_b'] = linear(l4, self.env.action_size, name='q')

            return s_t, w, q

    def build_graph(self):
        ###
        self.s_t, self.w, self.q = self.createQNetwork('prediction') ##self.q = max Q value
        self.q_action = tf.argmax(self.q, dimension=1)
        self.target_s_t, self.t_w, self.target_q = self.createQNetwork('target')
        self.target_q_idx = tf.placeholder('int32', [None, None], 'outputs_idx')
        self.target_q_with_idx = tf.gather_nd(self.target_q, self.target_q_idx)

        q_summary = []
        avg_q = tf.reduce_mean(self.q, 0)
        for idx in xrange(self.env.action_size):
            q_summary.append(tf.histogram_summary('q/%s' % idx, avg_q[idx]))
        self.q_summary = tf.merge_summary(q_summary, 'q_summary')

        with tf.variable_scope('optimizer'):
            self.target_q_t = tf.placeholder('float32', [None], name='target_q_t')
            self.action = tf.placeholder('int64', [None], name='action')

            action_one_hot = tf.one_hot(self.action, self.env.action_size, 1.0, 0.0, name='action_one_hot')
            q_acted = tf.reduce_sum(self.q * action_one_hot, reduction_indices=1, name='q_acted')

            self.delta = self.target_q_t - q_acted
            self.clipped_delta = tf.clip_by_value(self.delta, self.min_delta, self.max_delta, name='clipped_delta')
            self.global_step = tf.Variable(0, trainable=False)

            self.loss = tf.reduce_mean(tf.square(self.clipped_delta), name='loss')
            self.learning_rate_step = tf.placeholder('int64', None, name='learning_rate_step')
            self.learning_rate_op = tf.maximum(self.learning_rate_minimum,
                    tf.train.exponential_decay(
                        self.learning_rate,
                        self.learning_rate_step,
                        self.learning_rate_decay_step,
                        self.learning_rate_decay,
                        staircase=True))
            self.optim = tf.train.RMSPropOptimizer(
                    self.learning_rate_op, momentum=0.95, epsilon=0.01).minimize(self.loss)
            #self.optim = tf.train.RMSPropOptimizer(0.00025).minimize(self.loss)
            #self.optim = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(self.loss)

        with tf.variable_scope('summary'):
            scalar_summary_tags = ['avg.reward', 'avg.loss', 'avg.q', \
                    'episode.max_reward', 'episode.min_reward', 'episode.avg_reward', \
                    'episode.num_of_game', 'training.learning_rate']
            self.summary_placeholders = {}
            self.summary_ops = {}
            for tag in scalar_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder('float32', None, name=tag)
                self.summary_ops[tag] = tf.scalar_summary("%s-%s/%s" % \
                        (self.env_name, self.env_type, tag), self.summary_placeholders[tag])

            hist_summary_tags = ['episode.rewards', 'episode.actions']
            for tag in hist_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder('float32', None, name=tag)
                self.summary_ops[tag] = tf.histogram_summary(tag, self.summary_placeholders[tag])


            self.writer = tf.train.SummaryWriter(self.summary_log_path, self.sess.graph)

        tf.initialize_all_variables().run()
        self.saver = tf.train.Saver(self.w.values() + [self.step_op], max_to_keep=30)
        self.load_model()
        if self.is_train:
            self.memory.load()
        self.update_target_q_network()

    def inject_summary(self, tag_dict):
        print 'inject summary!'
        summary_str_lists = self.sess.run([self.summary_ops[tag] for tag in tag_dict.keys()], {
            self.summary_placeholders[tag]: value for tag, value in tag_dict.items()
        })
        for summary_str in summary_str_lists:
            self.writer.add_summary(summary_str, self.step)

    def load_model(self):
        print ("[*] Loading checkpoints...")
        ckpt = tf.train.get_checkpoint_state(self.checkpoint_dir)
        if ckpt and ckpt.model_checkpoint_path:
            ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
            fname = os.path.join(self.checkpoint_dir, ckpt_name)
            self.saver.restore(self.sess, fname)
            print ("[*] Load SUCCESS: %s" % fname)
            return True
        else:
            print ("[*] Load FAILED: %s" % self.checkpoint_dir)
            return False

    def save_model(self, step):
        print ("[*] Saving checkpoints...")
        model_name = type(self).__name__
        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)
        self.saver.save(self.sess, self.checkpoint_dir, global_step=step)

    def q_learning_mini_batch(self):
        if self.memory.count < self.hist_len:
            return
        else:
            s_t, action, reward, s_t_plus_1, term = self.memory.sample()

        if self.double_q:
            pred_action = self.q_action.eval({self.s_t: s_t_plus_1})
            term = np.array(term) + 0.0
            q_t_plus_1_with_pred_action = self.target_q_with_idx.eval({
                self.target_s_t: s_t_plus_1,
                self.target_q_idx: [[idx, pred_a] for idx, pred_a in enumerate(pred_action)]
            })
            target_q_t = (1 - term) * self.discount * q_t_plus_1_with_pred_action + reward
        else:
            q_t_plus_1 = self.target_q.eval({self.target_s_t: s_t_plus_1})
            term = np.array(term) + 0.0
            max_q_t_plus_1 = np.max(q_t_plus_1, axis=1)
            target_q_t = (1 - term) * self.discount * max_q_t_plus_1 + reward

        _, q_t, loss, summary_str = self.sess.run([self.optim, self.q, self.loss, self.q_summary], {
            self.s_t: s_t,
            self.target_q_t: target_q_t,
            self.action: action,
            self.learning_rate_step: self.step,
        })

        self.writer.add_summary(summary_str, self.step)
        self.total_loss += loss
        self.total_q += q_t.mean()
        self.update_count += 1

    def update_target_q_network(self):
        print "update target network!"
        for name in self.w.keys():
            self.t_w[name].assign(self.w[name]).eval()
            #self.t_w_assign_op[name].eval({self.t_w_input[name]: self.w[name].eval()})

    '''def create_copy_op(self):
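
q_learning_mini_batch above switches between the vanilla DQN target and the double-DQN target, where the online network selects the argmax action and the target network evaluates it. A minimal NumPy sketch of the two targets side by side (toy arrays, illustrative names):

import numpy as np

rng = np.random.default_rng(1)
batch_size, n_actions, discount = 4, 3, 0.99

q_online_next = rng.normal(size=(batch_size, n_actions))   # online net at s_{t+1}
q_target_next = rng.normal(size=(batch_size, n_actions))   # target net at s_{t+1}
reward = rng.normal(size=batch_size)
term = np.array([0., 1., 0., 0.])

# vanilla DQN: the target net both selects and evaluates the next action
target_vanilla = (1 - term) * discount * q_target_next.max(axis=1) + reward

# double DQN: the online net selects, the target net evaluates
pred_action = q_online_next.argmax(axis=1)
target_double = (1 - term) * discount * \
    q_target_next[np.arange(batch_size), pred_action] + reward
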
Example no. 15
0
class MADDPG:
    def __init__(self, n_agents, state_size, action_size, seed=299):
        self.seed = random.seed(seed)
        self.n_agents = n_agents
        self.action_size = action_size
        self.batch_size = BATCH_SIZE
        self.t_step = 0  # counter for activating learning every few steps

        self.actors_local = [
            Actor(state_size, action_size, seed).to(device)
            for _ in range(n_agents)
        ]
        self.actors_optimizer = [
            optim.Adam(x.parameters(), lr=LR_ACTOR) for x in self.actors_local
        ]

        self.critics_local = [
            Critic(state_size, action_size, n_agents, seed).to(device)
            for _ in range(n_agents)
        ]
        self.critics_optimizer = [
            optim.Adam(x.parameters(), lr=LR_CRITIC)
            for x in self.critics_local
        ]

        self.actors_target = [
            Actor(state_size, action_size, seed).to(device)
            for _ in range(n_agents)
        ]
        self.critics_target = [
            Critic(state_size, action_size, n_agents, seed).to(device)
            for _ in range(n_agents)
        ]

        self.var = [VAR for _ in range(n_agents)
                    ]  # variance for action exploration
        self.memory = ReplayMemory(BUFFER_SIZE, BATCH_SIZE)

    def act(self, all_states, mode='train'):
        """
		:param
			all_states (n_agents, state_size) (numpy): states of all agents
			mode (string): 'test' or 'train' mode
		:return:
			actions (n_agents, action_size) (numpy): actions of all agents
		"""
        actions = np.zeros((self.n_agents, self.action_size))

        for i in range(self.n_agents):
            state = torch.from_numpy(
                all_states[i, :]).unsqueeze(0).float().to(device)

            self.actors_local[i].eval()
            with torch.no_grad():
                act = self.actors_local[i](state).squeeze().cpu().data.numpy()
            self.actors_local[i].train()

            if mode == 'test':
                act = np.clip(act, -1, 1)

            if mode == 'train':
                noise = np.random.randn(self.action_size) * self.var[i]
                act = act + noise
                act = np.clip(act, -1, 1)
                if self.var[i] > 0.05:
                    self.var[
                        i] *= 0.999998  # decrease the noise variance after each step

            actions[i, :] = act

        return actions

    def step(self, states, actions, rewards, next_states, dones):
        self.memory.add(states, actions, rewards, next_states, dones)

        # activate learning every few steps
        self.t_step = self.t_step + 1
        if self.t_step % LEARN_EVERY_STEP == 0:
            if len(self.memory) > BATCH_SIZE:
                for _ in range(LEARN_REPEAT):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def learn(self, experiences, gamma):

        b_a_states, b_a_actions, b_a_next_states, b_rewards, b_dones = experiences

        all_states = b_a_states.view(self.batch_size,
                                     -1)  # (batch_size, all_obs)
        all_next_states = b_a_next_states.view(
            self.batch_size, -1)  # (batch_size, all_next_obs)
        all_actions = b_a_actions.view(self.batch_size,
                                       -1)  # (batch_size, all_act)

        # Get predicted next-state actions and Q values from target models
        for i in range(self.n_agents):
            # ---------------------------- update critic ---------------------------- #
            b_a_next_actions = [
                self.actors_target[k](b_a_next_states[:, k, :].squeeze(1))
                for k in range(self.n_agents)
            ]  # (n_agents, batch_size, action_size)

            b_a_next_actions = torch.stack(b_a_next_actions).float().to(device)

            b_a_next_actions = b_a_next_actions.permute(
                1, 0, 2)  # (batch_size, n_agents, action_size)
            all_next_actions = b_a_next_actions.contiguous().view(
                self.batch_size, -1)
            Q_targets_next = self.critics_target[i](
                all_next_states, all_next_actions)  # (batch_size, 1)

            # Compute Q targets for current states (y_i)
            Q_targets = b_rewards[:,
                                  i] + (gamma * Q_targets_next *
                                        (1 - b_dones[:, i]))  # (batch_size, 1)

            # Compute critic loss
            Q_expected = self.critics_local[i](all_states,
                                               all_actions)  # (batch_size, 1)
            critic_loss = F.mse_loss(Q_expected, Q_targets)

            # Minimize the loss
            self.critics_optimizer[i].zero_grad()
            critic_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.critics_local[i].parameters(),
                                           1)
            self.critics_optimizer[i].step()

            # ------------------- update actor ------------------- #
            # Compute actor loss
            actions_pred = self.actors_local[i](
                b_a_states[:, i, :].squeeze(1))  # ( batch_size, action_size)
            new_b_a_actions = b_a_actions.clone()  # 'clone' creates a tensor on the same device
            new_b_a_actions[:, i, :] = actions_pred
            new_all_actions = new_b_a_actions.view(self.batch_size, -1)
            actor_loss = -self.critics_local[i](
                all_states, new_all_actions).mean()  # (batch_size, 1)

            # Minimize the loss
            self.actors_optimizer[i].zero_grad()
            actor_loss.backward()
            self.actors_optimizer[i].step()

            # ------------------- update target network ------------------- #
            self.soft_update(self.critics_local[i], self.critics_target[i],
                             TAU)
            self.soft_update(self.actors_local[i], self.actors_target[i], TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
		θ_target = τ*θ_local + (1 - τ)*θ_target
		Params
		======
			local_model: PyTorch model (weights will be copied from)
			target_model: PyTorch model (weights will be copied to)
			tau (float): interpolation parameter
		"""
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
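
The soft_update above moves each target parameter a fraction tau of the way towards the corresponding local parameter at every call, so against a fixed local network k updates close the initial gap by a factor 1 - (1 - tau)^k. A small numeric check of that claim on a scalar parameter:

# theta_target <- tau * theta_local + (1 - tau) * theta_target, repeated k times
tau, k = 1e-3, 1000
theta_local, theta_target = 1.0, 0.0
for _ in range(k):
    theta_target = tau * theta_local + (1 - tau) * theta_target
# after k steps the remaining gap is (1 - tau)^k of the initial one
assert abs(theta_target - (1 - (1 - tau) ** k)) < 1e-12
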
Example no. 16
0
class Driver(object):
    '''
    A driver object for the SCRC
    '''

    def __init__(self, args):
        '''Constructor'''
        self.WARM_UP = 0
        self.QUALIFYING = 1
        self.RACE = 2
        self.UNKNOWN = 3
        self.stage = args.stage
        
        self.parser = msgParser.MsgParser()
        self.state = carState.CarState()
        self.control = carControl.CarControl()

        self.steers = [-1.0, -0.8, -0.6, -0.5, -0.4, -0.3, -0.2, -0.15, -0.1, -0.05, 0.0, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 1.0]
        self.speeds = [-1.0, -0.5, 0.0, 0.5, 1.0]
        self.num_inputs = 19
        self.num_steers = len(self.steers)
        self.num_speeds = len(self.speeds)
        self.num_actions = self.num_steers + self.num_speeds
        
        self.net = DeepQNetwork(self.num_inputs, self.num_steers, self.num_speeds, args)
        self.mem = ReplayMemory(args.replay_size, self.num_inputs, args)
        self.minibatch_size = args.batch_size

        if args.load_weights:
            self.net.load_weights(args.load_weights)
        self.save_weights_prefix = args.save_weights_prefix
        self.pretrained_network = args.pretrained_network

        self.steer_lock = 0.785398
        self.max_speed = 100

        self.algorithm = args.algorithm
        self.device = args.device
        self.mode = args.mode
        self.maxwheelsteps = args.maxwheelsteps
        
        self.enable_training = args.enable_training
        self.enable_exploration = args.enable_exploration

        self.total_train_steps = 0
        self.exploration_decay_steps = args.exploration_decay_steps
        self.exploration_rate_start = args.exploration_rate_start
        self.exploration_rate_end = args.exploration_rate_end

        self.show_sensors = args.show_sensors
        self.show_qvalues = args.show_qvalues

        self.episode = 0
        self.onRestart()
        
        if self.show_sensors:
            from sensorstats import Stats
            self.stats = Stats(inevery=8)
        
        if self.show_qvalues:
            from plotq import PlotQ
            self.plotq = PlotQ(self.num_steers, self.num_speeds)

        if self.device == 'wheel':
            from wheel import Wheel
            self.wheel = Wheel(args.joystick_nr, args.autocenter, args.gain, args.min_force, args.max_force)

    def init(self):
        '''Return init string with rangefinder angles'''
        self.angles = [0 for x in range(19)]
        
        for i in range(5):
            self.angles[i] = -90 + i * 15
            self.angles[18 - i] = 90 - i * 15
        
        for i in range(5, 9):
            self.angles[i] = -20 + (i-5) * 5
            self.angles[18 - i] = 20 - (i-5) * 5
        
        return self.parser.stringify({'init': self.angles})

    def getState(self):
        #state = np.array([self.state.getSpeedX() / 200.0, self.state.getAngle(), self.state.getTrackPos()])
        #state = np.array(self.state.getTrack() + [self.state.getSpeedX()]) / 200.0
        state = np.array(self.state.getTrack()) / 200.0
        assert state.shape == (self.num_inputs,)
        return state

    def getReward(self, terminal):
        if terminal:
            reward = -1000
        else:
            dist = self.state.getDistFromStart()
            if self.prev_dist is not None:
                reward = max(0, dist - self.prev_dist) * 10
                assert reward >= 0, "reward: %f" % reward
            else:
                reward = 0
            self.prev_dist = dist
            
            #reward -= self.state.getTrackPos()
            #print "reward:", reward
        
        return reward

    def getTerminal(self):
        return np.all(np.array(self.state.getTrack()) == -1)

    def getEpsilon(self):
        # calculate decaying exploration rate
        if self.total_train_steps < self.exploration_decay_steps:
            return self.exploration_rate_start - self.total_train_steps * (self.exploration_rate_start - self.exploration_rate_end) / self.exploration_decay_steps
        else:
            return self.exploration_rate_end
 
    def drive(self, msg):
        # parse incoming message
        self.state.setFromMsg(msg)
        
        # show sensors
        if self.show_sensors:
            self.stats.update(self.state)
        
        # fetch state, calculate reward and terminal indicator  
        state = self.getState()
        terminal = self.getTerminal()
        reward = self.getReward(terminal)
        #print "reward:", reward

        # store new experience in replay memory
        if self.enable_training and self.prev_state is not None and self.prev_steer is not None and self.prev_speed is not None:
            self.mem.add(self.prev_state, self.prev_steer, self.prev_speed, reward, state, terminal)

        # if terminal state (out of track), then restart game
        if terminal:
            print "terminal state, restarting"
            self.control.setMeta(1)
            return self.control.toMsg()
        else:
            self.control.setMeta(0)

        # choose actions for wheel and speed
        if self.enable_exploration and random.random() < self.getEpsilon():
            #print "random move"
            steer = random.randrange(self.num_steers)
            #speed = random.randrange(self.num_speeds)
            speed = random.randint(2, self.num_speeds-1)
        elif self.algorithm == 'network':
            # use broadcasting to efficiently produce minibatch of desired size
            minibatch = state + np.zeros((self.minibatch_size, 1))
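            # Broadcasting note: state has shape (num_inputs,) and the zeros
            # array has shape (minibatch_size, 1), so their sum has shape
            # (minibatch_size, num_inputs) -- every row is a copy of the state.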
            Q = self.net.predict(minibatch)
            assert Q.shape == (self.minibatch_size, self.num_actions), "Q.shape: %s" % str(Q.shape)
            #print "steer Q: ", Q[0,:21]
            #print "speed Q:", Q[0,-5:]
            steer = np.argmax(Q[0, :self.num_steers])
            speed = np.argmax(Q[0, -self.num_speeds:])
            if self.show_qvalues:
                self.plotq.update(Q[0])
        elif self.algorithm == 'hardcoded':
            steer = self.getSteerAction(self.steer())
            speed = self.getSpeedActionAccel(self.speed())
        else:
            assert False, "Unknown algorithm"
        #print "steer:", steer, "speed:", speed

        # gears are always automatic
        gear = self.gear()

        # check for manual override 
        # might be partial, so we always need to choose algorithmic actions first
        events = self.wheel.getEvents()
        if self.mode == 'override' and self.wheel.supportsDrive():
            # wheel
            for event in events:
                if self.wheel.isWheelMotion(event):
                    self.wheelsteps = self.maxwheelsteps

            if self.wheelsteps > 0:
                wheel = self.wheel.getWheel()
                steer = self.getSteerAction(wheel)
                self.wheelsteps -= 1

            # gas pedal
            accel = self.wheel.getAccel()
            if accel > 0:
                speed = self.getSpeedActionAccel(accel)
            
            # brake pedal
            brake = self.wheel.getBrake()
            if brake > 0:
                speed = self.getSpeedActionBrake(brake)

        # check for wheel buttons always, not only in override mode
        for event in events:
            if self.wheel.isButtonDown(event, 2):
                self.algorithm = 'network'
                self.mode = 'override'
                self.wheel.generateForce(0)
                print "Switched to network algorithm"
            elif self.wheel.isButtonDown(event, 3):
                self.net.load_weights(self.pretrained_network)
                self.algorithm = 'network'
                self.mode = 'ff'
                self.enable_training = False
                print "Switched to pretrained network"
            elif self.wheel.isButtonDown(event, 4):
                self.enable_training = not self.enable_training
                print "Switched training", "ON" if self.enable_training else "OFF"
            elif self.wheel.isButtonDown(event, 5):
                self.algorithm = 'hardcoded'
                self.mode = 'ff'
                print "Switched to hardcoded algorithm"
            elif self.wheel.isButtonDown(event, 6):
                self.enable_exploration = not self.enable_exploration
                self.mode = 'override'
                self.wheel.generateForce(0)
                print "Switched exploration", "ON" if self.enable_exploration else "OFF"
            elif self.wheel.isButtonDown(event, 7):
                self.mode = 'ff' if self.mode == 'override' else 'override'
                if self.mode == 'override':
                    self.wheel.generateForce(0)
                print "Switched force feedback", "ON" if self.mode == 'ff' else "OFF"
            elif self.wheel.isButtonDown(event, 0) or self.wheel.isButtonDown(event, 8):
                gear = max(-1, gear - 1)
            elif self.wheel.isButtonDown(event, 1) or self.wheel.isButtonDown(event, 9):
                gear = min(6, gear + 1)

        # set actions
        self.setSteerAction(steer)
        self.setGearAction(gear)
        self.setSpeedAction(speed)

        # turn wheel using force feedback
        if self.mode == 'ff' and self.wheel.supportsForceFeedback():
            wheel = self.wheel.getWheel()
            self.wheel.generateForce(self.control.getSteer()-wheel)

        # remember state and actions 
        self.prev_state = state
        self.prev_steer = steer
        self.prev_speed = speed

        # training
        if self.enable_training and self.mem.count >= self.minibatch_size:
            minibatch = self.mem.getMinibatch()
            self.net.train(minibatch)
            self.total_train_steps += 1
            #print "total_train_steps:", self.total_train_steps

        #print "total_train_steps:", self.total_train_steps, "mem_count:", self.mem.count

        return self.control.toMsg()

    def setSteerAction(self, steer):
        self.control.setSteer(self.steers[steer])

    def setGearAction(self, gear):
        assert -1 <= gear <= 6
        self.control.setGear(gear)

    def setSpeedAction(self, speed):
        accel = self.speeds[speed]
        if accel >= 0:
            #print "accel", accel
            self.control.setAccel(accel)
            self.control.setBrake(0)
        else:
            #print "brake", -accel
            self.control.setAccel(0)
            self.control.setBrake(-accel)

    def getSteerAction(self, wheel):
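        # Map a continuous steering value to the index of the nearest discrete
        # steering action (self.steers is assumed to hold the discrete steering
        # values, e.g. evenly spaced in [-1, 1]).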
        steer = np.argmin(np.abs(np.array(self.steers) - wheel))
        return steer

    def getSpeedActionAccel(self, accel):
        speed = np.argmin(np.abs(np.array(self.speeds) - accel))
        return speed

    def getSpeedActionBrake(self, brake):
        speed = np.argmin(np.abs(np.array(self.speeds) + brake))
        return speed

    def steer(self):
        angle = self.state.angle
        dist = self.state.trackPos
        
        steer = (angle - dist*0.5)/self.steer_lock
        return steer
    
    def gear(self):
        rpm = self.state.getRpm()
        gear = self.state.getGear()
        
        if self.prev_rpm is None:
            up = True
        else:
            up = (self.prev_rpm - rpm) < 0
        
        if up and rpm > 7000:
            gear += 1
        
        if not up and rpm < 3000:
            gear -= 1
        
        return gear

    def speed(self):
        speed = self.state.getSpeedX()
        accel = self.prev_accel
        
        if speed < self.max_speed:
            accel += 0.1
            if accel > 1:
                accel = 1.0
        else:
            accel -= 0.1
            if accel < 0:
                accel = 0.0
        
        self.prev_accel = accel
        return accel
        
    def onShutDown(self):
        pass
    
    def onRestart(self):
        if self.mode == 'ff':
            self.wheel.generateForce(0)
    
        self.prev_rpm = None
        self.prev_accel = 0
        self.prev_dist = None
        self.prev_state = None
        self.prev_steer = None
        self.prev_speed = None
        self.wheelsteps = 0

        if self.save_weights_prefix and self.episode > 0:
            self.net.save_weights(self.save_weights_prefix + "_" + str(self.episode) + ".pkl")

        self.episode += 1
        print "Episode", self.episode
Exemplo n.º 17
0
class QAgent:
    """An environment class for open ai gym atari games using the screen.

    Attributes:
        _display : bool
            Display the game visually
        _screen (:obj: 'array', :obj: 'float') : The screen output (rgb)
        _reward (float) : amount of reward achieved by the previous action. 
                          The scale varies between environments, 
                          but the goal is always to increase your total reward.
        _done (bool) : Whether it's time to reset the environment again. 
                       Most (but not all) tasks are divided up into well-defined
                       episodes, and done being True indicates the episode has 
                       terminated.
        _random_start (int) : How long we let the agent take random actions in a
                              new game.
        screen_width (int) : The width of the screen after resizing.
        screen_height (int) : The height of the screen after resizing.
        _action_repeat (int) : The number of time-steps an action is repeated.
        env (:obj:) : The open ai gym environment object
    """
    def __init__(self, params):

        self.params = params  # These are the parameters collected for the agent.

        # Load environment

        self.game = MinesweeperEnvironment(self.params.input_height,
                                           self.params.input_width,
                                           self.params.mines_min,
                                           self.params.mines_max,
                                           self.params.show_game,
                                           self.params.reward_recent_update)

        # Initialize two Q-Value Networks
        # Q-network for training.

        self.dqn_train = DeepQNetwork(params=self.params,
                                      num_actions=self.game.num_actions,
                                      network_name="qnetwork-train",
                                      trainable=True)

        if self.params.is_train:

            # Q-Network for predicting target Q-values
            self.dqn_target = DeepQNetwork(params=self.params,
                                           num_actions=self.game.num_actions,
                                           network_name="qnetwork-target",
                                           trainable=False)

            # Initialize replay memory for storing experience to sample batches from
            self.replay_mem = ReplayMemory(
                self.params.replay_capacity, self.params.history_length,
                self.params.nchannels, self.params.batch_size,
                self.params.input_height, self.params.input_width,
                self.params.game, self.params.memory_checkpoint,
                self.params.restore_memory, self.params.output_dir)

        # Small structure for storing the last four screens
        self.history = ScreenHistory(self.params)

        # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
        self.checkpoint_dir = os.path.abspath(
            os.path.join(self.params.output_dir,
                         "checkpoints_" + self.params.game))
        self.checkpoint_prefix = os.path.join(self.checkpoint_dir, "model")
        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)

        self.train_iteration = 0
        self.count_actions = np.zeros(
            self.game.num_actions)  # Count per action (only greedy)
        self.count_act_random = 0  # Count of random actions
        self.count_act_greedy = 0  # Count of greedy actions
        self.win_rate = 0.0  # For atari

        # Histories of qvalues and loss for running average
        self.qvalues_hist = collections.deque(
            [0] * self.params.interval_summary,
            maxlen=self.params.interval_summary)
        self.loss_hist = collections.deque([10] * self.params.interval_summary,
                                           maxlen=self.params.interval_summary)

        self.epsilon = 0

    def fit(self):

        screen, reward, is_done = self.game.new_game()
        for _ in range(self.params.history_length):
            self.history.add(screen)

        # Initialize the TensorFlow session
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=self.params.gpu_memory)

        with tf.Session(config=tf.ConfigProto(
                gpu_options=gpu_options)) as sess:

            # Initialize the TensorFlow variables
            init = tf.global_variables_initializer()
            sess.run(init)

            # Only save trainable variables and the global iteration to disk
            tf_vars_to_save = tf.trainable_variables() + [
                self.dqn_train.global_iteration
            ]
            saver = tf.train.Saver(tf_vars_to_save, max_to_keep=200)

            if self.params.model_file is not None:
                # Load pre-trained model from disk
                model_path = os.path.join(self.checkpoint_dir,
                                          self.params.model_file)
                saver.restore(sess, model_path)
                self.train_iteration, learning_rate = sess.run([
                    self.dqn_train.global_iteration,
                    self.dqn_train.learning_rate
                ])
                print(
                    "Restarted training from model file. iteration = %06i, Learning Rate = %.5f"
                    % (self.train_iteration, learning_rate))

            # Initialize summary writer
            self.dqn_train.build_summary_writer(sess)

            # Initialize the target Q-Network fixed with the same weights
            update_target_network(sess, "qnetwork-train", "qnetwork-target")

            for iteration in range(
                    self.params.num_iterations
            ):  # Iteration is also how many times we added to replay
                # self.train_iteration is the true train iteration
                self._sel_move(sess, iteration)
                self._train(sess, iteration, saver)

            print("Finished training Q-network.")

    def _sel_move(self, sess, iteration):

        if self.params.is_train:
            replay_mem_size = self.replay_mem.num_examples()
            if replay_mem_size < self.params.train_start and iteration % 1000 == 0:
                print("Initializing replay memory %i/%i" %
                      (iteration, self.params.train_start))

        # Epsilon-greedy exploration: with probability self.epsilon choose a
        # random action, otherwise act greedily by taking the action with the
        # maximal Q-value. During training, epsilon is annealed linearly down
        # to self.params.min_epsilon.
        if self.params.is_train:
            self.epsilon = max(
                self.params.min_epsilon,
                1.0 - float(self.train_iteration * self.params.train_freq) /
                float(self.params.epsilon_step))
        else:
            self.epsilon = self.params.eval_epsilon
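        # Illustrative example (parameter values assumed, not from this file):
        # with min_epsilon=0.1, train_freq=4 and epsilon_step=1e6, at
        # train_iteration=100000 epsilon = max(0.1, 1.0 - 400000 / 1e6) = 0.6.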

        ################################################################
        ####################### SELECT A MOVE ##########################
        ################################################################

        # Either choose a random action or predict the action using the Q-network
        do_random_action = (random.random() < self.epsilon)
        if do_random_action or (self.params.is_train
                                and replay_mem_size < self.params.train_start):
            action_id = random.randrange(self.game.num_actions)
            self.count_act_random += 1
        else:

            # Get the last screens from the self.history and perform
            # feed-forward through the network to compute Q-values
            feed_dict = {self.dqn_train.pl_screens: self.history.get()}

            qvalues = sess.run(self.dqn_train.qvalues, feed_dict=feed_dict)

            # Choose the best action based on the approximated Q-values
            qvalue_max = np.max(qvalues[0])
            action_id = np.argmax(qvalues[0])

            self.count_act_greedy += 1
            self.count_actions[action_id] += 1
            self.qvalues_hist.append(qvalue_max)

        self._move(action_id)

    def _move(self, action_id):

        ################################################################
        ####################### PLAY THE MOVE ##########################
        ################################################################

        # Play the selected action (either random or predicted) in the game environment.
        # Note that the action is performed for k = 4 frames (frame skipping)
        screen, cumulative_reward, is_done = self.game.act(action_id)

        # Reward clipping is no longer applied here; large TD errors are
        # handled by the Huber loss instead, so the raw reward is stored.
        #cumulative_reward = min(+1.0, max(-1.0, cumulative_reward))
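        # For reference, the Huber loss mentioned above is
        #   L(d) = 0.5 * d^2                    if |d| <= delta
        #   L(d) = delta * (|d| - 0.5 * delta)  otherwise,
        # which bounds the gradient of large TD errors much like reward
        # clipping used to.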

        # Add the screen to the short-term screen history
        self.history.add(screen)

        # Add experience to replay memory
        if self.params.is_train:
            self.replay_mem.add(action_id, cumulative_reward, screen, is_done)

        # Check if we are game over, and if yes, initialize a new game
        if is_done:
            screen, reward, is_done = self.game.new_game()
            if self.params.is_train:
                self.replay_mem.add(0, reward, screen, is_done)
                self.history.add(screen)

    def _train(self, sess, iteration, saver):

        ################################################################
        ###################### TRAINING MODEL ##########################
        ################################################################

        if self.params.is_train and iteration > self.params.train_start and iteration % self.params.train_freq == 0:

            screens, actions, rewards, screens_1, dones = self.replay_mem.sample_batch(
            )

            # Below, we perform the Double-DQN update.

            # First, we need to determine the best actions
            # in the train network
            qvalues_train = sess.run(
                self.dqn_train.qvalues,
                feed_dict={self.dqn_train.pl_screens: screens_1})

            # Find the best action for each next state using the train network;
            # these are used together with the Q-values from the target network.
            actions_target = np.argmax(qvalues_train, 1)

            # We use the target network to evaluate the chosen actions:
            # first get its Q-values for all actions given the next states,
            # then select from them the Q-values corresponding to the best
            # actions found with the train network.

            qvalues_target = sess.run(
                self.dqn_target.qvalues,
                feed_dict={self.dqn_target.pl_screens: screens_1})
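            # With these two pieces the graph can presumably form the
            # Double-DQN target
            #   y = r + gamma * (1 - done) * Q_target(s', argmax_a Q_train(s', a)),
            # i.e. action selection by the train network and action evaluation
            # by the target network.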

            # Inputs for trainable Q-network
            feed_dict = {
                self.dqn_train.pl_screens:
                screens,
                self.dqn_train.pl_actions:
                actions,
                self.dqn_train.pl_rewards:
                rewards,
                self.dqn_train.pl_dones:
                dones,
                #self.dqn_train.pl_qtargets  : np.max(qvalues_target, axis=1),
                self.dqn_train.pl_qtargets:
                qvalues_target,
                self.dqn_train.pl_actions_target:
                actions_target,
            }

            # Actual training operation
            _, loss, self.train_iteration = sess.run([
                self.dqn_train.train_op, self.dqn_train.loss,
                self.dqn_train.global_iteration
            ],
                                                     feed_dict=feed_dict)

            # Running average of the loss
            self.loss_hist.append(loss)

            # Check if the returned loss is not NaN
            if np.isnan(loss):
                print("[%s] Training failed with loss = NaN." %
                      datetime.now().strftime("%Y-%m-%d %H:%M"))

            # Every network_update_rate training steps, update the target Q-network used for predicting targets
            if self.train_iteration % self.params.network_update_rate == 0:
                print("[%s] Updating target network." %
                      datetime.now().strftime("%Y-%m-%d %H:%M"))
                update_target_network(sess, "qnetwork-train",
                                      "qnetwork-target")

            self._evaluate(sess, feed_dict)
            self._print_save(sess, feed_dict, saver)

    def _evaluate(self, sess, feed_dict):

        ################################################################
        ####################### MODEL EVALUATION #######################
        ################################################################

        if self.params.is_train and self.train_iteration % self.params.eval_frequency == 0 or self.train_iteration == 0:

            eval_total_reward = 0
            eval_num_episodes = 0
            eval_num_wins = 0
            eval_num_rewards = 0
            eval_episode_max_reward = 0
            eval_episode_reward = 0
            eval_actions = np.zeros(self.game.num_actions)

            # Store all of these game statistics temporarily so that this
            # evaluation run does not affect the training bookkeeping.

            tmp_episode_step = self.game._episode_step
            tmp_episode_number = self.game._episode_number
            tmp_episode_reward = self.game._episode_reward
            tmp_max_reward_episode = self.game._max_reward_episode
            tmp_global_step = self.game._global_step
            tmp_global_reward = self.game._global_reward
            tmp_recent_reward = self.game._recent_reward
            tmp_recent_episode_number = self.game._recent_episode_number
            tmp_recent_games_won = self.game._recent_games_won
            tmp_games_won = self.game._games_won
            tmp_reward_recent_update = self.game.reward_recent_update

            prev_action_id = -1
            prev_episode_num = -1  # Just has to be initially different from the episode counter
            action_id = -1
            eval_num_episodes = 0

            # Initialize new game without random start moves
            screen, reward, done = self.game.new_game()

            for _ in range(self.params.history_length):
                self.history.add(screen)

            #for eval_iterations in range(self.params.eval_iterations):
            while eval_num_episodes < self.params.eval_iterations:  # Play eval_iterations games
                prev_action_id = action_id

                # if random.random() < self.params.eval_epsilon:
                #     # Random action
                #     action_id = random.randrange(self.game.num_actions)
                #else:
                # Greedy action
                # Get the last screens from the self.history and perform
                # feed-forward through the network to compute Q-values
                feed_dict_eval = {
                    self.dqn_train.pl_screens: self.history.get()
                }
                qvalues = sess.run(self.dqn_train.qvalues,
                                   feed_dict=feed_dict_eval)

                # Choose the best action based on the approximated Q-values
                qvalue_max = np.max(qvalues[0])
                action_id = np.argmax(qvalues[0])

                # If the same action repeats within the same episode, fall back to a random action
                if prev_action_id == action_id and prev_episode_num == eval_num_episodes:
                    action_id = random.randrange(self.game.num_actions)

                prev_episode_num = eval_num_episodes

                # Keep track of how many of each action is performed
                eval_actions[action_id] += 1

                # Perform the action
                screen, reward, done = self.game.act(action_id)
                self.history.add(screen)

                eval_episode_reward += reward
                if reward > 0:
                    eval_num_rewards += 1

                if reward == self.game.env.rewards["win"]:
                    eval_num_wins += 1

                if done:
                    # Note: max reward is taken over the games played
                    eval_total_reward += eval_episode_reward
                    eval_episode_max_reward = max(eval_episode_reward,
                                                  eval_episode_max_reward)
                    eval_episode_reward = 0
                    eval_num_episodes += 1

                    screen, reward, done = self.game.new_game()
                    for _ in range(self.params.history_length):
                        self.history.add(screen)

            # Send statistics about the environment to TensorBoard
            eval_update_ops = [
                self.dqn_train.eval_rewards.assign(eval_total_reward),
                self.dqn_train.eval_win_rate.assign(
                    (eval_num_wins / eval_num_episodes) * 100),
                self.dqn_train.eval_num_rewards.assign(eval_num_rewards),
                self.dqn_train.eval_max_reward.assign(eval_episode_max_reward),
                self.dqn_train.eval_num_episodes.assign(eval_num_episodes),
                self.dqn_train.eval_actions.assign(eval_actions /
                                                   np.sum(eval_actions))
            ]
            sess.run(eval_update_ops)
            summaries = sess.run(self.dqn_train.eval_summary_op,
                                 feed_dict=feed_dict)
            self.dqn_train.train_summary_writer.add_summary(
                summaries, self.train_iteration)

            print("[%s] Evaluation Summary" %
                  datetime.now().strftime("%Y-%m-%d %H:%M"))
            print("  Total Reward: %i" % eval_total_reward)
            print("  Max Reward per Episode: %i" % eval_episode_max_reward)
            print("  Num Episodes: %i" % eval_num_episodes)
            print("  Num Rewards: %i" % eval_num_rewards)
            print("  Win Rate: %.1f" %
                  ((eval_num_wins / eval_num_episodes) * 100))

            self.win_rate = (eval_num_wins / eval_num_episodes) * 100

            self.game._episode_step = tmp_episode_step
            self.game._episode_number = tmp_episode_number
            self.game._episode_reward = tmp_episode_reward
            self.game._max_reward_episode = tmp_max_reward_episode
            self.game._global_step = tmp_global_step
            self.game._global_reward = tmp_global_reward
            self.game._recent_reward = tmp_recent_reward
            self.game._recent_episode_number = tmp_recent_episode_number
            self.game._recent_games_won = tmp_recent_games_won
            self.game._games_won = tmp_games_won
            self.game.reward_recent_update = tmp_reward_recent_update

    def _print_save(self, sess, feed_dict, saver):

        ################################################################
        ###################### PRINTING / SAVING #######################
        ################################################################

        # Write a training summary to disk
        # This is what controls how often we write to disk
        if self.params.is_train and self.train_iteration % self.params.interval_summary == 0:

            # Send statistics about the environment to TensorBoard
            update_game_stats_ops = [
                self.dqn_train.avg_reward_per_game.assign(
                    self.game.avg_reward_per_episode()),
                self.dqn_train.max_reward_per_game.assign(
                    self.game.max_reward_per_episode),
                self.dqn_train.avg_moves_per_game.assign(
                    self.game.avg_steps_per_episode()),
                self.dqn_train.total_reward_replay.assign(
                    self.replay_mem.total_reward()),
                self.dqn_train.num_games_played.assign(
                    self.game.episode_number),
                self.dqn_train.moves.assign(self.game.global_step),
                self.dqn_train.actions_random.assign(self.count_act_random),
                self.dqn_train.actions_greedy.assign(self.count_act_greedy),
            ]
            sess.run(update_game_stats_ops)

            # Build and save summaries
            summaries = sess.run(self.dqn_train.train_summary_op,
                                 feed_dict=feed_dict)

            # Here we set train_iteration on x-axis
            self.dqn_train.train_summary_writer.add_summary(
                summaries, self.train_iteration)

            # Here we set number of moves on x-axis
            #self.dqn_train.train_summary_writer.add_summary(summaries, self.game.global_step)

            avg_qvalue = avg_loss = 0
            for i in range(len(self.qvalues_hist)):
                avg_qvalue += self.qvalues_hist[i]
                avg_loss += self.loss_hist[i]

            avg_qvalue /= float(len(self.qvalues_hist))
            avg_loss /= float(len(self.loss_hist))

            learning_rate = sess.run(self.dqn_train.learning_rate)

            format_str = "[%s] It. %06i, Replay = %i, epsilon = %.4f, "\
                         "Episodes = %i, Steps = %i, Avg.R = %.3f, "\
                         "Max.R = %.3f, Win = %.1f, Avg.Q = %.4f, Avg.Loss = %.6f, lr = %.6f"
            print(format_str %
                  (datetime.now().strftime("%Y-%m-%d %H:%M"),
                   self.train_iteration, self.replay_mem.num_examples(),
                   self.epsilon, self.game.episode_number,
                   self.game.global_step, self.game.avg_reward_per_episode(),
                   self.game.max_reward_per_episode, self.win_rate, avg_qvalue,
                   avg_loss, learning_rate))

        # Write model checkpoint to disk
        if self.params.is_train and self.train_iteration % self.params.interval_checkpoint == 0:
            path = saver.save(sess,
                              self.checkpoint_prefix,
                              global_step=self.train_iteration)
            print("[%s] Saving TensorFlow model checkpoint to disk." %
                  datetime.now().strftime("%Y-%m-%d %H:%M"))

            sum_actions = float(reduce(lambda x, y: x + y, self.count_actions))
            action_str = ""
            for action_id, action_count in enumerate(self.count_actions):
                action_perc = action_count / sum_actions if not sum_actions == 0 else 0
                action_str += "<%i, %s, %i, %.2f> " % \
                              (action_id, self.game.action_to_string(action_id),
                               action_count, action_perc)

            format_str = "[%s] Q-Network Actions Summary: NumRandom: %i, NumGreedy: %i, %s"
            print(format_str %
                  (datetime.now().strftime("%Y-%m-%d %H:%M"),
                   self.count_act_random, self.count_act_greedy, action_str))

    def play_mine(self):

        # Initialize a new game and store the screens in the self.history
        screen, reward, is_done = self.game.new_game()
        for _ in range(self.params.history_length):
            self.history.add(screen)

        # Initialize the TensorFlow session
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=self.params.gpu_memory)

        with tf.Session(config=tf.ConfigProto(
                gpu_options=gpu_options)) as sess:

            # Initialize the TensorFlow variables
            init = tf.global_variables_initializer()
            sess.run(init)

            # Only save trainable variables and the global iteration to disk
            tf_vars_to_save = tf.trainable_variables() + [
                self.dqn_train.global_iteration
            ]
            saver = tf.train.Saver(tf_vars_to_save, max_to_keep=200)

            if self.params.model_file is not None:
                # Load pre-trained model from disk
                model_path = os.path.join(self.checkpoint_dir,
                                          self.params.model_file)
                saver.restore(sess, model_path)

            while self.game.episode_number < self.params.num_games:
                if self.params.show_game:
                    inp = input("Enter input (ROW,COL)")
                self._sel_move(sess, 0)

            print(self.game.episode_number)

            print(self.game.win_rate)

    def evaluate_mine(self):

        # Initialize a new game and store the screens in the self.history
        screen, reward, is_done = self.game.new_game()
        for _ in range(self.params.history_length):
            self.history.add(screen)

        # Initialize the TensorFlow session
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=self.params.gpu_memory)

        with tf.Session(config=tf.ConfigProto(
                gpu_options=gpu_options)) as sess:
            max_name = 800000
            min_name = 680000
            current_name = min_name
            best_model = min_name
            best_win_rate = 0
            current_win_rate = 0

            # Initialize the TensorFlow variables
            init = tf.global_variables_initializer()
            sess.run(init)
            # Only save trainable variables and the global iteration to disk
            tf_vars_to_save = tf.trainable_variables() + [
                self.dqn_train.global_iteration
            ]
            saver = tf.train.Saver(tf_vars_to_save, max_to_keep=200)

            while current_name <= max_name:

                print("Restoring: ", current_name)

                # if self.params.model_file is not None:
                #     # Load pre-trained model from disk
                #     model_path = os.path.join(self.checkpoint_dir, self.params.model_file)
                #     saver.restore(sess, model_path)
                model_path = os.path.join(self.checkpoint_dir,
                                          'model-' + str(current_name))
                saver.restore(sess, model_path)

                prev_action_id = -1
                prev_episode_num = -1  # Just has to be initially different from the episode counter
                action_id = -1
                eval_num_episodes = 0

                eval_total_reward = 0
                eval_num_episodes = 0
                eval_num_wins = 0
                eval_num_rewards = 0
                eval_episode_max_reward = 0
                eval_episode_reward = 0
                eval_actions = np.zeros(self.game.num_actions)

                # Initialize new game without random start moves
                screen, reward, done = self.game.new_game()

                for _ in range(self.params.history_length):
                    self.history.add(screen)

                #for eval_iterations in range(self.params.eval_iterations):
                while eval_num_episodes < self.params.eval_iterations:  # Play eval_iterations games
                    prev_action_id = action_id

                    feed_dict_eval = {
                        self.dqn_train.pl_screens: self.history.get()
                    }
                    qvalues = sess.run(self.dqn_train.qvalues,
                                       feed_dict=feed_dict_eval)

                    # Choose the best action based on the approximated Q-values
                    qvalue_max = np.max(qvalues[0])
                    action_id = np.argmax(qvalues[0])

                    # If the same action repeats within the same episode, fall back to a random action
                    if prev_action_id == action_id and prev_episode_num == eval_num_episodes:
                        action_id = random.randrange(self.game.num_actions)

                    prev_episode_num = eval_num_episodes

                    # Perform the action
                    screen, reward, done = self.game.act(action_id)
                    self.history.add(screen)

                    eval_episode_reward += reward
                    if reward > 0:
                        eval_num_rewards += 1

                    if reward == self.game.env.rewards["win"]:
                        eval_num_wins += 1

                    if done:
                        # Note: max reward is taken over the games played
                        eval_total_reward += eval_episode_reward
                        eval_episode_max_reward = max(eval_episode_reward,
                                                      eval_episode_max_reward)
                        eval_episode_reward = 0
                        eval_num_episodes += 1

                        screen, reward, done = self.game.new_game()
                        for _ in range(self.params.history_length):
                            self.history.add(screen)

                current_win_rate = (eval_num_wins / eval_num_episodes) * 100

                print("  Win Rate: %.2f" % (current_win_rate))

                if current_win_rate > best_win_rate:
                    best_win_rate = current_win_rate
                    best_model = current_name

                current_name = current_name + 20000

            print("Best model is: ", best_model)
Exemplo n.º 18
0
class DQN:
    def __init__(self,
                 config,
                 game,
                 directory,
                 callback=None,
                 summary_writer=None):

        self.game = game
        self.actions = game.get_available_actions()
        self.feedback_size = game.get_feedback_size()
        self.callback = callback
        self.summary_writer = summary_writer

        self.config = config
        self.batch_size = config['batch_size']
        self.n_episode = config['num_episode']
        self.capacity = config['capacity']
        self.epsilon_decay = config['epsilon_decay']
        self.epsilon_min = config['epsilon_min']
        self.num_frames = config['num_frames']
        self.num_nullops = config['num_nullops']
        self.time_between_two_copies = config['time_between_two_copies']
        self.input_scale = config['input_scale']
        self.update_interval = config['update_interval']
        self.directory = directory

        self._init_modules()

    def _init_modules(self):

        # Replay memory
        self.replay_memory = ReplayMemory(history_len=self.num_frames,
                                          capacity=self.capacity,
                                          batch_size=self.batch_size,
                                          input_scale=self.input_scale)

        input_shape = self.feedback_size + (self.num_frames, )
        # Q-network
        self.q_network = QNetwork(input_shape=input_shape,
                                  n_outputs=len(self.actions),
                                  network_type=self.config['network_type'],
                                  scope='q_network')
        # Target network
        self.target_network = QNetwork(
            input_shape=input_shape,
            n_outputs=len(self.actions),
            network_type=self.config['network_type'],
            scope='target_network')
        # Optimizer
        self.optimizer = Optimizer(config=self.config,
                                   feedback_size=self.feedback_size,
                                   q_network=self.q_network,
                                   target_network=self.target_network,
                                   replay_memory=self.replay_memory)
        # Ops for updating target network
        self.clone_op = self.target_network.get_clone_op(self.q_network)
        # For tensorboard
        self.t_score = tf.placeholder(dtype=tf.float32,
                                      shape=[],
                                      name='new_score')
        tf.summary.scalar("score", self.t_score, collections=['dqn'])
        self.summary_op = tf.summary.merge_all('dqn')

    def set_summary_writer(self, summary_writer=None):
        self.summary_writer = summary_writer
        self.optimizer.set_summary_writer(summary_writer)

    def choose_action(self, sess, state, epsilon_greedy):
        if numpy.random.binomial(1, epsilon_greedy) == 1:
            action = random.choice(self.actions)
        else:
            x = numpy.asarray(numpy.expand_dims(state, axis=0) /
                              self.input_scale,
                              dtype=numpy.float32)
            action = self.q_network.get_q_action(sess, x)[0]
        return action

    def play(self, action):
        r, new_state, termination = self.game.play_action(action)
        return r, new_state, termination

    def update_target_network(self, sess):
        sess.run(self.clone_op)

    def train(self, sess, saver=None):

        num_of_trials = -1
        for episode in range(self.n_episode):
            self.game.reset()
            frame = self.game.get_current_feedback()
            for _ in range(self.num_nullops):
                r, new_frame, termination = self.play(action=0)
                self.replay_memory.add(frame, 0, r, termination)
                frame = new_frame

            for _ in range(self.config['T']):
                num_of_trials += 1
                epsilon_greedy = self.epsilon_min + \
                    max(self.epsilon_decay - num_of_trials, 0) / \
                    self.epsilon_decay * (1 - self.epsilon_min)
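                # Linear annealing from 1.0 down to epsilon_min over
                # epsilon_decay trials. Illustrative values (assumed): with
                # epsilon_min=0.1 and epsilon_decay=1e6, at num_of_trials=5e5
                # epsilon_greedy = 0.1 + 0.5 * 0.9 = 0.55.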
                print("epi {}, frame {}k: reward {}, eps {}".format(
                    episode, int(num_of_trials / 1000),
                    self.game.get_total_reward(), epsilon_greedy))
                if num_of_trials % self.update_interval == 0:
                    self.optimizer.train_one_step(sess, num_of_trials,
                                                  self.batch_size)

                state = self.replay_memory.phi(frame)
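                # phi() above presumably stacks the most recent num_frames
                # frames into the network input (the standard DQN state
                # preprocessing); its exact behaviour is defined by this
                # project's ReplayMemory implementation.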
                action = self.choose_action(sess, state, epsilon_greedy)
                r, new_frame, termination = self.play(action)
                self.replay_memory.add(frame, action, r, termination)
                frame = new_frame

                if num_of_trials % self.time_between_two_copies == 0:
                    self.update_target_network(sess)
                    self.save(sess, saver)

                if self.callback:
                    self.callback()
                if termination:
                    score = self.game.get_total_reward()
                    summary_str = sess.run(self.summary_op,
                                           feed_dict={self.t_score: score})
                    self.summary_writer.add_summary(summary_str, num_of_trials)
                    self.summary_writer.flush()
                    break

    def evaluate(self, sess):

        for episode in range(self.n_episode):
            self.game.reset()
            frame = self.game.get_current_feedback()
            for _ in range(self.num_nullops):
                r, new_frame, termination = self.play(action=0)
                self.replay_memory.add(frame, 0, r, termination)
                frame = new_frame

            for _ in range(self.config['T']):
                print("episode {}, total reward {}".format(
                    episode, self.game.get_total_reward()))

                state = self.replay_memory.phi(frame)
                action = self.choose_action(sess, state, self.epsilon_min)
                r, new_frame, termination = self.play(action)
                self.replay_memory.add(frame, action, r, termination)
                frame = new_frame

                if self.callback:
                    self.callback()
                if termination:
                    break

    def save(self, sess, saver, model_name='model.ckpt'):
        if saver:
            try:
                checkpoint_path = os.path.join(self.directory, model_name)
                saver.save(sess, checkpoint_path)
            except Exception:
                pass

    def load(self, sess, saver, model_name='model.ckpt'):
        if saver:
            try:
                checkpoint_path = os.path.join(self.directory, model_name)
                saver.restore(sess, checkpoint_path)
            except Exception:
                pass
Exemplo n.º 19
0
class Agent(BaseModel):
    def __init__(self, config, environment, sess):
        super(Agent, self).__init__(config)
        self.sess = sess

        self.env = environment
        self.history = History(self.config)
        self.memory = ReplayMemory(self.config, self.checkpoint_dir)

        with tf.variable_scope('step'):
            self.step_op = tf.Variable(0, trainable=False, name='step')
            self.step_input = tf.placeholder('int32', None, name='step_input')
            self.step_assign_op = self.step_op.assign(self.step_input)

        self.build_dqn()

    def train(self):
        start_step = self.step_op.eval()
        start_time = time.time()

        num_game, self.update_count, ep_reward = 0, 0, 0.
        total_reward, self.total_loss, self.total_q = 0., 0., 0.
        ep_rewards, actions = [], []

        screen, reward, action, terminal = self.env.new_random_game()
        for _ in range(self.history_length):
            self.history.add(screen)

        for self.step in tqdm(range(start_step, self.max_step),
                              ncols=72,
                              initial=start_step):
            if self.step == self.learn_start:
                num_game, self.update_count, ep_reward = 0, 0, 0.
                total_reward, self.total_loss, self.total_q = 0., 0., 0.
                ep_rewards, actions = [], []

            # 1. predict
            action = self.predict(self.history.get())
            # 2. act
            screen, reward, terminal = self.env.act(action, is_training=True)
            # 3. observe
            self.observe(screen, reward, action, terminal)

            if terminal:
                screen, reward, action, terminal = self.env.new_random_game()

                num_game += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0.
            else:
                ep_reward += reward

            actions.append(action)
            total_reward += reward

            if self.step > self.learn_start:
                if self.step % self.test_step == 0:
                    avg_reward = total_reward / self.test_step
                    avg_loss = self.total_loss / self.update_count
                    avg_q = self.total_q / self.update_count

                    try:
                        max_ep_reward = np.max(ep_rewards)
                        min_ep_reward = np.min(ep_rewards)
                        avg_ep_reward = np.mean(ep_rewards)
                    except Exception:
                        max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0

                    print(
                        '\navg_r: %.4f, avg_l: %.6f, avg_q: %3.6f, avg_ep_r: %.4f, max_ep_r: %.4f, min_ep_r: %.4f, # game: %d'
                        % (avg_reward, avg_loss, avg_q, avg_ep_reward,
                           max_ep_reward, min_ep_reward, num_game))

                    self.step_assign_op.eval({self.step_input: self.step + 1})
                    self.save_model(self.step + 1)

                    if self.step > 180:
                        self.inject_summary(
                            {
                                'average.reward':
                                avg_reward,
                                'average.loss':
                                avg_loss,
                                'average.q':
                                avg_q,
                                'episode.max reward':
                                max_ep_reward,
                                'episode.min reward':
                                min_ep_reward,
                                'episode.avg reward':
                                avg_ep_reward,
                                'episode.num of game':
                                num_game,
                                'episode.rewards':
                                ep_rewards,
                                'episode.actions':
                                actions,
                                'training.learning_rate':
                                self.learning_rate_op.eval(
                                    {self.learning_rate_step: self.step}),
                            }, self.step)

                    num_game = 0
                    total_reward = 0.
                    self.total_loss = 0.
                    self.total_q = 0.
                    self.update_count = 0
                    ep_reward = 0.
                    ep_rewards = []
                    actions = []

    def predict(self, s_t, test_ep=None):
        ep = test_ep or (self.ep_end + max(
            0., (self.ep_start - self.ep_end) *
            (self.ep_end_t - max(0., self.step - self.learn_start)) /
            self.ep_end_t))
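        # Linear annealing of the exploration rate: ep stays at ep_start until
        # learn_start, then decays linearly to ep_end over ep_end_t steps.
        # Illustrative values (assumed): with ep_start=1.0, ep_end=0.1,
        # ep_end_t=1e6 and learn_start=5e4, at step = learn_start + 5e5 the
        # rate is 0.1 + 0.9 * (1e6 - 5e5) / 1e6 = 0.55.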

        if random.random() < ep:
            action = random.randrange(self.env.action_size)
        else:
            action = self.q_action.eval({self.s_t: [s_t]})[0]

        return action

    def observe(self, screen, reward, action, terminal):
        reward = max(self.min_reward, min(self.max_reward, reward))

        self.history.add(screen)
        self.memory.add(screen, reward, action, terminal)

        if self.step > self.learn_start:
            if self.step % self.train_frequency == 0:
                self.q_learning_mini_batch()

            if self.step % self.target_q_update_step == self.target_q_update_step - 1:
                self.update_target_q_network()

    def q_learning_mini_batch(self):
        if self.memory.count < self.history_length:
            return
        else:
            s_t, action, reward, s_t_plus_1, terminal = self.memory.sample()

        t = time.time()
        if self.double_q:
            # Double Q-learning
            pred_action = self.q_action.eval({self.s_t: s_t_plus_1})

            q_t_plus_1_with_pred_action = self.target_q_with_idx.eval({
                self.target_s_t:
                s_t_plus_1,
                self.target_q_idx:
                [[idx, pred_a] for idx, pred_a in enumerate(pred_action)]
            })
            target_q_t = (1. - terminal) * self.discount * \
                q_t_plus_1_with_pred_action + reward
        else:
            q_t_plus_1 = self.target_q.eval({self.target_s_t: s_t_plus_1})

            terminal = np.array(terminal) + 0.
            max_q_t_plus_1 = np.max(q_t_plus_1, axis=1)
            target_q_t = (1. - terminal) * self.discount * \
                max_q_t_plus_1 + reward

        _, q_t, loss, summary_str = self.sess.run(
            [self.optim, self.q, self.loss, self.q_summary], {
                self.target_q_t: target_q_t,
                self.action: action,
                self.s_t: s_t,
                self.learning_rate_step: self.step,
            })

        self.writer.add_summary(summary_str, self.step)
        self.total_loss += loss
        self.total_q += q_t.mean()
        self.update_count += 1

    def build_dqn(self):
        self.w = {}
        self.t_w = {}

        # initializer = tf.contrib.layers.xavier_initializer()
        initializer = tf.truncated_normal_initializer(0, 0.02)
        activation_fn = tf.nn.relu

        # training network
        with tf.variable_scope('prediction'):
            if self.cnn_format == 'NHWC':
                self.s_t = tf.placeholder('float32', [
                    None, self.screen_height, self.screen_width,
                    self.history_length
                ],
                                          name='s_t')
            else:
                self.s_t = tf.placeholder('float32', [
                    None, self.history_length, self.screen_height,
                    self.screen_width
                ],
                                          name='s_t')

            self.l1, self.w['l1_w'], self.w['l1_b'] = conv2d(self.s_t,
                                                             32, [8, 8],
                                                             [4, 4],
                                                             initializer,
                                                             activation_fn,
                                                             self.cnn_format,
                                                             name='l1')
            self.l2, self.w['l2_w'], self.w['l2_b'] = conv2d(self.l1,
                                                             64, [4, 4],
                                                             [2, 2],
                                                             initializer,
                                                             activation_fn,
                                                             self.cnn_format,
                                                             name='l2')
            self.l3, self.w['l3_w'], self.w['l3_b'] = conv2d(self.l2,
                                                             64, [3, 3],
                                                             [1, 1],
                                                             initializer,
                                                             activation_fn,
                                                             self.cnn_format,
                                                             name='l3')

            shape = self.l3.get_shape().as_list()
            self.l3_flat = tf.reshape(
                self.l3, [-1, reduce(lambda x, y: x * y, shape[1:])])

            if self.dueling:
                self.value_hid, self.w['l4_val_w'], self.w['l4_val_b'] = \
                    linear(self.l3_flat, 512,
                           activation_fn=activation_fn, name='value_hid')

                self.adv_hid, self.w['l4_adv_w'], self.w['l4_adv_b'] = \
                    linear(self.l3_flat, 512,
                           activation_fn=activation_fn, name='adv_hid')

                self.value, self.w['val_w_out'], self.w['val_w_b'] = \
                    linear(self.value_hid, 1, name='value_out')

                self.advantage, self.w['adv_w_out'], self.w['adv_w_b'] = \
                    linear(self.adv_hid, self.env.action_size, name='adv_out')

                # Average Dueling
                self.q = self.value + (self.advantage - tf.reduce_mean(
                    self.advantage, reduction_indices=1, keep_dims=True))
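                # i.e. Q(s, a) = V(s) + (A(s, a) - mean_a' A(s, a')), the
                # mean-subtracted aggregation from the dueling-network
                # architecture, which keeps the value/advantage split
                # identifiable.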
            else:
                self.l4, self.w['l4_w'], self.w['l4_b'] = linear(
                    self.l3_flat, 512, activation_fn=activation_fn, name='l4')
                self.q, self.w['q_w'], self.w['q_b'] = linear(
                    self.l4, self.env.action_size, name='q')

            self.q_action = tf.argmax(self.q, axis=1)

            q_summary = []
            avg_q = tf.reduce_mean(self.q, 0)
            for idx in range(self.env.action_size):
                q_summary.append(tf.summary.histogram('q/%s' % idx,
                                                      avg_q[idx]))
            self.q_summary = tf.summary.merge(q_summary, name='q_summary')

        # target network
        with tf.variable_scope('target'):
            if self.cnn_format == 'NHWC':
                self.target_s_t = tf.placeholder('float32', [
                    None, self.screen_height, self.screen_width,
                    self.history_length
                ],
                                                 name='target_s_t')
            else:
                self.target_s_t = tf.placeholder('float32', [
                    None, self.history_length, self.screen_height,
                    self.screen_width
                ],
                                                 name='target_s_t')

            self.target_l1, self.t_w['l1_w'], self.t_w['l1_b'] = conv2d(
                self.target_s_t,
                32, [8, 8], [4, 4],
                initializer,
                activation_fn,
                self.cnn_format,
                name='target_l1')
            self.target_l2, self.t_w['l2_w'], self.t_w['l2_b'] = conv2d(
                self.target_l1,
                64, [4, 4], [2, 2],
                initializer,
                activation_fn,
                self.cnn_format,
                name='target_l2')
            self.target_l3, self.t_w['l3_w'], self.t_w['l3_b'] = conv2d(
                self.target_l2,
                64, [3, 3], [1, 1],
                initializer,
                activation_fn,
                self.cnn_format,
                name='target_l3')

            shape = self.target_l3.get_shape().as_list()
            self.target_l3_flat = tf.reshape(
                self.target_l3, [-1, reduce(lambda x, y: x * y, shape[1:])])

            if self.dueling:
                self.t_value_hid, self.t_w['l4_val_w'], self.t_w['l4_val_b'] = \
                    linear(self.target_l3_flat, 512,
                           activation_fn=activation_fn, name='target_value_hid')

                self.t_adv_hid, self.t_w['l4_adv_w'], self.t_w['l4_adv_b'] = \
                    linear(self.target_l3_flat, 512,
                           activation_fn=activation_fn, name='target_adv_hid')

                self.t_value, self.t_w['val_w_out'], self.t_w['val_w_b'] = \
                    linear(self.t_value_hid, 1, name='target_value_out')

                self.t_advantage, self.t_w['adv_w_out'], self.t_w['adv_w_b'] = \
                    linear(self.t_adv_hid, self.env.action_size,
                           name='target_adv_out')

                # Average Dueling
                self.target_q = self.t_value + (
                    self.t_advantage - tf.reduce_mean(
                        self.t_advantage, reduction_indices=1, keep_dims=True))
            else:
                self.target_l4, self.t_w['l4_w'], self.t_w['l4_b'] = \
                    linear(self.target_l3_flat, 512,
                           activation_fn=activation_fn, name='target_l4')
                self.target_q, self.t_w['q_w'], self.t_w['q_b'] = \
                    linear(self.target_l4, self.env.action_size, name='target_q')

            self.target_q_idx = tf.placeholder('int32', [None, None],
                                               'outputs_idx')
            self.target_q_with_idx = tf.gather_nd(self.target_q,
                                                  self.target_q_idx)

        with tf.variable_scope('pred_to_target'):
            self.t_w_input = {}
            self.t_w_assign_op = {}

            for name in self.w.keys():
                self.t_w_input[name] = tf.placeholder(
                    'float32', self.t_w[name].get_shape().as_list(), name=name)
                self.t_w_assign_op[name] = self.t_w[name].assign(
                    self.t_w_input[name])

        # optimizer
        with tf.variable_scope('optimizer'):
            self.target_q_t = tf.placeholder('float32', [None],
                                             name='target_q_t')
            self.action = tf.placeholder('int64', [None], name='action')

            action_one_hot = tf.one_hot(self.action,
                                        self.env.action_size,
                                        1.0,
                                        0.0,
                                        name='action_one_hot')
            q_acted = tf.reduce_sum(self.q * action_one_hot,
                                    reduction_indices=1,
                                    name='q_acted')

            self.delta = self.target_q_t - q_acted
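            # clip the TD error to [min_delta, max_delta] before squaring
            # (per-sample error clipping, as in Mnih et al., 2015)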
            self.clipped_delta = tf.clip_by_value(self.delta,
                                                  self.min_delta,
                                                  self.max_delta,
                                                  name='clipped_delta')

            self.global_step = tf.Variable(0, trainable=False)

            self.loss = tf.reduce_mean(tf.square(self.clipped_delta),
                                       name='loss')
            self.learning_rate_step = tf.placeholder('int64',
                                                     None,
                                                     name='learning_rate_step')
            self.learning_rate_op = tf.maximum(
                self.learning_rate_minimum,
                tf.train.exponential_decay(self.learning_rate,
                                           self.learning_rate_step,
                                           self.learning_rate_decay_step,
                                           self.learning_rate_decay,
                                           staircase=True))
            self.optim = tf.train.RMSPropOptimizer(self.learning_rate_op,
                                                   momentum=0.95,
                                                   epsilon=0.01).minimize(
                                                       self.loss)

        with tf.variable_scope('summary'):
            scalar_summary_tags = [
                'average.reward', 'average.loss', 'average.q',
                'episode.max reward', 'episode.min reward',
                'episode.avg reward', 'episode.num of game',
                'training.learning_rate'
            ]

            self.summary_placeholders = {}
            self.summary_ops = {}

            for tag in scalar_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder(
                    'float32', None, name=tag.replace(' ', '_'))
                self.summary_ops[tag] = tf.scalar_summary(
                    "%s/%s" % (self.env_name, tag),
                    self.summary_placeholders[tag])

            histogram_summary_tags = ['episode.rewards', 'episode.actions']

            for tag in histogram_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder(
                    'float32', None, name=tag.replace(' ', '_'))
                self.summary_ops[tag] = tf.histogram_summary(
                    tag, self.summary_placeholders[tag])

            self.writer = tf.train.SummaryWriter('./logs/%s' % self.model_dir,
                                                 self.sess.graph)

        tf.initialize_all_variables().run()

        self._saver = tf.train.Saver(list(self.w.values()) + [self.step_op],
                                     max_to_keep=30)

        self.load_model()
        self.update_target_q_network()

    def update_target_q_network(self):
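        # copy every online-network weight into its target-network counterpart
        # through the placeholder/assign ops built in the 'pred_to_target' scope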
        for name in self.w.keys():
            self.t_w_assign_op[name].eval(
                {self.t_w_input[name]: self.w[name].eval()})

    def inject_summary(self, tag_dict, step):
        summary_str_lists = self.sess.run(
            [self.summary_ops[tag] for tag in tag_dict.keys()], {
                self.summary_placeholders[tag]: value
                for tag, value in tag_dict.items()
            })
        for summary_str in summary_str_lists:
            self.writer.add_summary(summary_str, self.step)

    def play(self, n_step=10000, n_episode=100, test_ep=None, render=False):
        if test_ep is None:
            test_ep = self.ep_end

        test_history = History(self.config)

        if not self.display:
            gym_dir = '/tmp/%s-%s' % (self.env_name, get_time())
            self.env.env.monitor.start(gym_dir)

        best_reward, best_idx = 0, 0
        for idx in range(n_episode):
            screen, reward, action, terminal = self.env.new_random_game()
            current_reward = 0

            for _ in range(self.history_length):
                test_history.add(screen)

            for t in tqdm(range(n_step), ncols=72):
                # 1. predict
                action = self.predict(test_history.get(), test_ep)
                # 2. act
                screen, reward, terminal = self.env.act(action,
                                                        is_training=False)
                # 3. observe
                test_history.add(screen)

                current_reward += reward
                if terminal:
                    break

            if current_reward > best_reward:
                best_reward = current_reward
                best_idx = idx

            print("=" * 30)
            print(" [%d] Best reward : %d" % (best_idx, best_reward))
            print("=" * 30)

        if not self.display:
            self.env.env.monitor.close()

    def save_model(self, step=None):
        super(Agent, self).save_model(step)
        self.memory.save()

    def load_model(self):
        if super(Agent, self).load_model():
            # Only try to load the replay memory if we successfully loaded
            # a checkpoint file.
            self.memory.load()
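
The prediction and target towers above share the same "average dueling" aggregation: the scalar state value V(s) and the per-action advantages A(s, a) are combined as Q = V + (A - mean(A)), which keeps the value/advantage decomposition identifiable. Below is a minimal NumPy sketch of that combination; the function name and toy values are illustrative, not part of the snippet above.

import numpy as np

def average_dueling_q(value, advantage):
    # illustrative standalone helper, not part of the Agent class above
    # value: (batch, 1), advantage: (batch, n_actions)
    # Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)), mirroring the
    # tf.reduce_mean(..., keep_dims=True) term in the graph above
    return value + (advantage - advantage.mean(axis=1, keepdims=True))

# shifting all advantages by a constant leaves Q's argmax unchanged
v = np.array([[0.7]])
a = np.array([[0.5, 2.0, -1.0]])
assert np.argmax(average_dueling_q(v, a)) == np.argmax(average_dueling_q(v, a + 3.0))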
Exemplo n.º 20
0
class neonDQN(object):
    def __init__(self, input_shape, action_space):
        self._debug = 0
        self.mode = 'train'
        self.input_shape = input_shape
        self.action_space = action_space
        self.prev_action = action_space.sample()
        self.action_space_size = action_space.n
        self.steps = 0
        self.prelearning_steps = 50000  #50000
        self.total_steps = 10000  #1000000
        self.history_length = input_shape[0]
        self.history_step = 0
        self.observation_buffer = np.zeros(input_shape)
        # self.prev_state = np.zeros(input_shape[1:])
        # learning related
        self.learning_rate = 0.00025
        self.rmsprop_gamma2 = 1
        # experience replay related
        self.memoryIdx = 0
        self.memoryFillCount = 0
        self.memoryLimit = 50000  #1000000
        self.sampleSize = 32

        self.states = np.zeros((self.memoryLimit, ) + self.input_shape[1:],
                               dtype='uint8')
        self.actions = np.zeros((self.memoryLimit, ), dtype='uint8')
        self.rewards = np.zeros((self.memoryLimit, ))
        self.nextStates = np.zeros_like(self.states, dtype='uint8')
        self.dones = np.zeros_like(self.actions, dtype='bool')
        # target network update related
        self.targetNetC = 4  #10000
        # Q learning related
        self.gamma = 0.99

        #build Q-learning networks
        print "building network......"
        self.args = self.generate_parameter()
        self.net = self.build_network(self.args)
        self.mem = ReplayMemory(self.memoryLimit, self.args)

        np.set_printoptions(precision=4, suppress=True)

    def act(self, observation):
        observation = self.preprocess_state(observation)
        self.observation_buffer[:-1, ...] = self.observation_buffer[1:, ...]
        self.observation_buffer[-1, ...] = observation

        if self.mode == 'train':
            epsilon = max(
                0.1, 1 -
                max(self.steps - self.prelearning_steps, 0) / self.total_steps)
        elif self.mode == 'test':
            epsilon = .05
        else:
            assert False

        action = self.choose_action(self.observation_buffer, epsilon)
        return action

    def observe(self, state, action, reward, nextState, done):
        if self.mode == 'test':
            return

        state = self.preprocess_state(state)
        # self.prev_state = state
        nextState = self.preprocess_state(nextState)
        # self.prev_state = nextState

        self.steps += 1
        # ==========================================================
        # plt.figure(2)
        # plt.subplot(3, 1, 1)
        # plt.imshow(state)
        # plt.title("action: " + str(action) + "reward: " + str(reward)
        #           + "done: " + str(done))
        # plt.colorbar()
        # plt.subplot(3, 1, 2)
        # plt.imshow(nextState)
        # plt.subplot(3, 1, 3)
        # plt.imshow(nextState.astype('int16') - state)
        # plt.colorbar()
        # plt.show()
        # ==========================================================
        self.putInMemory(state, action, reward, nextState, done)
        # ==========================================================
        self.mem.add(action, reward, nextState, done)
        # ==========================================================

        if self.steps - self.prelearning_steps > 0:  # learning starts

            # state, action, reward, nextState, done = self.sampleFromMemory()
            # ==========================================================
            state, action, reward, nextState, done = self.mem.getMinibatch()
            # ==========================================================
            self.train(state, action, reward, nextState, done)

    def preprocess_state(self, state):
        # state_resize = imresize(state, (84, 84, 3))
        # state_resize_gray = np.mean(state_resize, axis=2)
        # max_state = np.maximum(prev_state, state_resize_gray)
        # return max_state.astype('uint8')
        state = cv2.resize(cv2.cvtColor(state, cv2.COLOR_RGB2GRAY),
                           self.input_shape[1:])
        return state

    def putInMemory(self, state, action, reward, nextState, done):
        memoryIdx = self.memoryIdx
        self.states[memoryIdx, ...] = state
        self.actions[memoryIdx, ...] = action
        self.rewards[memoryIdx, ...] = reward
        self.nextStates[memoryIdx, ...] = nextState
        self.dones[memoryIdx, ...] = done

        self.memoryIdx += 1
        self.memoryFillCount = max(self.memoryFillCount, self.memoryIdx)
        assert self.memoryFillCount <= self.memoryLimit
        self.memoryIdx = self.memoryIdx % self.memoryLimit

    def sampleFromMemory(self):
        # sampleIdx = np.random.permutation(self.memoryLimit)
        # sampleIdx = sampleIdx[:self.sampleSize]
        #
        # state = np.zeros((self.sampleSize,) + self.states.shape[1:])
        # action = np.zeros((self.sampleSize,) + self.actions.shape[1:], dtype='int')
        # reward = np.zeros((self.sampleSize,) + self.rewards.shape[1:])
        # nextState = np.zeros((self.sampleSize,) + self.nextStates.shape[1:])
        # done = np.zeros((self.sampleSize,) + self.dones.shape[1:], dtype='int')
        #
        # for i in xrange(self.sampleSize):
        #     state[i] = self.states[sampleIdx[i]]
        #     action[i] = self.actions[sampleIdx[i]]
        #     reward[i] = self.rewards[sampleIdx[i]]
        #     nextState[i] = self.nextStates[sampleIdx[i]]
        #     done[i] = self.dones[sampleIdx[i]]
        #
        # return state, action, reward, nextState, done
        #==================================================================================================
        state = np.zeros(
            (self.sampleSize, self.history_length) + self.states.shape[1:],
            dtype='uint8')
        nextState = np.zeros(
            (self.sampleSize, self.history_length) + self.nextStates.shape[1:],
            dtype='uint8')
        indexes = []
        while len(indexes) < self.sampleSize:
            # find random index
            while True:
                # sample one index (ignore states wrapping over the buffer boundary)
                index = random.randint(self.history_length - 1,
                                       self.memoryFillCount - 1)
                # if wraps over current pointer, then get new one
                if index >= self.memoryIdx and index - (self.history_length -
                                                        1) < self.memoryIdx:
                    continue
                # if wraps over episode end, then get new one
                # NB! poststate (last screen) can be terminal state!
                if self.dones[(index - self.history_length + 1):index].any():
                    continue
                # if (self.rewards[(index - self.history_length + 1):index] != 0).any():
                #     continue
                # otherwise use this index
                break

            # NB! having index first is fastest in C-order matrices
            assert index >= self.history_length - 1
            assert index <= self.memoryLimit - 1
            state[len(indexes),
                  ...] = self.states[(index -
                                      (self.history_length - 1)):(index + 1),
                                     ...]
            nextState[len(indexes), ...] = self.nextStates[(
                index - (self.history_length - 1)):(index + 1), ...]
            indexes.append(index)

        # copy actions, rewards and terminals with direct slicing
        action = self.actions[indexes]
        reward = self.rewards[indexes]
        done = self.dones[indexes]
        return state, action, reward, nextState, done

    def build_network(self, args):
        net = DeepQNetwork(self.action_space_size, args)
        return net

    def choose_action(self, state, epsilon):
        if np.random.rand() < epsilon:
            return self.action_space.sample()
        else:
            return self.greedy(state)

    def greedy(self, state):
        # predict the Q values at current state
        state = state[np.newaxis, :]
        #replicate by batch_size
        state = np.tile(state, (self.sampleSize, 1, 1, 1))

        # ======================================================
        q = self.net.predict(state)
        #======================================================
        # q = self._network_forward(self.network, state)
        # ======================================================

        q = q[0, :]
        # return the index of maximum Q value
        return np.argmax(q)

    def _network_forward(self, net, state):
        assert state.shape[0] == self.sampleSize
        assert state.shape[1] == self.input_shape[0]

        # scale raw uint8 pixels to [0, 1] before the forward pass
        state = state / 255.0
        arg_arrays = net.arg_dict
        train_iter = mx.io.NDArrayIter(data=state, batch_size=state.shape[0])
        data = arg_arrays[train_iter.provide_data[0][0]]

        q = []
        for batch in train_iter:
            # Copy data to executor input. Note the [:].
            data[:] = batch.data[0]

            self.network.forward(is_train=False)

            q = self.network.outputs[0]

        return q.asnumpy()

    def train(self, state, action, reward, nextState, done):
        epoch = 0
        minibatch = state, action, reward, nextState, done
        self.net.train(minibatch, epoch)
        # reward = np.clip(reward, -1, 1)
        #
        #
        # future_Qvalue = self._network_forward(self.targetNetwork, nextState)
        # future_reward = np.max(future_Qvalue, axis=1)
        # future_reward = future_reward[:, np.newaxis]
        #
        # nonzero_reward_list = np.nonzero(reward)
        # # reward += (1-done)*self.gamma*future_reward
        # reward += (1-abs(reward))*self.gamma*future_reward
        #
        # target_reward = self._network_forward(self.network, state)
        # old_target_reward = copy.deepcopy(target_reward)
        # for i in xrange(self.sampleSize):
        #     # target_reward[i][action[i]] = reward[i]
        #     # clip error to [-1, 1], Mnih 2015 Nature
        #     target_reward[i][action[i]] = max(min(reward[i], target_reward[i][action[i]]+1), target_reward[i][action[i]]-1)
        #
        # #=======================================================================
        # if self._debug:
        #     print "reward:", reward.transpose()
        #     print "future_reward:", future_reward.transpose()
        #     print "action:", action.transpose()
        #     print "done: ", done.transpose()
        #     figure_id = 0
        #     for batch_i in nonzero_reward_list[0]:
        #         if 1: #reward[batch_i, ...] != 0:
        #             figure_id += 1
        #             plt.figure(figure_id)
        #             for plot_i in range(0, self.history_length):
        #                 plt.subplot(3, self.history_length, plot_i + 1)
        #                 plt.imshow(state[batch_i, plot_i, ...])
        #                 plt.title("action: " + str(action[batch_i, ...]) + "reward: " + str(reward[batch_i, ...])
        #                           + "done: " + str(done[batch_i, ...]))
        #                 plt.colorbar()
        #
        #                 plt.subplot(3, self.history_length, plot_i + 1 + self.history_length)
        #                 plt.imshow(nextState[batch_i, plot_i, ...])
        #
        #                 plt.subplot(3, self.history_length, plot_i + 1 + self.history_length * 2)
        #                 plt.imshow(nextState[batch_i, plot_i, ...].astype('int16') - state[batch_i, plot_i, ...])
        #                 if plot_i == 0:
        #                     plt.title("reward: " + str(reward[batch_i, ...])
        #                           + " target reward: " + str(target_reward[batch_i, ...])
        #                           + " old reward: " + str(old_target_reward[batch_i, ...]))
        #                 plt.colorbar()
        #
        #     plt.show()
        #     # raw_input()
        # #=======================================================================
        #
        # train_data = state / 255.0
        # train_label = target_reward
        #
        #
        # # First we get handle to input arrays
        # arg_arrays = self.network.arg_dict
        # batch_size = self.sampleSize
        # train_iter = mx.io.NDArrayIter(data=train_data, label=train_label, batch_size=batch_size, shuffle=False)
        # # val_iter = mx.io.NDArrayIter(data=val_data, label=val_label, batch_size=batch_size)
        # data = arg_arrays[train_iter.provide_data[0][0]]
        # label = arg_arrays[train_iter.provide_label[0][0]]
        #
        # # opt = mx.optimizer.RMSProp(
        # #     learning_rate= self.learning_rate,
        # #     gamma2 = self.rmsprop_gamma2)
        #
        # opt = mx.optimizer.Adam(
        #     learning_rate=self.learning_rate)
        #
        # updater = mx.optimizer.get_updater(opt)
        #
        # # Finally we need a metric to print out training progress
        # metric = mx.metric.MSE()
        #
        # # Training loop begines
        # train_iter.reset()
        # metric.reset()
        #
        # for batch in train_iter:
        #     # Copy data to executor input. Note the [:].
        #     data[:] = batch.data[0]
        #     label[:] = batch.label[0]
        #
        #     # Forward
        #     self.network.forward(is_train=True)
        #
        #     # You perform operations on exe.outputs here if you need to.
        #     # For example, you can stack a CRF on top of a neural network.
        #
        #     # Backward
        #     self.network.backward()
        #
        #     # Update
        #     for i, pair in enumerate(zip(self.network.arg_arrays, self.network.grad_arrays)):
        #         weight, grad = pair
        #         updater(i, grad, weight)
        #     metric.update(batch.label, self.network.outputs)
        #
        #     if self.steps % 1000 == 0:
        #         print 'steps:', self.steps, 'metric:', metric.get()
        #         print 'network.outputs:', self.network.outputs[0].asnumpy()
        #         print 'label:', batch.label[0].asnumpy()
        #         # np.set_printoptions(precision=4)
        #         print 'delta: ', (batch.label[0].asnumpy() - self.network.outputs[0].asnumpy())
        # # t = 0
        # # metric.reset()
        # # for batch in val_iter:
        # #     # Copy data to executor input. Note the [:].
        # #     data[:] = batch.data[0]
        # #     label[:] = batch.label[0]
        # #
        # #     # Forward
        # #     self.network.forward(is_train=False)
        # #     metric.update(batch.label, self.network.outputs)
        # #     t += 1
        # #     if t % 50 == 0:
        # #         print 'epoch:', epoch, 'test iter:', t, 'metric:', metric.get()
        #
        # #========================================================================
        # #sync target-network with network as mentioned in Mnih et al. Nature 2015
        if self.steps % self.targetNetC == 0:
            self.net.update_target_network()
        #     self.targetNetwork.copy_params_from(self.network.arg_dict, self.network.aux_dict)

    # Basic Conv + BN + ReLU factory
    def ConvFactory(self,
                    data,
                    num_filter,
                    kernel,
                    stride=(1, 1),
                    pad=(0, 0),
                    act_type="relu"):
        # there is an optional parameter ``workspace`` that may influence convolution performance
        # by default, the workspace is set to 256 (MB)
        # you may set a larger value, but the convolution layer only uses as much as it needs
        # MXNet handles reuse of the workspace without parallelism conflicts
        conv = mx.symbol.Convolution(data=data,
                                     workspace=256,
                                     num_filter=num_filter,
                                     kernel=kernel,
                                     stride=stride,
                                     pad=pad)
        # bn = mx.symbol.BatchNorm(data=conv)
        act = mx.symbol.Activation(data=conv, act_type=act_type)
        return act

    def generate_parameter(self):
        def str2bool(v):
            return v.lower() in ("yes", "true", "t", "1")

        parser = argparse.ArgumentParser()

        envarg = parser.add_argument_group('Environment')
        envarg.add_argument(
            "--game",
            default="Catcher-v0",
            help=
            "ROM bin file or env id such as Breakout-v0 if training with Open AI Gym."
        )
        envarg.add_argument(
            "--environment",
            choices=["ale", "gym"],
            default="ale",
            help="Whether to train agent using ALE or OpenAI Gym.")
        envarg.add_argument(
            "--display_screen",
            type=str2bool,
            default=False,
            help="Display game screen during training and testing.")
        # envarg.add_argument("--sound", type=str2bool, default=False, help="Play (or record) sound.")
        envarg.add_argument(
            "--frame_skip",
            type=int,
            default=4,
            help="How many times to repeat each chosen action.")
        envarg.add_argument(
            "--repeat_action_probability",
            type=float,
            default=0,
            help=
            "Probability, that chosen action will be repeated. Otherwise random action is chosen during repeating."
        )
        envarg.add_argument("--minimal_action_set",
                            dest="minimal_action_set",
                            type=str2bool,
                            default=True,
                            help="Use minimal action set.")
        envarg.add_argument(
            "--color_averaging",
            type=str2bool,
            default=True,
            help="Perform color averaging with previous frame.")
        envarg.add_argument("--screen_width",
                            type=int,
                            default=64,
                            help="Screen width after resize.")
        envarg.add_argument("--screen_height",
                            type=int,
                            default=64,
                            help="Screen height after resize.")
        envarg.add_argument(
            "--record_screen_path",
            default="./",
            help=
            "Record game screens under this path. Subfolder for each game is created."
        )
        envarg.add_argument("--record_sound_filename",
                            default="./",
                            help="Record game sound in this file.")

        memarg = parser.add_argument_group('Replay memory')
        memarg.add_argument("--replay_size",
                            type=int,
                            default=50000,
                            help="Maximum size of replay memory.")
        memarg.add_argument("--history_length",
                            type=int,
                            default=4,
                            help="How many screen frames form a state.")

        netarg = parser.add_argument_group('Deep Q-learning network')
        netarg.add_argument("--learning_rate",
                            type=float,
                            default=0.00025,
                            help="Learning rate.")
        netarg.add_argument("--discount_rate",
                            type=float,
                            default=0.99,
                            help="Discount rate for future rewards.")
        netarg.add_argument("--batch_size",
                            type=int,
                            default=32,
                            help="Batch size for neural network.")
        netarg.add_argument('--optimizer',
                            choices=['rmsprop', 'adam', 'adadelta'],
                            default='rmsprop',
                            help='Network optimization algorithm.')
        netarg.add_argument(
            "--decay_rate",
            type=float,
            default=0.95,
            help="Decay rate for RMSProp and Adadelta algorithms.")
        netarg.add_argument(
            "--clip_error",
            type=float,
            default=1,
            help=
            "Clip error term in update between this number and its negative.")
        netarg.add_argument("--min_reward",
                            type=float,
                            default=-1,
                            help="Minimum reward.")
        netarg.add_argument("--max_reward",
                            type=float,
                            default=1,
                            help="Maximum reward.")
        netarg.add_argument("--batch_norm",
                            type=str2bool,
                            default=False,
                            help="Use batch normalization in all layers.")

        # netarg.add_argument("--rescale_r", type=str2bool, help="Rescale rewards.")
        # missing: bufferSize=512,valid_size=500,min_reward=-1,max_reward=1

        neonarg = parser.add_argument_group('Neon')
        neonarg.add_argument('--backend',
                             choices=['cpu', 'gpu'],
                             default='gpu',
                             help='backend type')
        neonarg.add_argument('--device_id',
                             type=int,
                             default=0,
                             help='gpu device id (only used with GPU backend)')
        neonarg.add_argument(
            '--datatype',
            choices=['float16', 'float32', 'float64'],
            default='float32',
            help=
            'default floating point precision for backend [f64 for cpu only]')
        neonarg.add_argument(
            '--stochastic_round',
            const=True,
            type=int,
            nargs='?',
            default=False,
            help=
            'use stochastic rounding [will round to BITS number of bits if specified]'
        )

        antarg = parser.add_argument_group('Agent')
        antarg.add_argument("--exploration_rate_start",
                            type=float,
                            default=1,
                            help="Exploration rate at the beginning of decay.")
        antarg.add_argument("--exploration_rate_end",
                            type=float,
                            default=0.1,
                            help="Exploration rate at the end of decay.")
        antarg.add_argument(
            "--exploration_decay_steps",
            type=float,
            default=10000,
            help="How many steps to decay the exploration rate.")
        antarg.add_argument("--exploration_rate_test",
                            type=float,
                            default=0.05,
                            help="Exploration rate used during testing.")
        antarg.add_argument(
            "--train_frequency",
            type=int,
            default=4,
            help="Perform training after this many game steps.")
        antarg.add_argument(
            "--train_repeat",
            type=int,
            default=1,
            help="Number of times to sample minibatch during training.")
        antarg.add_argument(
            "--target_steps",
            type=int,
            default=4,
            help=
            "Copy main network to target network after this many game steps.")
        antarg.add_argument(
            "--random_starts",
            type=int,
            default=30,
            help=
            "Perform max this number of dummy actions after game restart, to produce more random game dynamics."
        )

        nvisarg = parser.add_argument_group('Visualization')
        nvisarg.add_argument(
            "--visualization_filters",
            type=int,
            default=4,
            help="Number of filters to visualize from each convolutional layer."
        )
        nvisarg.add_argument("--visualization_file",
                             default="tmp",
                             help="Write layer visualization to this file.")

        mainarg = parser.add_argument_group('Main loop')
        mainarg.add_argument(
            "--random_steps",
            type=int,
            default=50000,
            help=
            "Populate replay memory with random steps before starting learning."
        )
        mainarg.add_argument("--train_steps",
                             type=int,
                             default=250000,
                             help="How many training steps per epoch.")
        mainarg.add_argument("--test_steps",
                             type=int,
                             default=125000,
                             help="How many testing steps after each epoch.")
        mainarg.add_argument("--epochs",
                             type=int,
                             default=200,
                             help="How many epochs to run.")
        mainarg.add_argument(
            "--start_epoch",
            type=int,
            default=0,
            help=
            "Start from this epoch, affects exploration rate and names of saved snapshots."
        )
        mainarg.add_argument(
            "--play_games",
            type=int,
            default=0,
            help="How many games to play, suppresses training and testing.")
        mainarg.add_argument("--load_weights", help="Load network from file.")
        mainarg.add_argument(
            "--save_weights_prefix",
            help=
            "Save network to given file. Epoch and extension will be appended."
        )
        mainarg.add_argument("--csv_file",
                             help="Write training progress to this file.")

        comarg = parser.add_argument_group('Common')
        comarg.add_argument("--random_seed",
                            type=int,
                            help="Random seed for repeatable experiments.")
        comarg.add_argument(
            "--log_level",
            choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
            default="INFO",
            help="Log level.")
        args = parser.parse_args()
        return args
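
The act method above anneals epsilon linearly from 1.0 down to a floor of 0.1 once the warm-up phase of prelearning_steps random steps is over. Here is a standalone sketch of that schedule; the function name and default values are placeholders, and the explicit float() cast avoids the Python 2 integer division that would otherwise make the decay jump instead of annealing smoothly.

def linear_epsilon(step, prelearning_steps=50000, total_steps=1000000, eps_floor=0.1):
    # illustrative sketch of the schedule used in neonDQN.act above
    # fraction of the decay window consumed so far; clamped to 0 during warm-up
    progress = max(step - prelearning_steps, 0) / float(total_steps)
    # anneal 1.0 -> eps_floor linearly, then hold at the floor
    return max(eps_floor, 1.0 - progress)

# fully random during warm-up, floor once the decay window has elapsed
assert linear_epsilon(0) == 1.0
assert linear_epsilon(50000 + 1000000) == 0.1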
Exemplo n.º 21
0
class DFP_agent(Agent):
    """
    DFP agent implementation (for more details, look at https://arxiv.org/abs/1611.01779)
    Subclass of the abstract Agent class
    """
    def __init__(
            self,
            image_params,
            measure_params,
            goal_params,
            expectation_params,
            action_params,
            nb_action,
            logger,
            goal_mode='fixed',
            optimizer_params={
                'type': 'adam',
                'beta_1': 0.94,
                'epsilon': 10e-4,
                'lr': 10e-4,
                'clipvalue': 10
            },
            leaky_param=0.2,
            features=['frag_count', 'health', 'sel_ammo'],
            variables=['ENNEMY'],
            replay_memory={
                'max_size': 20000,
                'screen_shape': (84, 84)
            },
            decrease_eps=lambda epi: 0.1,
            step_btw_train=8,
            step_btw_save=2000,
            episode_time=1000,
            frame_skip=4,
            batch_size=64,
            time_steps=[1, 2, 4, 8, 16, 32],
            time_discount=[0., 0., 0., 0.5, 0.5, 1.],
            rel_weight=[0.5, 0.5, 1]):
        """
        Read bot parameters from different dicts and initialize the bot
        Inputs :
            dico_init_network
            dico_init_policy
        """
        #Initialize params
        self.batch_size = batch_size
        self.step_btw_train = step_btw_train
        self.step_btw_save = step_btw_save
        self.time_steps = time_steps
        self.time_discount = time_discount
        self.rel_weight = rel_weight
        self.nb_action = nb_action
        self.episode_time = episode_time
        self.frame_skip = frame_skip
        self.goal_mode = goal_mode

        self.logger = logger
        self.replay_memory_p = replay_memory
        self.variables = variables
        self.features = features
        self.n_features = len(self.features)
        self.n_goals = len(self.features) * len(self.time_steps)
        self.n_variables = len(self.variables)
        self.replay_memory = {
            'screen_shape': replay_memory['screen_shape'],
            'n_variables': self.n_variables,
            'n_features': self.n_features
        }
        self.image_size = self.replay_memory['screen_shape'][:2]
        self.decrease_eps = decrease_eps

        # init network
        self.network = self.create_network(image_params, measure_params,
                                           goal_params, expectation_params,
                                           action_params, optimizer_params,
                                           leaky_param)

        # init message
        self.logger.info('agent use {} features : {}'.format(
            self.n_features, self.features))
        self.logger.info('agent use image of size : {}'.format(
            self.image_size))
        self.logger.info(
            'agent use time discount {} with relative weights {}'.format(
                self.time_discount, self.rel_weight))

    def act_opt(self, eps, input_screen, input_game_features, goal):
        """
        Choose action according to the eps-greedy policy using the network for inference
        Inputs : 
            eps : eps parameter for the eps-greedy policy
            goal : column vector encoding the goal for each timestep and each measure
            screen : raw input from the game
            game_features : raw features from the game
        Returns an action coded by an integer
        """
        # eps-greedy policy used for exploration (if you want full exploitation, just set eps to 0)
        if (np.random.rand() < eps):
            action = np.random.randint(0, self.nb_action)
            self.logger.info('random action : {}'.format(action))
        else:
            # use trained network to choose action
            pred_measure = self.network.predict([
                input_screen[None, :, :, None], input_game_features[None, :],
                goal[None, :]
            ])
            pred_measure_calc = np.reshape(pred_measure,
                                           (self.nb_action, len(goal)))
            list_act = np.dot(pred_measure_calc, goal)
            action = np.argmax(list_act)
            self.logger.info('pred : {}'.format(pred_measure))
            self.logger.info('list_act : {}'.format(list_act))
            self.logger.info('opt action : {}'.format(action))
        return action

    def read_input_state(self,
                         screen,
                         game_features,
                         last_states,
                         after=False,
                         MAX_RANGE=255.,
                         FEATURE_RANGE=100.):
        """
        Convert the raw screen to a grey-level image at the configured resolution and normalise the selected game features
        """
        screen_process = screen
        if len(screen.shape) == 3:
            if screen.shape[-1] != 3:
                screen = np.moveaxis(screen, 0, -1)
            screen_process = cv2.cvtColor(screen, cv2.COLOR_BGR2GRAY)
        input_screen = cv2.resize(screen_process, self.image_size)
        input_screen = input_screen / MAX_RANGE - 0.5
        input_game_features = np.zeros(self.n_features)
        for i, feature in enumerate(self.features):
            input_game_features[i] = game_features[feature] / FEATURE_RANGE - 0.5

        if not after:
            last_states.append(input_screen)
            return input_screen, input_game_features

        else:
            return input_screen, input_game_features

    def train(self, experiment, nb_episodes, map_id):
        """
        Train the bot according to an eps-greedy policy
        Use a replay memory (see dedicated class)
        Inputs :
            experiment : object from the experiment class, which wraps the game engine
        """
        nb_all_steps = 0
        self.loss = []
        # create game from experiment
        experiment.start(map_id=map_id,
                         episode_time=self.episode_time,
                         log_events=False)

        # create replay memory
        self.replay_mem = ReplayMemory(self.replay_memory_p['max_size'],
                                       self.replay_memory_p['screen_shape'],
                                       type_network='DFP',
                                       n_features=self.n_features,
                                       n_goals=self.n_goals)

        # run training
        for episode in range(nb_episodes):
            print('episode {}'.format(episode))
            self.logger.info('episode {}'.format(episode))

            # initialize goal for each episode
            assert self.goal_mode in ['fixed', 'random_1', 'random_2']
            assert len(self.rel_weight) == self.n_features
            if self.goal_mode == 'fixed':
                goal = np.array(self.rel_weight)
            if self.goal_mode == 'random_1':
                goal = np.random.uniform(0, 1, size=self.n_features)
            if self.goal_mode == 'random_2':
                goal = np.random.uniform(-1, 1, size=self.n_features)

            # weight the per-feature goal by the temporal discounts (only the later time offsets carry non-zero weight)
            goal = np.outer(np.array(self.time_discount), goal).flatten()

            if episode == 0:
                experiment.new_episode()
            else:
                self.logger.info('steps elapsed in previous episode: {}'.format(nb_step))
                experiment.reset()

            # variables
            last_states = []
            nb_step = 0

            while not experiment.is_final():

                # decrease eps according to a fixed policy
                eps = self.decrease_eps(nb_all_steps)
                self.logger.info('eps for episode {} is {}'.format(
                    nb_all_steps, eps))

                # get screen and features from the game
                screen, game_variables, game_features = experiment.observe_state(
                    self.variables, self.features)

                # choose action
                input_screen, input_game_features = self.read_input_state(
                    screen, game_features, last_states)
                self.logger.info('features for episode {} is {}'.format(
                    nb_all_steps, input_game_features))
                action = self.act_opt(eps, input_screen, input_game_features,
                                      goal)

                # make action and observe resulting measurement (plays the role of the reward)
                r, screen_next, variables_next, game_features_next = experiment.make_action(
                    action, self.variables, self.features, self.frame_skip)

                # observe the post-action state (None if the episode ended)
                if not experiment.is_final():
                    input_screen_next, input_game_features_next = self.read_input_state(
                        screen, game_features, last_states, True)
                else:
                    input_screen_next = None

                self.replay_mem.add(screen1=last_states[-1],
                                    action=action,
                                    reward=r,
                                    features=input_game_features,
                                    is_final=experiment.is_final(),
                                    screen2=input_screen_next,
                                    goals=goal)

                # train network if needed
                if (nb_step % self.step_btw_train == 0) and (
                        nb_all_steps > self.time_steps[-1]) and (nb_step > 0):
                    print('updating network')
                    self.logger.info('updating network')
                    loss = self.train_network(self.replay_mem)
                    self.loss.append(loss)

                # count nb of steps since start
                nb_step += 1
                nb_all_steps += 1

            # save important features on-line
            if (episode % self.step_btw_save == 0) and (episode > 0):
                print('saving params')
                self.logger.info('saving params')
                saving_stats(episode, experiment.stats, self.network,
                             'DFP_{}'.format(experiment.scenario))

    def train_network(self, replay_memory):
        """
        Train the network on one minibatch sampled from the replay memory
        """
        # Load a batch from replay memory
        hist_size = self.time_steps
        batch = replay_memory.get_batch(self.batch_size, hist_size)

        # Store the training input
        input_screen1 = batch['screens1'][:, 0, :, :]
        action = batch['actions'][:, 0]
        current_features = batch['features'][:, 0, :]
        # define f = m_{t+tau} - m_t (future measurements relative to the current ones)
        future_features = batch['features'][:,
                                            1:, :] - current_features[:,
                                                                      None, :]
        future_features = np.reshape(
            future_features,
            (future_features.shape[0],
             future_features.shape[1] * future_features.shape[2]))
        current_goal = batch['goals'][:, 0, :]

        #        print('coucou')
        # Predict features target
        feature_target = self.network.predict(
            [input_screen1[:, :, :, None], current_features,
             current_goal])  # flattened vector of length nb_actions * len(goal)
        feature_target_reshape = np.reshape(
            feature_target,
            (feature_target.shape[0], self.nb_action, self.n_goals))

        # change value to predict with observed features
        feature_target_reshape[range(feature_target_reshape.shape[0]),
                               action, :] = future_features
        f_target = np.reshape(
            feature_target_reshape,
            (feature_target.shape[0], self.nb_action * self.n_goals))

        # compute the gradient and update the weights
        loss = self.network.train_on_batch(
            [input_screen1[:, :, :, None], current_features, current_goal],
            f_target)
        self.logger.info('loss is {}'.format(loss))

        return loss

    def decrease_eps(self, step):
        return (0.02 + 145000. / (float(step) + 150000.))

    @staticmethod
    def create_network(image_params,
                       measure_params,
                       goal_params,
                       expectation_params,
                       action_params,
                       optimizer_params,
                       leaky_param,
                       norm=True,
                       split=True):
        """
        Create the neural network proposed in the paper
        Inputs:
            image_params, measure_params, goal_params, expectation_params, action_params : dicts of layer hyper-parameters (parsed by the parse_* helpers)
            norm : whether to add the normalisation step
            split : whether to add the expectation stream
        Returns a Keras model whose flattened output has size nb_actions * goal_input_size
        """
        # check network parameters
        screen_input_size, s1, s2, s3, s4 = parse_image_params(image_params)
        measure_input_size, m1, m2, m3 = parse_measure_params(measure_params)
        goal_input_size, g1, g2, g3 = parse_goal_params(goal_params)
        nb_actions, a1 = parse_action_params(action_params)
        e1 = parse_expectation_params(expectation_params)

        # Define optimizer
        optimizer = get_optimizer(optimizer_params)

        # Image stream
        screen_input = Input(shape=screen_input_size)
        s1 = Conv2D(s1['channel'], (s1['kernel'], s1['kernel']),
                    strides=(s1['stride'], s1['stride']),
                    activation='linear',
                    kernel_initializer='he_normal')(screen_input)
        s1 = LeakyReLU(alpha=leaky_param)(s1)
        s2 = Conv2D(s2['channel'], (s2['kernel'], s2['kernel']),
                    strides=(s2['stride'], s2['stride']),
                    activation='linear',
                    kernel_initializer='he_normal')(s1)
        s2 = LeakyReLU(alpha=leaky_param)(s2)
        s3 = Conv2D(s3['channel'], (s3['kernel'], s3['kernel']),
                    strides=(s3['stride'], s3['stride']),
                    activation='linear',
                    kernel_initializer='he_normal')(s2)
        s3 = LeakyReLU(alpha=leaky_param)(s3)
        sf = Flatten()(s3)
        s4 = Dense(s4['output'],
                   activation='linear',
                   kernel_initializer='he_normal')(sf)
        s4 = LeakyReLU(alpha=leaky_param)(s4)

        # Measurement stream
        measure_input = Input(shape=(measure_input_size, ))
        m1 = Dense(m1['output'],
                   activation='linear',
                   kernel_initializer='he_normal')(measure_input)
        m1 = LeakyReLU(alpha=leaky_param)(m1)
        m2 = Dense(m2['output'],
                   activation='linear',
                   kernel_initializer='he_normal')(m1)
        m2 = LeakyReLU(alpha=leaky_param)(m2)
        m3 = Dense(m3['output'],
                   activation='linear',
                   kernel_initializer='he_normal')(m2)
        m3 = LeakyReLU(alpha=leaky_param)(m3)

        # Goal stream
        goal_input = Input(shape=(goal_input_size, ))
        g1 = Dense(g1['output'],
                   activation='linear',
                   kernel_initializer='he_normal')(goal_input)
        g1 = LeakyReLU(alpha=leaky_param)(g1)
        g2 = Dense(g2['output'],
                   activation='linear',
                   kernel_initializer='he_normal')(g1)
        g2 = LeakyReLU(alpha=leaky_param)(g2)
        g3 = Dense(g3['output'],
                   activation='linear',
                   kernel_initializer='he_normal')(g2)
        g3 = LeakyReLU(alpha=leaky_param)(g3)

        # Concatenate (image,measure,goal)
        concat = Concatenate()([s4, m3, g3])

        # Action stream with normalisation or not
        a1 = Dense(a1['output'],
                   activation='linear',
                   kernel_initializer='he_normal')(concat)
        a1 = LeakyReLU(alpha=leaky_param)(a1)
        pred = Dense(goal_input_size * nb_actions,
                     activation='linear',
                     kernel_initializer='he_normal')(a1)
        pred = LeakyReLU(alpha=leaky_param)(pred)
        pred = Reshape((nb_actions, goal_input_size))(pred)
        if norm == True:
            pred = Lambda(normalize_layer)(pred)

        if split == True:
            # Expectation stream
            e1 = Dense(e1['output'],
                       activation='linear',
                       kernel_initializer='he_normal')(concat)
            e1 = LeakyReLU(alpha=leaky_param)(e1)
            e2 = Dense(goal_input_size,
                       activation='linear',
                       kernel_initializer='he_normal')(e1)
            e2 = LeakyReLU(alpha=leaky_param)(e2)
            pred = Add()([e2, pred])

        pred = Flatten()(pred)

        # Final model
        model = Model(inputs=[screen_input, measure_input, goal_input],
                      outputs=pred)

        # compile model
        model.compile(loss='mse', optimizer=optimizer)

        return model
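
train_network above regresses the network output onto targets in which only the slice belonging to the action actually taken is replaced by the observed future-measurement differences f = m_{t+tau} - m_t; the other action slices keep their own predictions, so they contribute no gradient under the MSE loss. A NumPy sketch of that target construction follows; the helper name and toy shapes are illustrative, not taken from the class above.

import numpy as np

def build_dfp_targets(pred, actions, future_features, nb_action, n_goals):
    # illustrative helper, not part of the DFP_agent class above
    # pred: (batch, nb_action * n_goals) current network output
    # actions: (batch,) indices of the actions that were taken
    # future_features: (batch, n_goals) observed m_{t+tau} - m_t, flattened over offsets
    target = pred.reshape(pred.shape[0], nb_action, n_goals).copy()
    # overwrite only the taken action's slice with the observed outcome
    target[np.arange(target.shape[0]), actions, :] = future_features
    return target.reshape(pred.shape[0], nb_action * n_goals)

# toy usage: batch of 2, 3 actions, 4 goal entries per action
pred = np.zeros((2, 12))
targets = build_dfp_targets(pred, np.array([0, 2]), np.ones((2, 4)), 3, 4)
assert targets[0, :4].sum() == 4 and targets[1, 8:].sum() == 4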
Exemplo n.º 22
0
class Driver(object):
    '''
    A driver object for the SCRC
    '''

    def __init__(self, args):
        '''Constructor'''
        self.WARM_UP = 0
        self.QUALIFYING = 1
        self.RACE = 2
        self.UNKNOWN = 3
        self.stage = args.stage
        
        self.parser = msgParser.MsgParser()
        self.state = carState.CarState()
        self.control = carControl.CarControl()

        self.steers = [-1.0, -0.8, -0.6, -0.5, -0.4, -0.3, -0.2, -0.15, -0.1, -0.05, 0.0, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 1.0]
        self.speeds = [-1.0, -0.5, 0.0, 0.5, 1.0]
        self.num_inputs = 19
        self.num_steers = len(self.steers)
        self.num_speeds = len(self.speeds)
        self.num_actions = self.num_steers + self.num_speeds
        
        self.net = DeepQNetwork(self.num_inputs, self.num_steers, self.num_speeds, args)
        self.mem = ReplayMemory(args.replay_size, self.num_inputs, args)
        self.minibatch_size = args.batch_size

        if args.load_replay:
            self.mem.load(args.load_replay)
        if args.load_weights:
            self.net.load_weights(args.load_weights)
        self.save_weights_prefix = args.save_weights_prefix
        self.save_interval = args.save_interval
        self.save_replay = args.save_replay

        self.enable_training = args.enable_training
        self.enable_exploration = args.enable_exploration
        self.save_csv = args.save_csv
        if self.save_csv:
          self.csv_file = open(args.save_csv, "wb")
          self.csv_writer = csv.writer(self.csv_file)
          self.csv_writer.writerow(['episode', 'distFormStart', 'distRaced', 'curLapTime', 'lastLapTime', 'racePos', 'epsilon', 'replay_memory', 'train_steps'])

        self.total_train_steps = 0
        self.exploration_decay_steps = args.exploration_decay_steps
        self.exploration_rate_start = args.exploration_rate_start
        self.exploration_rate_end = args.exploration_rate_end
        self.skip = args.skip

        self.show_sensors = args.show_sensors
        self.show_qvalues = args.show_qvalues

        self.episode = 0
        self.distances = []
        self.onRestart()
        
        if self.show_sensors:
            from sensorstats import Stats
            self.stats = Stats(inevery=8)
        
        if self.show_qvalues:
            from plotq import PlotQ
            self.plotq = PlotQ(self.num_steers, self.num_speeds)

    def init(self):
        '''Return init string with rangefinder angles'''
        self.angles = [0 for x in range(19)]
        
        for i in range(5):
            self.angles[i] = -90 + i * 15
            self.angles[18 - i] = 90 - i * 15
        
        for i in range(5, 9):
            self.angles[i] = -20 + (i-5) * 5
            self.angles[18 - i] = 20 - (i-5) * 5
        
        return self.parser.stringify({'init': self.angles})

    def getState(self):
        #state = np.array([self.state.getSpeedX() / 200.0, self.state.getAngle(), self.state.getTrackPos()])
        #state = np.array(self.state.getTrack() + [self.state.getSpeedX()]) / 200.0
        state = np.array(self.state.getTrack()) / 200.0
        assert state.shape == (self.num_inputs,)
        return state

    def getReward(self, terminal):
        if terminal:
            reward = -1000
        else:
            dist = self.state.getDistFromStart()
            if self.prev_dist is not None:
                reward = max(0, dist - self.prev_dist) * 10
                assert reward >= 0, "reward: %f" % reward
            else:
                reward = 0
            self.prev_dist = dist
            
            #reward -= self.state.getTrackPos()
            #print "reward:", reward
        
        return reward

    def getTerminal(self):
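        # the track rangefinders all read -1 when the car is outside the track;
        # that situation is treated here as the terminal condition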
        return np.all(np.array(self.state.getTrack()) == -1)

    def getEpsilon(self):
        # calculate decaying exploration rate
        if self.total_train_steps < self.exploration_decay_steps:
            return self.exploration_rate_start - self.total_train_steps * (self.exploration_rate_start - self.exploration_rate_end) / self.exploration_decay_steps
        else:
            return self.exploration_rate_end
 
    def drive(self, msg):
        # parse incoming message
        self.state.setFromMsg(msg)
        
        # show sensors
        if self.show_sensors:
            self.stats.update(self.state)

        # training
        if self.enable_training and self.mem.count >= self.minibatch_size:
          minibatch = self.mem.getMinibatch()
          self.net.train(minibatch)
          self.total_train_steps += 1
          #print "total_train_steps:", self.total_train_steps

        # skip frame and use the same action as previously
        if self.skip > 0:
            self.frame = (self.frame + 1) % self.skip
            if self.frame != 0:
                return self.control.toMsg()

        # fetch state, calculate reward and terminal indicator  
        state = self.getState()
        terminal = self.getTerminal()
        reward = self.getReward(terminal)
        #print "reward:", reward

        # store new experience in replay memory
        if self.enable_training and self.prev_state is not None and self.prev_steer is not None and self.prev_speed is not None:
            self.mem.add(self.prev_state, self.prev_steer, self.prev_speed, reward, state, terminal)

        # if terminal state (out of track), then restart game
        if terminal:
            #print "terminal state, restarting"
            self.control.setMeta(1)
            return self.control.toMsg()
        else:
            self.control.setMeta(0)

        # choose actions for wheel and speed
        epsilon = self.getEpsilon()
        if self.enable_exploration and random.random() < epsilon:
            #print "random move"
            steer = random.randrange(self.num_steers)
            #speed = random.randrange(self.num_speeds)
            speed = random.randint(2, self.num_speeds-1)
        else:
            # use broadcasting to efficiently produce minibatch of desired size
            minibatch = state + np.zeros((self.minibatch_size, 1))
            Q = self.net.predict(minibatch)
            assert Q.shape == (self.minibatch_size, self.num_actions), "Q.shape: %s" % str(Q.shape)
            #print "steer Q: ", Q[0,:self.num_steers]
            #print "speed Q:", Q[0,-self.num_speeds:]
            steer = np.argmax(Q[0, :self.num_steers])
            speed = np.argmax(Q[0, -self.num_speeds:])
            if self.show_qvalues:
                self.plotq.update(Q[0])
        #print "steer:", steer, "speed:", speed

        # gears are always automatic
        gear = self.gear()

        # set actions
        self.setSteerAction(steer)
        self.setGearAction(gear)
        self.setSpeedAction(speed)

        # remember state and actions 
        self.prev_state = state
        self.prev_steer = steer
        self.prev_speed = speed

        #print "total_train_steps:", self.total_train_steps, "mem_count:", self.mem.count

        #print "reward:", reward, "epsilon:", epsilon

        return self.control.toMsg()

    def gear(self):
        rpm = self.state.getRpm()
        gear = self.state.getGear()
        
        if self.prev_rpm is None:
            up = True
        else:
            if (self.prev_rpm - rpm) < 0:
                up = True
            else:
                up = False
        
        if up and rpm > 7000 and gear < 6:
            gear += 1
        
        if not up and rpm < 3000 and gear > 0:
            gear -= 1
        
        return gear
        
    def setSteerAction(self, steer):
        assert 0 <= steer < self.num_steers
        self.control.setSteer(self.steers[steer])

    def setGearAction(self, gear):
        assert -1 <= gear <= 6
        self.control.setGear(gear)

    def setSpeedAction(self, speed):
        assert 0 <= speed < self.num_speeds
        accel = self.speeds[speed]
        if accel >= 0:
            #print "accel", accel
            self.control.setAccel(accel)
            self.control.setBrake(0)
        else:
            #print "brake", -accel
            self.control.setAccel(0)
            self.control.setBrake(-accel)
    
    def onShutDown(self):
        if self.save_weights_prefix:
            self.net.save_weights(self.save_weights_prefix + "_" + str(self.episode) + ".pkl")
        
        if self.save_replay:
            self.mem.save(self.save_replay)

        if self.save_csv:
            self.csv_file.close()

    def onRestart(self):
    
        self.prev_rpm = None
        self.prev_dist = None
        self.prev_state = None
        self.prev_steer = None
        self.prev_speed = None
        self.frame = -1

        if self.episode > 0:
            dist = self.state.getDistRaced()
            self.distances.append(dist)
            epsilon = self.getEpsilon()
            print "Episode:", self.episode, "\tDistance:", dist, "\tMax:", max(self.distances), "\tMedian10:", np.median(self.distances[-10:]), \
                "\tEpsilon:", epsilon, "\tReplay memory:", self.mem.count

            if self.save_weights_prefix and self.save_interval > 0 and self.episode % self.save_interval == 0:
                self.net.save_weights(self.save_weights_prefix + "_" + str(self.episode) + ".pkl")
                #self.mem.save(self.save_weights_prefix + "_" + str(self.episode) + "_replay.pkl")

            if self.save_csv:
                self.csv_writer.writerow([
                    self.episode, 
                    self.state.getDistFromStart(), 
                    self.state.getDistRaced(), 
                    self.state.getCurLapTime(), 
                    self.state.getLastLapTime(), 
                    self.state.getRacePos(), 
                    epsilon, 
                    self.mem.count,
                    self.total_train_steps
                ])
                self.csv_file.flush()

        self.episode += 1
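
A minimal standalone sketch (not part of the original driver) of the broadcasting trick used in drive() above: adding a zero array of shape (minibatch_size, 1) to a 1-D state vector tiles that state into a full minibatch, so the network can be queried without a separate single-sample path. The sizes below are illustrative assumptions.

import numpy as np

state = np.arange(4, dtype=np.float32)      # hypothetical 4-dimensional state
minibatch = state + np.zeros((32, 1))       # broadcasts to shape (32, 4)
assert minibatch.shape == (32, 4)
assert (minibatch == state).all()           # every row is a copy of the state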
Exemplo n.º 23
0
class Agent(object):
    def __init__(self, args, sess):
        # MNIST environment
        self.sess = sess
        self.model = Network(sess, phase='train') # MNIST accuracy model
        self.env = MnistEnvironment(self.model) 
        self.state_size = self.env.state_size
        self.action_size = self.env.action_size
        self.a_bound = self.env.a_bound
        self.train_size = len(self.env.train_images)
        self.test_size = len(self.env.test_images)
        self.learning_rate = args.learning_rate
        self.batch_size = args.batch_size
        self.discount_factor = args.discount_factor
        self.epochs = args.epochs
        self.ENV = Environment(self.env, self.state_size, self.action_size)
        self.replay = ReplayMemory(self.state_size, self.batch_size)
        self.ddpg = DDPG(self.state_size, self.action_size, self.sess, self.learning_rate[0], self.learning_rate[1], 
                         self.replay, self.discount_factor, self.a_bound)

        self.save_dir = args.save_dir
        self.render_dir = args.render_dir
        self.play_dir = args.play_dir

        # initialize
        sess.run(tf.global_variables_initializer())  # must be run after the whole TensorFlow graph has been built

        # load pre-trained mnist model
        self.env.model.checkpoint_load()
        
        self.saver = tf.train.Saver()
        self.epsilon = 1
        self.explore = 2e4
        pass

    '''
    def select_action(self, state):
        return np.clip(
            np.random.normal(self.sess.run(self.ddpg.actor, {self.ddpg.state: state})[0], self.action_variance), -2,
            2)
        pass
    '''

    def ou_function(self, mu, theta, sigma):
        x = np.ones(self.action_size) * mu
        dx = theta * (mu - x) + sigma * np.random.randn(self.action_size)
        return x + dx

    def noise_select_action(self, state):
        action = self.sess.run(self.ddpg.actor, {self.ddpg.state: state})[0]
        noise = self.epsilon * self.ou_function(0, 0.15, 0.25)
        return action + noise

    def select_action(self, state):
        return self.sess.run(self.ddpg.actor, {self.ddpg.state: state})[0]

    def train(self):
        scores, episodes = [], []
        for e in range(self.epochs):
            for i, idx in enumerate(np.random.permutation(self.train_size)):
                terminal = False
                score = 0
                state = self.ENV.new_episode(idx)
                state = np.reshape(state, [1, self.state_size])

                while not terminal:
                    action = self.noise_select_action(state)
                    next_state, reward, terminal = self.ENV.act(action)
                    state = state[0]
                    self.replay.add(state, action, reward, next_state, terminal)
    
                    if len(self.replay.memory) >= self.batch_size:
                        self.ddpg.update_target_network()
                        self.ddpg.train_network()
    
                    score += reward
                    state = np.reshape(next_state, [1, self.state_size])
    
                    if terminal:
                        scores.append(score)
                        episodes.append(e)
                        if (i+1)%10 == 0:
                            print('epoch', e+1, 'iter:', f'{i+1:05d}', ' score:', f'{score:.03f}', ' last 10 mean score', f'{np.mean(scores[-min(10, len(scores)):]):.03f}', f'sequence: {self.env.sequence}')
                        if (i+1)%500 == 0:
                            self.ENV.render_worker(os.path.join(self.render_dir, f'{(i+1):05d}.png'))
                        if (i+1)%1000 == 0:
                            self.save()

        pass

    def play(self):
        cor_before_lst, cor_after_lst = [], []
        for idx in range(self.test_size): 
            state = self.ENV.new_episode(idx, phase='test')
            state = np.reshape(state, [1, self.state_size])
    
            terminal = False
            score = 0
            while not terminal:
                action = self.select_action(state)
                next_state, reward, terminal = self.ENV.act(action)
                next_state = np.reshape(next_state, [1, self.state_size])
                score += reward
                state = next_state
#                 time.sleep(0.02)
                if terminal:
                    (cor_before, cor_after) = self.ENV.compare_accuracy()
                    cor_before_lst.append(cor_before)
                    cor_after_lst.append(cor_after)

                    self.ENV.render_worker(os.path.join(self.play_dir, f'{(idx+1):04d}.png'))
                    print(f'{(idx+1):04d} image score: {score}\n')
        print('====== NUMBER OF CORRECTION =======')
        print(f'before: {np.sum(cor_before_lst)}, after: {np.sum(cor_after_lst)}')
    pass

    def save(self):
        checkpoint_dir = os.path.join(self.save_dir, 'ckpt')
        if not os.path.exists(checkpoint_dir):
            os.mkdir(checkpoint_dir)
        self.saver.save(self.sess, os.path.join(checkpoint_dir, 'trained_agent'))

    def load(self):
        checkpoint_dir = os.path.join(self.save_dir, 'ckpt')
        self.saver.restore(self.sess, os.path.join(checkpoint_dir, 'trained_agent'))
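
Note that ou_function above rebuilds x from mu on every call, so the theta term is always zero and the exploration noise reduces to independent Gaussian noise with standard deviation sigma. A stateful Ornstein-Uhlenbeck variant would carry x between calls; the class below is a sketch under that assumption, not part of the original agent.

import numpy as np

class OUNoise:
    """Stateful Ornstein-Uhlenbeck process for temporally correlated exploration noise."""
    def __init__(self, action_size, mu=0.0, theta=0.15, sigma=0.25):
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.x = np.ones(action_size) * mu

    def reset(self):
        self.x[:] = self.mu

    def sample(self):
        dx = self.theta * (self.mu - self.x) + self.sigma * np.random.randn(*self.x.shape)
        self.x = self.x + dx
        return self.x

noise = OUNoise(action_size=2)
print([noise.sample() for _ in range(3)])   # successive samples are correlated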
Exemplo n.º 24
0
class GaussianDQN(Agent):
    def __init__(self,
                 approximator,
                 policy,
                 mdp_info,
                 batch_size,
                 target_update_frequency,
                 initial_replay_size,
                 max_replay_size,
                 fit_params=None,
                 approximator_params=None,
                 clip_reward=True,
                 update_type='weighted',
                 delta=0.1,
                 store_prob=False,
                 q_max=100,
                 max_spread=None):
        self._fit_params = dict() if fit_params is None else fit_params

        self._batch_size = batch_size
        self._clip_reward = clip_reward
        self._target_update_frequency = target_update_frequency
        self.update_type = update_type
        self.delta = delta
        self.standard_bound = norm.ppf(1 - self.delta, loc=0, scale=1)
        self.store_prob = store_prob
        self.q_max = q_max
        self.max_spread = max_spread
        self._replay_memory = ReplayMemory(initial_replay_size,
                                           max_replay_size)

        self._n_updates = 0
        self._epsilon = 1e-7
        apprx_params_train = deepcopy(approximator_params)
        apprx_params_train['name'] = 'train'
        apprx_params_target = deepcopy(approximator_params)
        apprx_params_target['name'] = 'target'
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

        super(GaussianDQN, self).__init__(policy, mdp_info)

    @staticmethod
    def _compute_prob_max(mean_list, sigma_list):
        n_actions = len(mean_list)
        lower_limit = mean_list - 8 * sigma_list
        upper_limit = mean_list + 8 * sigma_list
        epsilon = 1e2
        n_trapz = 100
        x = np.zeros(shape=(n_trapz, n_actions))
        y = np.zeros(shape=(n_trapz, n_actions))
        integrals = np.zeros(n_actions)
        for j in range(n_actions):
            if sigma_list[j] < epsilon:
                p = 1
                for k in range(n_actions):
                    if k != j:
                        p *= norm.cdf(mean_list[j],
                                      loc=mean_list[k],
                                      scale=sigma_list[k])
                integrals[j] = p
            else:
                x[:, j] = np.linspace(lower_limit[j], upper_limit[j], n_trapz)
                y[:, j] = norm.pdf(x[:, j],
                                   loc=mean_list[j],
                                   scale=sigma_list[j])
                for k in range(n_actions):
                    if k != j:
                        y[:, j] *= norm.cdf(x[:, j],
                                            loc=mean_list[k],
                                            scale=sigma_list[k])
                integrals[j] = (upper_limit[j] - lower_limit[j]) / (
                    2 * (n_trapz - 1)) * (y[0, j] + y[-1, j] +
                                          2 * np.sum(y[1:-1, j]))

        # print(np.sum(integrals))
        # assert np.isclose(np.sum(integrals), 1)
        with np.errstate(divide='raise'):
            try:
                return integrals / np.sum(integrals)
            except FloatingPointError:
                print(integrals)
                print(mean_list)
                print(sigma_list)
                input()

    def fit(self, dataset):
        mask = np.ones((len(dataset), 2))
        self._replay_memory.add(dataset, mask)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _, mask = \
                self._replay_memory.get(self._batch_size)

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next, sigma_next, prob_explore = self._next_q(
                next_state, absorbing)

            q = reward + self.mdp_info.gamma * q_next
            sigma = self.mdp_info.gamma * sigma_next
            stacked = np.stack([q, sigma])

            self.approximator.fit(state,
                                  action,
                                  stacked,
                                  prob_exploration=prob_explore,
                                  **self._fit_params)

            self._n_updates += 1

            if self._n_updates % self._target_update_frequency == 0:
                self._update_target()

    def _update_target(self):
        """
        Update the target network.

        """
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                `next_state`.

        Returns:
            Maximum action-value for each state in `next_state`.

        """
        q_and_sigma = self.target_approximator.predict(next_state).squeeze()

        q = q_and_sigma[0, :, :]
        sigma = q_and_sigma[1, :, :]
        for i in range(q.shape[0]):
            if absorbing[i]:
                q[i] *= 0
                sigma[i] *= self._epsilon
        max_q = np.zeros((q.shape[0]))
        max_sigma = np.zeros((q.shape[0]))
        probs = []
        prob_explore = np.zeros(q.shape[0])
        for i in range(q.shape[0]):  # for each batch
            means = q[i, :]
            sigmas = sigma[i, :]
            prob = GaussianDQN._compute_prob_max(means, sigmas)
            probs.append(prob)
            prob_explore[i] = 1. - np.max(prob)

        if self.update_type == 'mean':
            best_actions = np.argmax(q, axis=1)
            for i in range(q.shape[0]):
                max_q[i] = q[i, best_actions[i]]
                max_sigma[i] = sigma[i, best_actions[i]]
        elif self.update_type == 'weighted':
            for i in range(q.shape[0]):  # for each batch
                means = q[i, :]
                sigmas = sigma[i, :]
                prob = probs[i]
                max_q[i] = np.sum(means * prob)
                max_sigma[i] = np.sum(sigmas * prob)
        elif self.update_type == 'optimistic':
            for i in range(q.shape[0]):  # for each batch
                means = q[i, :]
                sigmas = sigma[i, :]
                bounds = sigmas * self.standard_bound + means
                bounds = np.clip(bounds, -self.q_max, self.q_max)
                next_index = np.random.choice(
                    np.argwhere(bounds == np.max(bounds)).ravel())
                max_q[i] = q[i, next_index]
                max_sigma[i] = sigma[i, next_index]
        else:
            raise ValueError("Update type not implemented")

        return max_q, max_sigma, np.mean(prob_explore)

    def draw_action(self, state):
        action = super(GaussianDQN, self).draw_action(np.array(state))

        return action

    def episode_start(self):
        return
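
_compute_prob_max above uses a trapezoidal integral to estimate, for independent Gaussian action-value estimates, the probability that each action has the largest sampled value. A quick Monte Carlo cross-check, shown below with made-up means and sigmas (not part of the original class), should produce similar numbers.

import numpy as np

means = np.array([1.0, 0.5, 0.2])      # hypothetical action-value means
sigmas = np.array([0.3, 0.4, 0.5])     # hypothetical action-value std devs

samples = np.random.normal(means, sigmas, size=(200000, 3))
mc_probs = np.bincount(samples.argmax(axis=1), minlength=3) / len(samples)
print(mc_probs)                        # roughly GaussianDQN._compute_prob_max(means, sigmas)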
Exemplo n.º 25
0
class Agent:
    def __init__(self, dimO, dimA):
        dimA, dimO = dimA[0], dimO[0]
        self.dimA = dimA
        self.dimO = dimO

        tau = FLAGS.tau
        discount = FLAGS.discount
        l2norm = FLAGS.l2norm
        learning_rate = FLAGS.rate
        outheta = FLAGS.outheta
        ousigma = FLAGS.ousigma

        if FLAGS.icnn_opt == 'adam':
            self.opt = self.adam
        elif FLAGS.icnn_opt == 'bundle_entropy':
            self.opt = self.bundle_entropy
        else:
            raise RuntimeError("Unrecognized ICNN optimizer: "+FLAGS.icnn_opt)

        if FLAGS.use_per:
            self.rm = PrioritizedReplayBuffer(FLAGS.rmsize, alpha=FLAGS.alpha)
            self.beta_schedule = LinearSchedule(FLAGS.beta_iters,
                                                initial_p=FLAGS.beta0,
                                                final_p=1.0)
        else:
            self.rm = ReplayMemory(FLAGS.rmsize, dimO, dimA)


        self.sess = tf.Session(config=tf.ConfigProto(
            inter_op_parallelism_threads=FLAGS.thread,
            log_device_placement=False,
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(allow_growth=True)))

        self.noise = np.zeros(self.dimA)

        obs = tf.placeholder(tf.float32, [None, dimO], "obs")
        act = tf.placeholder(tf.float32, [None, dimA], "act")
        rew = tf.placeholder(tf.float32, [None], "rew")
        per_weight = tf.placeholder(tf.float32, [None], "per_weight")

        with tf.variable_scope('q'):
            negQ = self.negQ(obs, act)
        negQ_entr = negQ - entropy(act)
        q = -negQ
        q_entr = -negQ_entr
        act_grad, = tf.gradients(negQ, act)
        act_grad_entr, = tf.gradients(negQ_entr, act)

        obs_target = tf.placeholder(tf.float32, [None, dimO], "obs_target")
        act_target = tf.placeholder(tf.float32, [None, dimA], "act_target")
        term_target = tf.placeholder(tf.bool, [None], "term_target")
        with tf.variable_scope('q_target'):
            # double Q
            negQ_target = self.negQ(obs_target, act_target)
        negQ_entr_target = negQ_target - entropy(act_target)
        act_target_grad, = tf.gradients(negQ_target, act_target)
        act_entr_target_grad, = tf.gradients(negQ_entr_target, act_target)
        q_target = -negQ_target
        q_target_entr = -negQ_entr_target

        if FLAGS.icnn_opt == 'adam':
            y = tf.where(term_target, rew, rew + discount * q_target_entr)
            y = tf.maximum(q_entr - 1., y)
            y = tf.minimum(q_entr + 1., y)
            y = tf.stop_gradient(y)
            td_error = q_entr - y
        elif FLAGS.icnn_opt == 'bundle_entropy':
            raise RuntimeError("Needs checking.")
            q_target = tf.where(term2, rew, rew + discount * q2_entropy)
            q_target = tf.maximum(q_entropy - 1., q_target)
            q_target = tf.minimum(q_entropy + 1., q_target)
            q_target = tf.stop_gradient(q_target)
            td_error = q_entropy - q_target

        if FLAGS.use_per:
            ms_td_error = tf.reduce_sum(tf.multiply(tf.square(td_error), per_weight), 0)
        else:
            ms_td_error = tf.reduce_mean(tf.square(td_error), 0)

        regLosses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope='q/')
        loss_q = ms_td_error + l2norm*tf.reduce_sum(regLosses)

        self.theta_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q/')
        self.theta_cvx_ = [v for v in self.theta_
                           if 'proj' in v.name and 'W:' in v.name]
        self.makeCvx = [v.assign(tf.abs(v)) for v in self.theta_cvx_]
        self.proj = [v.assign(tf.maximum(v, 0)) for v in self.theta_cvx_]
        # self.proj = [v.assign(tf.abs(v)) for v in self.theta_cvx_]

        self.theta_target_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                               scope='q_target/')
        update_target = [theta_target_i.assign_sub(tau*(theta_target_i-theta_i))
                    for theta_i, theta_target_i in zip(self.theta_, self.theta_target_)]

        optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate)
        grads_and_vars_q = optim_q.compute_gradients(loss_q)
        optimize_q = optim_q.apply_gradients(grads_and_vars_q)

        summary_path = os.path.join(model_path, 'board', FLAGS.exp_id)
        summary_writer = tf.summary.FileWriter(summary_path, self.sess.graph)


        if FLAGS.summary:
            if FLAGS.icnn_opt == 'adam':
                tf.summary.scalar('Q', tf.reduce_mean(q))
            elif FLAGS.icnn_opt == 'bundle_entropy':
                tf.summary.scalar('Q', tf.reduce_mean(q_entr))

            tf.summary.scalar('Q_target', tf.reduce_mean(q_target))
            tf.summary.scalar('loss', ms_td_error)
            tf.summary.scalar('reward', tf.reduce_mean(rew))
        merged = tf.summary.merge_all()


        # tf functions
        with self.sess.as_default():
            self._train = Fun([obs, act, rew, obs_target, act_target, term_target, per_weight],
                              [optimize_q, update_target, loss_q, td_error, q, q_target],
                              merged, summary_writer)
            self._fg = Fun([obs, act], [negQ, act_grad])
            self._fg_target = Fun([obs_target, act_target], [negQ_target, act_target_grad])
            self._fg_entr = Fun([obs, act], [negQ_entr, act_grad_entr])
            self._fg_entr_target = Fun([obs_target, act_target],
                                       [negQ_entr_target, act_entr_target_grad])

        # initialize tf variables
        self.saver = tf.train.Saver(max_to_keep=1)
        ckpt = tf.train.latest_checkpoint(model_path + "/tf")
        if not FLAGS.force and ckpt:
            self.saver.restore(self.sess, ckpt)
        else:
            self.sess.run(tf.global_variables_initializer())
            self.sess.run(self.makeCvx)
            self.sess.run([theta_target_i.assign(theta_i)
                    for theta_i, theta_target_i in zip(self.theta_, self.theta_target_)])

        self.sess.graph.finalize()

        self.t = 0  # global training time (number of observations)

    def bundle_entropy(self, func, obs):
        act = np.ones((obs.shape[0], self.dimA)) * 0.5
        def fg(x):
            value, grad = func(obs, 2 * x - 1)
            grad *= 2
            return value, grad

        act = bundle_entropy.solveBatch(fg, act)[0]
        act = 2 * act - 1

        return act

    def adam(self, func, obs, plot=False):
        # if npr.random() < 1./20:
        #     plot = True
        b1 = 0.9
        b2 = 0.999
        lam = 0.5
        eps = 1e-8
        alpha = 0.01
        nBatch = obs.shape[0]
        act = np.zeros((nBatch, self.dimA))
        m = np.zeros_like(act)
        v = np.zeros_like(act)

        b1t, b2t = 1., 1.
        act_best, a_diff, f_best = [None]*3
        hist = {'act': [], 'f': [], 'g': []}
        for i in range(1000):
            f, g = func(obs, act)
            if plot:
                hist['act'].append(act.copy())
                hist['f'].append(f)
                hist['g'].append(g)

            if i == 0:
                act_best = act.copy()
                f_best = f.copy()
            else:
                prev_act_best = act_best.copy()
                I = (f < f_best)
                act_best[I] = act[I]
                f_best[I] = f[I]
                a_diff_i = np.mean(np.linalg.norm(act_best - prev_act_best, axis=1))
                a_diff = a_diff_i if a_diff is None \
                         else lam*a_diff + (1.-lam)*a_diff_i
                # print(a_diff_i, a_diff, np.sum(f))
                if a_diff < 1e-3 and i > 5:
                    #print('  + Adam took {} iterations'.format(i))
                    if plot:
                        self.adam_plot(func, obs, hist)
                    return act_best

            m = b1 * m + (1. - b1) * g
            v = b2 * v + (1. - b2) * (g * g)
            b1t *= b1
            b2t *= b2
            mhat = m/(1.-b1t)
            vhat = v/(1.-b2t)

            act -= alpha * mhat / (np.sqrt(vhat) + eps)
            # act = np.clip(act, -1, 1)
            act = np.clip(act, -1.+1e-8, 1.-1e-8)

        #print('  + Warning: Adam did not converge.')
        if plot:
            self.adam_plot(func, obs, hist)
        return act_best

    def adam_plot(self, func, obs, hist):
        hist['act'] = np.array(hist['act']).T
        hist['f'] = np.array(hist['f']).T
        hist['g'] = np.array(hist['g']).T
        if self.dimA == 1:
            xs = np.linspace(-1.+1e-8, 1.-1e-8, 100)
            ys = [func(obs[[0],:], [[xi]])[0] for xi in xs]
            fig = plt.figure()
            plt.plot(xs, ys, alpha=0.5, linestyle="--")
            plt.plot(hist['act'][0,0,:], hist['f'][0,:], label="Adam's trace")
            plt.legend()

            os.makedirs(os.path.join(model_path, "adam"), exist_ok=True)
            t = time.time()
            fname = os.path.join(model_path, "adam", 'adam_plot_{}.png'.format(t))
            plt.savefig(fname)
            plt.close(fig)
        elif self.dimA == 2:
            assert(False)
        else:
            xs = npr.uniform(-1., 1., (5000, self.dimA))
            ys = np.array([func(obs[[0],:], [xi])[0] for xi in xs])
            epi = np.hstack((xs, ys))
            pca = PCA(n_components=2).fit(epi)
            W = pca.components_[:,:-1]
            xs_proj = xs.dot(W.T)
            fig = plt.figure()

            X = Y = np.linspace(xs_proj.min(), xs_proj.max(), 100)
            Z = griddata(xs_proj[:,0], xs_proj[:,1], ys.ravel(),
                         X, Y, interp='linear')

            plt.contourf(X, Y, Z, 15)
            plt.colorbar()

            adam_x = hist['act'][:,0,:].T
            adam_x = adam_x.dot(W.T)
            plt.plot(adam_x[:,0], adam_x[:,1], label='Adam', color='k')
            plt.legend()

            os.makedirs(os.path.join(model_path, "adam"), exist_ok=True)
            t = time.time()
            fname = os.path.join(model_path, "adam", 'adam_plot_{}.png'.format(t))
            plt.savefig(fname)
            plt.close(fig)

    def reset(self, obs):
        self.noise = np.zeros(self.dimA)
        self.observation = obs  # initial observation

    def act(self, test=False):
        with self.sess.as_default():
            #print('--- Selecting action, test={}'.format(test))
            obs = np.expand_dims(self.observation, axis=0)

            if FLAGS.icnn_opt == 'adam':
                f = self._fg_entr
                # f = self._fg
            elif FLAGS.icnn_opt == 'bundle_entropy':
                f = self._fg
            else:
                raise RuntimeError("Unrecognized ICNN optimizer: "+FLAGS.icnn_opt)

            tflearn.is_training(False)
            action = self.opt(f, obs)
            tflearn.is_training(not test)

            if not test:
                self.noise -= FLAGS.outheta*self.noise - \
                              FLAGS.ousigma*npr.randn(self.dimA)
                action += self.noise
            action = np.clip(action, -1, 1)

            self.action = np.atleast_1d(np.squeeze(action, axis=0))
            return self.action

    def observe(self, rew, term, obs2, test=False):
        obs1 = self.observation
        self.observation = obs2

        # train
        if not test:
            self.t = self.t + 1

            if FLAGS.use_per:
                self.rm.add(obs1, self.action, rew, obs2, float(term))
            else:
                self.rm.enqueue(obs1, term, self.action, rew)

            if self.t > FLAGS.warmup:
                for i in range(FLAGS.iter):
                    loss = self.train()

    def train(self):
        with self.sess.as_default():
            if FLAGS.use_per:
                experience = self.rm.sample(FLAGS.bsize, beta=self.beta_schedule.value(self.t))
                (obs, act, rew, ob2, term2, weights, batch_idxes) = experience
            else:
                obs, act, rew, ob2, term2, info = self.rm.minibatch(size=FLAGS.bsize)
                weights = np.ones(FLAGS.bsize)  # per_weight placeholder must still be fed



            #if np.random.uniform() > 0.7 and np.sum(rew > 0.0) >0 :
            #    print("good reward samples", 100*np.sum(rew > 0.0) / FLAGS.bsize)
            if FLAGS.icnn_opt == 'adam':
                # f = self._opt_train_entr
                f = self._fg_entr_target
                # f = self._fg_target
            elif FLAGS.icnn_opt == 'bundle_entropy':
                f = self._fg_target
            else:
                raise RuntimeError("Unrecognized ICNN optimizer: "+FLAGS.icnn_opt)
            #print('--- Optimizing for training')
            tflearn.is_training(False)
            act2 = self.opt(f, ob2, plot=FLAGS.adam_plot)
            tflearn.is_training(True)

            _, _, loss, td_error, _, _ = self._train(obs, act, rew, ob2, act2,
                                                     term2, weights,
                                                     log=FLAGS.summary,
                                                     global_step=self.t)


            if FLAGS.use_per:
                new_priorities = np.abs(td_error) + FLAGS.eps
                self.rm.update_priorities(batch_idxes, new_priorities)

            self.sess.run(self.proj)
            return loss

    def negQ(self, x, y, reuse=False):
        szs = [FLAGS.l1size, FLAGS.l2size]
        assert(len(szs) >= 1)
        fc = tflearn.fully_connected
        bn = tflearn.batch_normalization
        lrelu = tflearn.activations.leaky_relu

        if reuse:
            tf.get_variable_scope().reuse_variables()

        nLayers = len(szs)
        us = []
        zs = []
        z_zs = []
        z_ys = []
        z_us = []

        reg = 'L2'

        prevU = x
        for i in range(nLayers):
            with tf.variable_scope('u'+str(i)) as s:
                u = fc(prevU, szs[i], reuse=reuse, scope=s, regularizer=reg)
                if i < nLayers-1:
                    u = tf.nn.relu(u)
                    if FLAGS.icnn_bn:
                        u = bn(u, reuse=reuse, scope=s, name='bn')
            variable_summaries(u, suffix='u{}'.format(i))
            us.append(u)
            prevU = u

        prevU, prevZ = x, y
        for i in range(nLayers+1):
            sz = szs[i] if i < nLayers else 1
            z_add = []
            if i > 0:
                with tf.variable_scope('z{}_zu_u'.format(i)) as s:
                    zu_u = fc(prevU, szs[i-1], reuse=reuse, scope=s,
                              activation='relu', bias=True,
                              regularizer=reg, bias_init=tf.constant_initializer(1.))
                    variable_summaries(zu_u, suffix='zu_u{}'.format(i))
                with tf.variable_scope('z{}_zu_proj'.format(i)) as s:
                    z_zu = fc(tf.multiply(prevZ, zu_u), sz, reuse=reuse, scope=s,
                              bias=False, regularizer=reg)
                    variable_summaries(z_zu, suffix='z_zu{}'.format(i))
                z_zs.append(z_zu)
                z_add.append(z_zu)

            with tf.variable_scope('z{}_yu_u'.format(i)) as s:
                yu_u = fc(prevU, self.dimA, reuse=reuse, scope=s, bias=True,
                          regularizer=reg, bias_init=tf.constant_initializer(1.))
                variable_summaries(yu_u, suffix='yu_u{}'.format(i))
            with tf.variable_scope('z{}_yu'.format(i)) as s:
                z_yu = fc(tf.multiply(y, yu_u), sz, reuse=reuse, scope=s, bias=False,
                          regularizer=reg)
                z_ys.append(z_yu)
                variable_summaries(z_yu, suffix='z_yu{}'.format(i))
            z_add.append(z_yu)

            with tf.variable_scope('z{}_u'.format(i)) as s:
                z_u = fc(prevU, sz, reuse=reuse, scope=s,
                         bias=True, regularizer=reg,
                         bias_init=tf.constant_initializer(0.))
                variable_summaries(z_u, suffix='z_u{}'.format(i))
            z_us.append(z_u)
            z_add.append(z_u)

            z = tf.add_n(z_add)
            variable_summaries(z, suffix='z{}_preact'.format(i))
            if i < nLayers:
                # z = tf.nn.relu(z)
                z = lrelu(z, alpha=FLAGS.lrelu)
                variable_summaries(z, suffix='z{}_act'.format(i))

            zs.append(z)
            prevU = us[i] if i < nLayers else None
            prevZ = z

        z = tf.reshape(z, [-1], name='energies')
        return z


    def __del__(self):
        self.sess.close()
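
The adam() method above runs a hand-rolled Adam loop over the action variables rather than over network weights. The toy version below is only an illustration: it minimizes the quadratic (a - 0.3)^2 instead of -Q(obs, a), reuses the same constants, and converges to the minimizer.

import numpy as np

b1, b2, eps, alpha = 0.9, 0.999, 1e-8, 0.01
act = np.zeros(1)
m = np.zeros_like(act)
v = np.zeros_like(act)
b1t = b2t = 1.0
for _ in range(1000):
    g = 2 * (act - 0.3)                      # gradient of the toy objective
    m = b1 * m + (1 - b1) * g
    v = b2 * v + (1 - b2) * g * g
    b1t *= b1
    b2t *= b2
    mhat = m / (1 - b1t)
    vhat = v / (1 - b2t)
    act = act - alpha * mhat / (np.sqrt(vhat) + eps)
    act = np.clip(act, -1 + 1e-8, 1 - 1e-8)  # keep the "action" in the valid range
print(act)                                   # approximately [0.3]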
Exemplo n.º 26
0
class DQN(object):
    # optimizer, learning rate, activation, discount
    # CarPole
    # adam, 0.00001, tanh, 0.9
    # adagrad, 0.00001, tanh, 0.9
    # MountainCar
    # -,-,-

    def __init__(self,
                 layers,
                 hidden,
                 actionspace,
                 statespace,
                 lr=0.00001,
                 dropout=0.1,
                 activation='tanh',
                 discount=0.8,
                 epsilon=0.9,
                 epsilon_wd=0.001,
                 memory=10000,
                 start_turn=100,
                 batch_size=32,
                 update_period=100,
                 *args,
                 **kwargs):

        super(DQN, self).__init__(*args, **kwargs)

        self.discount = discount
        self.actionspace = actionspace
        self.statespace = statespace
        self.epsilon = epsilon
        self.epsilon_wd = epsilon_wd
        self.start_turn = start_turn  # start to train when size of the replay memory reaches to 'start_turn'
        self.batch_size = batch_size
        self.update_period = update_period
        assert start_turn > batch_size
        self.policy = Approxmater(layers, hidden, actionspace, statespace,
                                  dropout, activation)
        self.target_policy = Approxmater(layers, hidden, actionspace,
                                         statespace, dropout, activation)
        self.policy.collect_params().initialize(mx.init.Xavier())
        self.target_policy.collect_params().initialize(mx.init.Xavier())

        self.trainer = gluon.Trainer(self.policy.collect_params(), 'adagrad',
                                     {'learning_rate': lr})
        self.replayMemory = ReplayMemory(memory, actionspace, statespace)
        self.turn = 0
        self._copyto_target()

    def get_action(self, state):
        # trade off between exploration and exploitation using epsilon-greedy approach
        if self.epsilon > 1e-3:
            rand = np.random.choice([True, False],
                                    p=[self.epsilon, 1 - self.epsilon])
            if rand:
                index = np.random.choice(self.actionspace)
                action = np.zeros((self.actionspace, ))
                action[index] = 1
            else:
                state = mx.nd.array(state).reshape((1, self.statespace))
                qvals = np.squeeze(self.policy.forward(state).asnumpy())
                index = np.argmax(qvals)
                action = np.zeros((self.actionspace, ))
                action[index] = 1
            self.epsilon -= self.epsilon_wd
        else:
            state = mx.nd.array(state).reshape((1, self.statespace))
            qvals = np.squeeze(self.policy.forward(state).asnumpy())
            index = np.argmax(qvals)
            action = np.zeros((self.actionspace, ))
            action[index] = 1

        return action, index

    def _feed(self, state, action, reward, nextstate):
        self.replayMemory.add(state, action, reward, nextstate)

    def _copyto_target(self):
        params = []
        target_params = []
        for name, value in self.policy.collect_params().items():
            params.append(mx.nd.array(np.squeeze(value.data().asnumpy())))
        for name, value in self.target_policy.collect_params().items():
            target_params.append(value)

        assert len(params) == len(target_params)

        for i in range(len(params)):
            target_params[i].set_data(params[i])

    def train(self, state, action, reward, nextstate):
        self._feed(state, action, reward, nextstate)
        self.turn += 1

        if self.replayMemory.size() > self.start_turn:

            batch_data = {'state': [], 'action': [], 'return': []}

            memory_batch_data = self.replayMemory.get_minibatch(
                self.batch_size)
            next_maxqs = []
            for i in range(len(memory_batch_data['batch_nextstates'])):
                if memory_batch_data['batch_nextstates'][i] is None:
                    next_maxqs.append(.0)
                else:
                    next_qvals = self.target_policy.forward(
                        mx.nd.array(
                            memory_batch_data['batch_nextstates'][i]).reshape(
                                (1, self.statespace)))
                    next_maxqs.append(np.max(np.squeeze(next_qvals.asnumpy())))

            rets = np.array(memory_batch_data['batch_rewards']
                            ) + self.discount * np.array(next_maxqs)
            batch_data['state'] = memory_batch_data['batch_states']
            batch_data['action'] = memory_batch_data['batch_actions']
            batch_data['return'] = rets

            # mx.nd.squeeze hasn't been supported.
            batch_data_s = mx.nd.array(batch_data['state'])
            batch_data_a = mx.nd.array(batch_data['action'])
            batch_data_r = mx.nd.array(batch_data['return'])

            with mx.autograd.record():
                qvals = self.policy.forward(batch_data_s)
                action_qvals = mx.nd.sum(qvals * batch_data_a, axis=1).reshape(
                    (self.batch_size, ))
                sqrerror = ((action_qvals - batch_data_r)**2).reshape(
                    (self.batch_size, ))
                loss = mx.nd.sum(sqrerror, axis=0).reshape((1, ))
                loss.backward()
            self.trainer.step(self.batch_size)

        if self.turn % self.update_period == 0:
            self._copyto_target()
Exemplo n.º 27
0
        if epsilon < 0:
            epsilon = 0

        next_obs, reward, done, _ = time_step = env.step(action)
        #env.render() 
        
        terminal = 0
        reward = 0
        if done:
            terminal = 1
            if not step >= 195:
                reward = -1
        sum_reward += reward

        # add transition to replay memory
        memory.add(obs, action, reward, next_obs, terminal)
        obs = next_obs.copy()
        
        step += 1
        total_step += 1
        if total_step < initial_exploration:
            continue

        ######################
        ### Training phase ###
        ######################

        # sample a batch from replay memory
        batch = memory.sample()

        # compute Q-values and keep only the entries for the actions actually taken (batch['acs'])
Exemplo n.º 28
0
class Agent():
    def __init__(self,
                 device,
                 state_size,
                 actions_size,
                 alpha,
                 gamma,
                 TAU,
                 update_every,
                 buffer_size,
                 batch_size,
                 LR,
                 CHECKPOINT_FOLDER='./'):

        self.DEVICE = device

        self.state_size = state_size
        self.actions_size = actions_size

        self.ALPHA = alpha
        self.GAMMA = gamma
        self.TAU = TAU
        self.UPDATE_EVERY = update_every
        self.BUFFER_SIZE = buffer_size
        self.BATCH_SIZE = batch_size
        self.LR = LR

        self.CHECKPOINT_FOLDER = CHECKPOINT_FOLDER

        self.model = Model(state_size, actions_size).to(self.DEVICE)
        self.target_model = Model(state_size, actions_size).to(self.DEVICE)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.LR)

        if os.path.isfile('checkpoint.pth'):
            self.model.load_state_dict(torch.load('checkpoint.pth'))
            self.target_model.load_state_dict(torch.load('checkpoint.pth'))

        self.memory = ReplayMemory(self.BUFFER_SIZE, self.BATCH_SIZE,
                                   self.DEVICE)

        self.t_step = 0

    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.DEVICE)
        self.model.eval()
        with torch.no_grad():
            action_values = self.model(state)
        self.model.train()

        if np.random.uniform() < eps:
            return random.choice(np.arange(self.actions_size))
        else:
            action = np.argmax(action_values.cpu().data.numpy())
            return action

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences)

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        Q_targets_next = self.target_model(next_states).detach().max(
            1)[0].unsqueeze(1)
        Q_target = self.ALPHA * (rewards + self.GAMMA * Q_targets_next *
                                 (1 - dones))

        Q_value = self.model(states).gather(1, actions)

        loss = F.smooth_l1_loss(Q_value, Q_target)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # update target model
        self.soft_update_target_model()

    def soft_update_target_model(self):
        for target_param, local_param in zip(self.target_model.parameters(),
                                             self.model.parameters()):
            target_param.data.copy_(self.TAU * local_param.data +
                                    (1.0 - self.TAU) * target_param.data)

    def checkpoint(self):
        torch.save(self.model.state_dict(),
                   self.CHECKPOINT_FOLDER + 'checkpoint.pth')
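
soft_update_target_model above applies the Polyak update theta_target <- TAU * theta_local + (1 - TAU) * theta_target on every learning step. The tiny numeric sketch below (hypothetical scalar "weights", not part of the agent) shows how slowly the target tracks the online value for a small TAU.

tau = 0.001
target_w, local_w = 0.0, 1.0
for _ in range(1000):
    target_w = tau * local_w + (1.0 - tau) * target_w
print(target_w)   # about 1 - (1 - tau) ** 1000, i.e. roughly 0.632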
Exemplo n.º 29
0
class Actor:
    def __init__(self, actor_id, n_actors, shared_dict, device='cpu'):
        # params
        self.gamma = 0.99
        self.epsilon = 0.4 ** (1 + actor_id * 7 / (n_actors - 1))
        self.bootstrap_steps = 3
        self.alpha = 0.6
        self.priority_epsilon = 1e-6
        self.device = device
        self.actor_id = actor_id

        # path
        self.memory_path = os.path.join(
            './', 'logs', 'memory')

        # memory
        self.memory_size = 50000
        self.batch_size = 32
        self.action_repeat = 4
        self.n_stacks = 4
        self.burn_in_length = 10
        self.learning_length = 10
        self.overlap_length = 10
        self.eta = 0.9
        self.sequence_length = self.burn_in_length + self.learning_length
        self.stack_count = self.n_stacks // self.action_repeat
        self.memory_save_interval = 5
        self.episode_start_index = 0
        self.n_steps_memory = NStepMemory(self.bootstrap_steps, self.gamma)
        self.replay_memory = ReplayMemory(self.memory_size, self.batch_size, self.bootstrap_steps)

        # net
        self.shared_dict = shared_dict
        self.net_load_interval = 5
        self.net = QNet(self.device).to(self.device)
        self.target_net = QNet(self.device).to(self.device)
        self.target_net.load_state_dict(self.net.state_dict())

        # env
        self.env = PongEnv(self.action_repeat, self.n_stacks)
        self.episode_reward = 0
        self.n_episodes = 0
        self.n_steps = 0
        self.memory_count = 0
        self.state = self.env.reset()
    
    def run(self):
        while True:
            self.step()

    def step(self):
        state = self.state
        action, q_value, h, c, target_q_value, target_h, target_c = self.select_action(state)
        q_value = q_value.detach().cpu().numpy()
        target_q_value = target_q_value.detach().cpu().numpy()
        next_state, reward, done, _ = self.env.step(action)
        self.episode_reward += reward
        self.n_steps += 1

        self.n_steps_memory.add(q_value, state[-self.action_repeat:], h, c, target_h, target_c, action, reward, self.stack_count)
        if self.stack_count > 1:
            self.stack_count -= 1
        
        if self.n_steps > self.bootstrap_steps:
            pre_q_value, state, h, c, target_h, target_c, action, reward, stack_count = self.n_steps_memory.get()
            priority = self.calc_priority(pre_q_value, action, reward, q_value, target_q_value, done)
            self.replay_memory.add(state, h, c, target_h, target_c, action, reward, done, stack_count, priority)
            self.memory_count += 1
        self.state = next_state.copy()

        if done:
            while self.n_steps_memory.size > 0:
                pre_q_value, state, h, c, target_h, target_c, action, reward, stack_count = self.n_steps_memory.get()
                priority = self.calc_priority(pre_q_value, action, reward, q_value, target_q_value, done)
                self.replay_memory.add(state, h, c, target_h, target_c, action, reward, done, stack_count, priority)
                self.memory_count += 1
            self.reset()
    
    def select_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        with torch.no_grad():
            q_value, h, c = self.net(state, True)
            target_q_value, target_h, target_c = self.target_net(state, True)
        if np.random.random() < self.epsilon:
            action = np.random.randint(6)
        else:
            action = q_value.argmax().item()

        return action, q_value, h, c, target_q_value, target_h, target_c
    
    def reset(self):
        if self.n_episodes % 1 == 0:
            print('episodes:', self.n_episodes, 'actor_id:', self.actor_id, 'return:', self.episode_reward)

        self.net.reset()
        self.target_net.reset()
        self.set_seq_start_index()
        self.state = self.env.reset()
        self.episode_start_index = self.replay_memory.index
        self.episode_reward = 0
        self.n_episodes += 1
        self.n_steps = 0
        self.memory_count = 0
        self.stack_count = self.n_stacks // self.action_repeat

        # reset n_step memory
        self.n_steps_memory = NStepMemory(self.bootstrap_steps, self.gamma)

        # save replay memory
        if self.n_episodes % self.memory_save_interval == 0:
            self.replay_memory.save(self.memory_path, self.actor_id)
            self.replay_memory = ReplayMemory(self.memory_size, self.batch_size, self.bootstrap_steps)
            self.episode_start_index = 0
            gc.collect()
        
        # load net
        if self.n_episodes % self.net_load_interval == 0:
            self.load_model()
    
    def load_model(self):
        try:
            self.net.load_state_dict(self.shared_dict['net_state'])
            self.target_net.load_state_dict(self.shared_dict['target_net_state'])
        except Exception:
            print('load error')

    def calc_priority(self, q_value, action, reward, next_q_value, target_next_q_value, done):
        q_value = q_value.reshape(-1)[action]
        target_next_q_value = target_next_q_value.reshape(-1)

        if done:
            target_q_value = reward
        else:
            next_action = next_q_value.argmax(-1)
            target_next_q_value = target_next_q_value[next_action]
            target_q_value = reward + (self.gamma**self.bootstrap_steps) * target_next_q_value
        priority = np.abs(q_value - target_q_value) + self.priority_epsilon
        priority = priority ** self.alpha
    
        return priority
    
    def set_seq_start_index(self):
        last_index = self.replay_memory.index
        start_index  = self.episode_start_index

        seq_start_index = [i for i in range(start_index, last_index-self.sequence_length, self.overlap_length)]
        seq_start_index.append(last_index - self.sequence_length)
        seq_start_index = np.array(seq_start_index)
        self.replay_memory.update_sequence_priority(seq_start_index)
        self.replay_memory.memory['is_seq_start'][seq_start_index] = 1
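
calc_priority above forms a double-DQN style n-step target and raises the absolute TD error to the power alpha, as in prioritized replay. A worked numeric example of the same arithmetic, with all values hypothetical:

gamma, n_steps, alpha, eps = 0.99, 3, 0.6, 1e-6    # constants as set in Actor.__init__
q_sa = 1.2                   # online-net Q(s, a) stored n steps ago
reward = 0.5                 # accumulated n-step discounted reward
target_next_q = 2.0          # target-net value of the online net's argmax action
target = reward + gamma ** n_steps * target_next_q
priority = (abs(q_sa - target) + eps) ** alpha
print(round(target, 4), round(priority, 4))        # 2.4406 1.1381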
Exemplo n.º 30
0
class DQNAgent(tf.keras.Model):
    def __init__(self,
                 state_shape=(-1, 80, 80, 1),
                 action_dim=4,
                 checkpoint_directory="models_checkpoints/rl/",
                 batch_size=BATCH_SIZE,
                 initial_epsilon=INITIAL_EPSILON,
                 final_epsilon=FINAL_EPSILON,
                 exploration_steps=EXPLORATION_STEPS,
                 observation_steps=OBSERVATION_STEPS,
                 loading_step=None,
                 device_name='cpu:0'):

        super(DQNAgent, self).__init__()
        # state's shape , in Atari we will use (-1, 105, 80, 1)
        self.state_shape = state_shape
        # number of actions, in Atari 4
        self.action_dim = action_dim
        # saving checkpoint directory
        self.checkpoint_directory = checkpoint_directory

        self.initial_epsilon = initial_epsilon
        self.final_epsilon = final_epsilon

        # init q layers
        self.conv1 = tf.layers.Conv2D(32, 8, 8, padding='same', activation=tf.nn.relu)
        self.batch1 = tf.layers.BatchNormalization()
        self.conv2 = tf.layers.Conv2D(64, 4, 4, padding='same', activation=tf.nn.relu)
        self.batch2 = tf.layers.BatchNormalization()
        self.conv3 = tf.layers.Conv2D(64, 3, 3, padding='same', activation=tf.nn.relu)
        self.flatten = tf.layers.Flatten()

        self.dense1 = tf.layers.Dense(512, activation=tf.nn.relu)
        self.dense2 = tf.layers.Dense(action_dim, activation=None)

        self.base_layers = [self.conv1, self.batch1, self.conv2, self.batch2, self.conv3, self.flatten, self.dense1,
                            self.dense2]

        # target q layers
        self.conv1_t = tf.layers.Conv2D(32, 8, 8, padding='same', activation=tf.nn.relu)
        self.batch1_t = tf.layers.BatchNormalization()
        self.conv2_t = tf.layers.Conv2D(64, 4, 4, padding='same', activation=tf.nn.relu)
        self.batch2_t = tf.layers.BatchNormalization()
        self.conv3_t = tf.layers.Conv2D(64, 3, 3, padding='same', activation=tf.nn.relu)
        self.flatten_t = tf.layers.Flatten()

        self.dense1_t = tf.layers.Dense(512, activation=tf.nn.relu)
        self.dense2_t = tf.layers.Dense(action_dim, activation=None)

        self.target_layers = [self.conv1_t, self.batch1_t, self.conv2_t, self.batch2_t, self.conv3_t, self.flatten_t,
                              self.dense1_t, self.dense2_t]

        # learning optimizer
        self.optimizer = tf.train.AdamOptimizer(LEARNING_RATE)

        # epsilon-greedy
        self.epsilon = initial_epsilon
        self.epsilon_step = (initial_epsilon - final_epsilon) / exploration_steps

        # replay_memory
        self.replay_memory = ReplayMemory(500000)
        self.batch_size = batch_size

        # for logging
        self.step_count = 0
        self.sum_loss = 0

        # loading
        if loading_step == "latest":
            self.load_last_checkpoint()

        elif loading_step:
            self.load_specific_checkpoint(loading_step)
            self.step_count += loading_step

        self.observation_steps = observation_steps + self.step_count
        self.exploration_steps = exploration_steps + self.step_count

        # device configuration
        self.device_name = device_name

    def predict(self, state_batch, training):

        # you can use prediction with numpy array state input
        if isinstance(state_batch, (np.ndarray, np.generic)):
            state_batch = np.reshape(state_batch, self.state_shape)
            state_batch = tf.convert_to_tensor(state_batch)

        x = self.conv1(state_batch)
        x = self.batch1(x, training=training)
        x = self.conv2(x)
        x = self.batch2(x, training=training)
        x = self.conv3(x)
        x = self.flatten(x)
        x = self.dense1(x)
        x = self.dense2(x)

        return x

    def predict_target(self, state_batch, training):

        # you can use prediction with numpy array state input
        if isinstance(state_batch, (np.ndarray, np.generic)):
            state_batch = np.reshape(state_batch, self.state_shape)
            state_batch = tf.convert_to_tensor(state_batch)

        x = self.conv1_t(state_batch)
        x = self.batch1_t(x, training=training)
        x = self.conv2_t(x)
        x = self.batch2_t(x, training=training)
        x = self.conv3_t(x)
        x = self.flatten_t(x)
        x = self.dense1_t(x)
        x = self.dense2_t(x)

        return x

    def copy_base_to_target(self):
        """copy base's weights to target"""
        for idx_layer in range(len(self.base_layers)):
            base = self.base_layers[idx_layer]
            target = self.target_layers[idx_layer]
            for idx_weight in range(len(base.weights)):
                tf.assign(target.weights[idx_weight], base.weights[idx_weight])
            if hasattr(base, "bias"):
                tf.assign(target.bias, base.bias)

    @staticmethod
    def huber_loss(labels, predictions):
        error = labels - predictions
        quadratic_term = error * error / 2
        linear_term = abs(error) - 1 / 2
        use_linear_term = tf.convert_to_tensor((abs(error) > 1.0).numpy().astype("float32"))

        return use_linear_term * linear_term + (1 - use_linear_term) * quadratic_term

    def loss(self, state_batch, target, training):
        predictions = self.predict(state_batch, training)
        # loss_value = tf.losses.mean_squared_error(labels=target, predictions=predictions)
        loss_value = self.huber_loss(labels=target, predictions=predictions)
        self.sum_loss += tf.reduce_sum(loss_value).numpy()
        return loss_value

    def grad(self, state_batch, target, training):
        with tfe.GradientTape() as tape:
            loss_value = self.loss(state_batch, target, training)
        return tape.gradient(loss_value, self.variables)

    def get_action(self, state, training=False):
        if training:
            if self.epsilon >= random.random():
                action = tf.convert_to_tensor(random.randrange(self.action_dim))
            else:
                action = tf.argmax(self.predict(state, training=training), 1)

            if self.epsilon > self.final_epsilon and self.step_count > self.observation_steps:
                self.epsilon -= self.epsilon_step

            return action

        else:
            return tf.argmax(self.predict(state, training=training), 1)

    def step(self, state, action, reward, next_state, terminal):
        if self.step_count <= self.observation_steps:
            self.observe(state, action, reward, next_state, terminal)
            if self.step_count % 5000 == 0:
                print("OBSERVATION %s : EPSILON [%6f]...." % (self.step_count, self.epsilon))
        else:
            self.fit(state, action, reward, next_state, terminal)



        self.step_count += 1

    def observe(self, state, action, reward, next_state, terminal):
        self.replay_memory.add(state, action, reward, next_state, terminal)

    def fit(self, state, action, reward, next_state, terminal, num_epochs=1):

        self.replay_memory.add(state, action, reward, next_state, terminal)

        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = self.replay_memory.get_batch(
            self.batch_size)

        now_q = np.zeros((self.batch_size, self.action_dim))

        target_q_batch = self.predict_target(next_state_batch, training=False)

        y_batch = reward_batch * REWARD_WEIGHT + (1 - terminal_batch) * GAMMA * np.max(target_q_batch, axis=1)

        for i in range(self.batch_size):
            now_q[i, action_batch[i]] = y_batch[i]

        with tf.device(self.device_name):
            for i in range(num_epochs):
                grads = self.grad(state_batch, now_q, True)
                self.optimizer.apply_gradients(zip(grads, self.variables))

        if self.step_count % 5000 == 0:
            print("STEP %s : EPSILON [%6f]...." % (self.step_count, self.epsilon))
            # Average of the batch losses accumulated since the last report 5000 steps ago.
            print("loss: %6f" % (self.sum_loss / 5000))
            self.sum_loss = 0
            print(self.predict(state_batch[0:1], training=False).numpy())
            print("=============================================")
            self.save(global_step=self.step_count)

        return

    def save(self, global_step=0):
        tfe.Saver(self.variables).save(self.checkpoint_directory, global_step=global_step)

    def load_last_checkpoint(self):
        # Run the model once to initialize variables
        initial_shape = list(self.state_shape)
        initial_shape[0] = 1
        dummy_input = tf.zeros(tuple(initial_shape))
        dummy_pred = self.predict(dummy_input, training=False)
        # Restore the variables of the model
        saver = tfe.Saver(self.variables)
        latest_checkpoint = tf.train.latest_checkpoint(self.checkpoint_directory)
        from colorama import Fore, Style
        print(Fore.CYAN + "loading " + latest_checkpoint)
        print(Style.RESET_ALL)
        saver.restore(latest_checkpoint)
        # save() appends "-<global_step>" to the checkpoint prefix, so the step count
        # can be recovered from the suffix of the checkpoint name.
        self.step_count = int(latest_checkpoint.rsplit('-', 1)[-1])

    def load_specific_checkpoint(self, step_number):
        # Run the model once to initialize variables
        initial_shape = list(self.state_shape)
        initial_shape[0] = 1
        dummy_input = tf.zeros(tuple(initial_shape))
        dummy_pred = self.predict(dummy_input, training=False)
        # Restore the variables of the model
        saver = tfe.Saver(self.variables)
        name = self.checkpoint_directory + "-" + str(step_number)
        from colorama import Fore, Style
        print(Fore.CYAN + "loading " + name)
        print(Style.RESET_ALL)
        saver.restore(name)
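A rough usage sketch for the agent above. It assumes an already-constructed instance named `agent` and preprocessed 84x84x4 frame stacks; the placeholder environment step and all surrounding names are illustrative, not part of the example.

import numpy as np

# agent = <construct the eager-mode DQN agent defined above>

state = np.zeros((84, 84, 4), dtype=np.float32)            # stand-in preprocessed frame stack
for _ in range(10):
    # Epsilon-greedy action from the online network (batch dimension added explicitly).
    action = int(agent.get_action(state[np.newaxis], training=True))
    # A real loop would step an environment here; random data keeps the sketch minimal.
    next_state = np.random.rand(84, 84, 4).astype(np.float32)
    reward, done = 0.0, False
    # step() records the transition and, once past observation_steps, trains via fit().
    agent.step(state, action, reward, next_state, done)
    state = next_state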
Exemplo n.º 31
0
            # Epsilon-greedy exploration: either act randomly or query one of the two
            # estimators, alternating between them on even and odd steps.
            if np.random.rand(1) < random_action_probability:
                action = env.action_space.sample()
            else:
                if global_step % 2 == 0:
                    action = estimator_1.predict(sess, [state])[0]
                else:
                    action = estimator_2.predict(sess, [state])[0]

            # Decay the exploration probability multiplicatively until it reaches its floor.
            if random_action_probability > random_action_probability_end:
                random_action_probability *= random_action_probability_decay

            next_state, reward, done, _ = env.step(action)

            replay_memory.add(state, action, reward, next_state, done)

            batch_s, batch_a, batch_r, batch_s1, batch_d = replay_memory.get_samples(
                batch_size)
            if batch_s.shape[0] == batch_size:
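                # Double-DQN-style update: the estimator being trained this step bootstraps
                # from the other estimator, and the two swap roles on alternating steps.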
                if global_step % 2 == 0:
                    estimator_1.update(sess, estimator_2, batch_s, batch_a,
                                       batch_r, batch_s1, batch_d)
                else:
                    estimator_2.update(sess, estimator_1, batch_s, batch_a,
                                       batch_r, batch_s1, batch_d)

            global_step += 1

            if done:
                recent_timesteps.append(t + 1)
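The alternating updates above implement a double-estimator scheme. Below is a self-contained NumPy sketch of the target such an `update()` call would typically regress toward; the function and array names are illustrative, not the estimators' actual API.

import numpy as np

def double_dqn_targets(q_online_next, q_other_next, rewards, dones, gamma=0.99):
    """Double-DQN target: the online net selects the action, the other net evaluates it."""
    best_actions = np.argmax(q_online_next, axis=1)                        # argmax_a Q_online(s', a)
    evaluated = q_other_next[np.arange(len(best_actions)), best_actions]   # Q_other(s', argmax)
    return rewards + (1.0 - dones) * gamma * evaluated

# Tiny batch of two transitions: the second one is terminal, so it gets no bootstrap term.
q1 = np.array([[1.0, 2.0], [0.5, 0.2]])
q2 = np.array([[0.9, 1.5], [0.4, 0.3]])
print(double_dqn_targets(q1, q2, rewards=np.array([1.0, 0.0]), dones=np.array([0.0, 1.0])))
# -> [2.485, 0.0]  (i.e. 1.0 + 0.99 * 1.5 and 0.0)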
Exemplo n.º 32
0
class Agent():

    def __init__(self, n_actions):
        self.n_actions = n_actions
        self.ep_start = 1
        self.ep = self.ep_start
        self.ep_end = self.ep
        self.ep_endt = 1000000
        self.max_reward = 1
        self.min_reward = -1  # rewards are clipped to [min_reward, max_reward] = [-1, 1] in perceive()
        self.valid_size = 500
        self.discount = 0.99
        self.update_freq = 1
        self.n_replay = 1
        self.learn_start = 2000 #50000
        self.hist_len = 1
        self.bestq = 0
        self.nonTermProb = 1
        self.buffer_size = 512
        self.num_steps = 0
        self.last_state = None
        self.last_action = None
        self.v_avg = 0
        self.tderr_avg = 0
        self.q_max = 1
        self.r_max = 1
        self.rescale_r = 1
        self.state_dim = 84*84
        self.replay_memory = ReplayMemory(n_actions)
        self.target_q_net = Model()
        # The methods below also rely on an online Q-network and a minibatch size,
        # neither of which is set in this snippet; the two lines below are assumed defaults.
        self.network = Model()
        self.minibatch_size = 32
        # q_learn_minibatch() additionally assumes flat weight/gradient buffers and
        # optimizer state (self.w, self.dw, self.g, self.g2, self.wc, self.lr_start,
        # self.lr_end, self.lr_endt) that are initialised elsewhere.

    def sample_validation_data(self):
        s,a,r,s2,term = self.replay_memory.sample(self.valid_size)
        self.valid_s = np.copy(s)
        self.valid_a = np.copy(a)
        self.valid_r = np.copy(r)
        self.valid_s2 = np.copy(s2)
        self.valid_term = np.copy(term)
    
    def preprocess(self, state):
        return state.copy().reshape(self.state_dim)     
    
    #FIX TESTING_EP. It should not be 1
    def perceive(self, reward, state, terminal, testing=False, testing_ep=1):
        state = self.preprocess(state)
        
        if self.max_reward:
          reward = min(reward, self.max_reward) #check paper

        if self.min_reward:
          reward = max(reward, self.min_reward)

        if self.rescale_r:
          self.r_max = max(self.r_max, reward)

        self.replay_memory.add_recent_state(state, terminal)
        current_full_state = self.replay_memory.get_recent()

        if not (self.last_state is None) and (not testing):
          self.replay_memory.add(self.last_state, self.last_action, reward, self.last_terminal)

        if self.num_steps == self.learn_start + 1 and not testing:
          self.sample_validation_data()

        curr_state = self.replay_memory.get_recent()

        action_index = 0
        if not terminal:
          action_index = self.e_greedy(curr_state)

        self.replay_memory.add_recent_action(action_index)

        if self.num_steps > self.learn_start and not testing and self.num_steps % self.update_freq == 0:
            self.q_learn_minibatch()

        if not testing:
          self.num_steps += 1

        self.last_state = np.copy(state)
        self.last_action = action_index
        self.last_terminal = terminal

        if not terminal:
          return action_index
        else:
          return -1

    def e_greedy(self, state):
        ep_test = (self.ep_end + max(0, (self.ep_start - self.ep_end)*(self.ep_endt - max(0, self.num_steps - self.learn_start))/self.ep_endt))
        if np.random.uniform(0,1) < ep_test:
          return np.random.randint(self.n_actions)
        else:
          return self.greedy(state)

    def greedy(self, state):
        q = self.network.forward(state)
        maxq = q[0]
        besta = [0]
        # Scan the remaining actions, keeping every action that ties with the maximum.
        for a in range(1, len(q)):
            v = q[a]
            if v > maxq:
                besta = [a]
                maxq = v
            # can I compare float like that o_O. It's from google!
            elif v == maxq:
                besta.append(a)
        self.bestq = maxq
        # Break ties uniformly at random.
        self.last_action = random.choice(besta)
        return self.last_action


    def get_q_update(self, s, a, r, s2, term, update_qmax=True):
        # delta = r + (1-terminal)*gamma*max_a Q(s2, a) - Q(s,a)

        # Flip the terminal flags so terminal transitions contribute no bootstrap term.
        term = 1.0 - np.asarray(term, dtype=np.float32)

        # max_a Q(s2, a) from the target network
        q2_max = np.max(self.target_q_net.forward(s2), axis=1)

        # compute q2 = (1-terminal) * gamma * max_a Q(s2, a)
        q2 = q2_max * self.discount * term

        delta = np.asarray(r, dtype=np.float32).copy()
        delta += q2

        # Q(s, a) for the actions that were actually taken
        q_all = self.network.forward(s)
        q = np.zeros(q_all.shape[0])
        for i in range(q_all.shape[0]):
            q[i] = q_all[i][a[i]]
        delta -= q

        # Scatter the TD errors into a (minibatch_size, n_actions) target matrix;
        # only the taken action of each transition receives a non-zero entry.
        targets = np.zeros((self.minibatch_size, self.n_actions), dtype=np.float32)
        for i in range(min(self.minibatch_size, len(a))):
            targets[i][a[i]] = delta[i]

        return targets, delta, q2_max

    def q_learn_minibatch(self):
        # w += alpha * (r + gamma max Q(s2, a2) - Q(s, a)) * dQ(s,a) / dw
        s, a, r, s2, term = self.replay_memory.sample(self.minibatch_size)

        targets, delta, q2_max = self.get_q_update(s, a, r, s2, term, update_qmax=True)

        # self.dw is assumed to alias the network's flat gradient buffer, which
        # backward() fills with the gradient of the loss w.r.t. the weights.
        self.dw.fill(0)
        self.network.backward(s, targets)

        # Add the weight-decay term to the gradient.
        self.dw += -self.wc * self.w

        # Linearly anneal the learning rate from lr_start to lr_end over lr_endt steps.
        t = max(0, self.num_steps - self.learn_start)
        self.lr = (self.lr_start - self.lr_end) * (self.lr_endt - t) / self.lr_endt + self.lr_end
        self.lr = max(self.lr, self.lr_end)

        # RMSProp-style statistics: running means of the gradient and of its square.
        self.g = self.g * 0.95 + 0.05 * self.dw
        tmp = self.dw * self.dw
        self.g2 = self.g2 * 0.95 + 0.05 * tmp
        # Per-parameter gradient variance estimate plus a damping constant, then its square root.
        tmp = np.sqrt(self.g2 - self.g * self.g + 0.01)

        # Accumulate the update, scaling the gradient by lr / sqrt(variance + 0.01).
        self.w += np.divide(self.dw, tmp) * self.lr
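For reference, here is a standalone NumPy sketch of the RMSProp-style update that q_learn_minibatch() performs; all names are illustrative and the 0.95 / 0.01 constants mirror the hard-coded values above.

import numpy as np

def rmsprop_step(w, dw, g, g2, lr, decay=0.95, eps=0.01):
    """One update: track running means of the gradient and squared gradient,
    then scale the step by the estimated per-parameter gradient spread."""
    g = decay * g + (1 - decay) * dw
    g2 = decay * g2 + (1 - decay) * dw * dw
    step = lr * dw / np.sqrt(g2 - g * g + eps)
    return w + step, g, g2

w = np.zeros(3)
g = np.zeros(3)
g2 = np.zeros(3)
w, g, g2 = rmsprop_step(w, dw=np.array([0.1, -0.2, 0.3]), g=g, g2=g2, lr=0.00025)
print(w)   # a small step along the gradient, normalised per parameter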