Example #1
    env.init()

    for epoch in range(1, num_epochs + 1):
        steps, num_episodes = 0, 0
        losses, rewards = [], []
        env.display_screen = False

        # training loop
        while steps < num_steps_train:
            episode_reward = 0.0
            agent.start_episode()

            while not env.game_over() and steps < num_steps_train:
                state = env.getGameState()
                reward, action = agent.act(state, epsilon=epsilon)
                memory.add([state, action, reward, env.game_over()])

                if steps % update_frequency == 0:
                    loss = memory.train_agent_batch(agent)

                    if loss is not None:
                        losses.append(loss)
                        epsilon = max(epsilon_min, epsilon - epsilon_rate)

                episode_reward += reward
                steps += 1

            if num_episodes % 5 == 0:
                print "Episode {:01d}: Reward {:0.1f}".format(num_episodes, episode_reward)

            rewards.append(episode_reward)
Example #2
import os
import shutil
import time

import numpy as np
import tensorflow as tf

# BaseTFModel, ReplayMemory, Transition, dense_nn, plot_learning_curve and
# makedirs are assumed to be provided by the surrounding project.

class DqnPolicy(BaseTFModel):
    def __init__(self,
                 env,
                 training,
                 name=None,
                 model_path=None,
                 gamma=0.99,
                 lr=0.001,
                 lr_decay=1.0,
                 epsilon=1.0,
                 epsilon_final=0.02,
                 batch_size=32,
                 memory_capacity=100000,
                 model_params={},
                 layer_sizes=[32, 32],
                 target_update_type='hard',
                 target_update_params={},
                 double_q=True,
                 dueling=True,
                 **kwargs):
        if name is None:
            self.name = self.__class__.__name__
        else:
            self.name = name
        if model_path is None:
            self.model_path = os.path.join('model', self.name)
        else:
            self.model_path = model_path
        self.env = env
        self.training = training
        self.gamma = gamma
        self.lr = lr
        self.lr_decay = lr_decay
        self.epsilon = epsilon
        self.epsilon_final = epsilon_final
        self.batch_size = batch_size
        self.memory_capacity = memory_capacity
        self.model_params = model_params
        self.layer_sizes = layer_sizes
        self.double_q = double_q
        self.dueling = dueling

        self.target_update_type = target_update_type
        self.target_update_every_step = target_update_params.get(
            'every_step', 100)
        self.target_update_tau = target_update_params.get('tau', 0.05)

        self.memory = ReplayMemory(capacity=memory_capacity)

        self.action_size = self.env.action_space.n
        self.state_size = np.prod(list(self.env.observation_space.shape))
        print('action_size: {a}, state_size: {s}'.format(
            a=self.action_size, s=self.state_size))

        if self.training:
            # clear existing model files
            if os.path.exists(self.model_path):
                print('deleting existing model files at {}'.format(
                    self.model_path))
                if os.path.isdir(self.model_path):
                    shutil.rmtree(self.model_path)
                else:
                    os.remove(self.model_path)

        BaseTFModel.__init__(self,
                             self.name,
                             self.model_path,
                             saver_max_to_keep=5)

        print('building graph ...')
        with self.graph.as_default():
            self.__build_graph()

    def act(self, state, epsilon=0.1):
        """
        :param state: 1d np.ndarray
        :param epsilon:
        :return: int
        """
        assert isinstance(state, np.ndarray) and state.ndim == 1
        if self.training and np.random.random() < epsilon:
            return self.env.action_space.sample()

        with self.sess.as_default():
            return self.actions_selected_by_q.eval(
                {self.states: state.reshape((1, -1))})[0]

    def train(self,
              n_episodes=500,
              annealing_episodes=450,
              every_episode=10,
              **kwargs):
        if not self.training:
            raise Exception(
                'train() must not be called on a non-training model')

        reward_history = [0.0]
        reward_averaged = []
        lr = self.lr
        eps = self.epsilon
        annealing_episodes = annealing_episodes or n_episodes
        eps_drop = (self.epsilon - self.epsilon_final) / annealing_episodes
        print "eps_drop: {}".format(eps_drop)
        step = 0

        # calling the property method of BaseTFModel to start a session
        self.sess.run(self.init_vars)
        self.__init_target_q_net()

        for n_episode in range(n_episodes):
            ob = self.env.reset()
            done = False
            traj = []
            reward = 0.
            while not done:
                a = self.act(ob, eps)
                assert a >= 0
                new_ob, r, done, _ = self.env.step(a)
                step += 1
                reward += r
                traj.append(Transition(ob, a, r, new_ob, done))
                ob = new_ob

                # Not enough samples in the buffer yet.
                if self.memory.size < self.batch_size:
                    continue
                # Training with a mini batch of samples
                batch_data = self.memory.sample(self.batch_size)
                feed_dict = {
                    self.learning_rate: lr,
                    self.states: batch_data['s'],
                    self.actions: batch_data['a'],
                    self.rewards: batch_data['r'],
                    self.states_next: batch_data['s_next'],
                    self.done_flags: batch_data['done']
                }

                if self.double_q:
                    actions_next = self.sess.run(
                        self.actions_selected_by_q,
                        {self.states: batch_data['s_next']})
                    feed_dict.update({self.actions_next: actions_next})

                _, q_val, q_target_val, loss, summ_str = self.sess.run(
                    [
                        self.optimizer, self.q, self.q_target, self.loss,
                        self.merged_summary
                    ],
                    feed_dict=feed_dict)
                self.writer.add_summary(summ_str, step)

                # update the target q net if necessary
                self.__update_target_q_net(step)

            self.memory.add(traj)
            reward_history.append(reward)
            reward_averaged.append(np.mean(reward_history[-10:]))

            # Annealing the learning and exploration rate after every episode
            lr *= self.lr_decay
            if eps > self.epsilon_final:
                eps -= eps_drop

            if reward_history and every_episode and n_episode % every_episode == 0:
                print "[episodes: {}/step: {}], best: {}, avg: {:.2f}:{}, lr: {:.4f}, eps: {:.4f}".format(
                    n_episode, step, np.max(reward_history),
                    np.mean(reward_history[-10:]), reward_history[-5:], lr,
                    eps)

        self.save_model(step=step)
        print "[training completed] episodes: {}, Max reward: {}, Average reward: {}".format(
            len(reward_history), np.max(reward_history),
            np.mean(reward_history))

        fig_path = os.path.join(self.model_path, 'figs')
        makedirs(fig_path)
        fig_file = os.path.join(
            fig_path, '{n}-{t}.png'.format(n=self.name, t=int(time.time())))
        plot_learning_curve(fig_file, {
            'reward': reward_history,
            'reward_avg': reward_averaged
        },
                            xlabel='episode')

    def evaluate(self, n_episodes):
        if self.training:
            raise Exception(
                'evaluate() must not be called on a training model')

        reward_history = []
        for episode in range(n_episodes):
            state = self.env.reset()
            reward_episode = 0.
            while True:
                action = self.act(state)
                new_state, reward, done, _ = self.env.step(action)
                reward_episode += reward
                state = new_state
                if done:
                    break
            reward_history.append(reward_episode)
        return reward_history

    def __build_graph(self):
        self.__create_q_networks()

        # q is the Q(s, a) of the behavior policy
        self.actions_selected_by_q = tf.argmax(self.q,
                                               axis=-1,
                                               name='action_selected')
        action_one_hot = tf.one_hot(self.actions,
                                    self.action_size,
                                    dtype=tf.float32,
                                    name='action_one_hot')
        pred = tf.reduce_sum(self.q * action_one_hot, axis=-1, name='pred')
        # q_target is the Q(s, a) of the target network, which is what we are learning toward.
        if self.double_q:
            action_next_one_hot = tf.one_hot(self.actions_next,
                                             self.action_size,
                                             dtype=tf.float32,
                                             name='action_next_one_hot')
            max_q_next_target = tf.reduce_sum(self.q_target *
                                              action_next_one_hot,
                                              axis=-1,
                                              name='max_q_next_target')
        else:
            max_q_next_target = tf.reduce_max(self.q_target, axis=-1)
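        # TD target (Bellman backup): y = r + (1 - done) * gamma * max_q_next_target.
        # With double_q, the action is chosen by the primary network and evaluated by
        # the target network, which reduces Q-value overestimation.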
        y = self.rewards + (1. -
                            self.done_flags) * self.gamma * max_q_next_target

        self.loss = tf.reduce_mean(tf.square(pred - tf.stop_gradient(y)),
                                   name="loss_mse_train")
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(
            self.loss, name="adam")
        self.init_vars = tf.global_variables_initializer()
        with tf.variable_scope('summary'):
            q_summ = []
            avg_q = tf.reduce_mean(self.q, 0)
            for idx in range(self.action_size):
                q_summ.append(tf.summary.histogram('q/%s' % idx, avg_q[idx]))
            self.q_summ = tf.summary.merge(q_summ, 'q_summary')

            self.q_y_summ = tf.summary.histogram("batch/y", y)
            self.q_pred_summ = tf.summary.histogram("batch/pred", pred)
            self.loss_summ = tf.summary.scalar("loss", self.loss)

            self.merged_summary = tf.summary.merge_all(
                key=tf.GraphKeys.SUMMARIES)

    def __create_q_networks(self):
        # mini-batch
        self.states = tf.placeholder(tf.float32,
                                     shape=(None, self.state_size),
                                     name='state')
        self.states_next = tf.placeholder(tf.float32,
                                          shape=(None, self.state_size),
                                          name='state_next')
        self.actions = tf.placeholder(tf.int32, shape=(None, ), name='action')
        # actions_next does not hold the actions actually taken at the next step;
        # it is only used to evaluate the action value in the Bellman target (Double DQN).
        self.actions_next = tf.placeholder(tf.int32,
                                           shape=(None, ),
                                           name='action_next')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, ),
                                      name='reward')
        self.done_flags = tf.placeholder(tf.float32,
                                         shape=(None, ),
                                         name='done')
        self.learning_rate = tf.placeholder(tf.float32,
                                            shape=None,
                                            name='learning_rate')

        if self.dueling:
            with tf.variable_scope('Q_primary'):
                self.q_hidden = dense_nn(self.states,
                                         self.layer_sizes[:-1],
                                         name='q_hidden',
                                         training=self.training)
                # advantage function A(s, a)
                self.adv = dense_nn(self.q_hidden,
                                    [self.layer_sizes[-1], self.action_size],
                                    name='adv',
                                    training=self.training)
                # state value function V(s)
                self.v = dense_nn(self.q_hidden, [self.layer_sizes[-1], 1],
                                  name='v',
                                  training=self.training)
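                # dueling aggregation: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a))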
                self.q = self.v + (self.adv - tf.reduce_mean(
                    self.adv, axis=1, keepdims=True))

            with tf.variable_scope('Q_target'):
                self.q_target_hidden = dense_nn(self.states_next,
                                                self.layer_sizes[:-1],
                                                name='q_hidden',
                                                training=self.training)
                self.adv_target = dense_nn(
                    self.q_target_hidden,
                    [self.layer_sizes[-1], self.action_size],
                    name='adv',
                    training=self.training)
                self.v_target = dense_nn(self.q_target_hidden,
                                         [self.layer_sizes[-1], 1],
                                         name='v',
                                         training=self.training)
                self.q_target = self.v_target + (
                    self.adv_target - tf.reduce_mean(
                        self.adv_target, axis=1, keepdims=True))
        else:
            self.q = dense_nn(self.states,
                              self.layer_sizes + [self.action_size],
                              name='Q_primary',
                              training=self.training)
            self.q_target = dense_nn(self.states_next,
                                     self.layer_sizes + [self.action_size],
                                     name='Q_target',
                                     training=self.training)

        self.q_vars = self.scope_vars('Q_primary')
        self.q_target_vars = self.scope_vars('Q_target')
        assert len(self.q_vars) == len(
            self.q_target_vars), "The two Q-networks must have the same structure."

    def __init_target_q_net(self):
        self.__update_target_q_net_hard()

    def __update_target_q_net_hard(self):
        self.sess.run(
            [v_t.assign(v) for v_t, v in zip(self.q_target_vars, self.q_vars)])

    def __update_target_q_net_soft(self, tau=0.05):
        self.sess.run([
            v_t.assign(v_t * (1. - tau) + v * tau)
            for v_t, v in zip(self.q_target_vars, self.q_vars)
        ])

    def __update_target_q_net(self, step):
        if self.target_update_type == 'hard':
            if step % self.target_update_every_step == 0:
                self.__update_target_q_net_hard()
        else:
            self.__update_target_q_net_soft(self.target_update_tau)
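
A minimal driver for this policy might look like the sketch below. It assumes a Gym
environment with the classic 4-tuple step() API and that DqnPolicy and its project-level
dependencies are importable; the environment name and hyperparameter values are
illustrative assumptions, not part of the original example.

import gym

# hypothetical usage sketch; all hyperparameters below are assumptions
env = gym.make('CartPole-v1')
policy = DqnPolicy(env, training=True, gamma=0.99, lr=0.001,
                   batch_size=32, double_q=True, dueling=True)
policy.train(n_episodes=500, annealing_episodes=450, every_episode=10)

# evaluation uses a separate non-training instance; this assumes that
# BaseTFModel restores the saved checkpoint when training=False
eval_policy = DqnPolicy(env, training=False)
print(eval_policy.evaluate(n_episodes=10))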
Example #3
class Execute:
    def __init__(self, path):
        self.config = Configuration.construct(path)
        self.env = Environment(self.config)
        self.memory = ReplayMemory(self.config)
        self.model = Model(self.config)
        self.ep = None

    def get_epsilon(self, is_play):
        if is_play:
            return self.config.play.ep
        ep_start = self.config.train.ep.start
        ep_final = self.config.train.ep.final
        ep_num_frames = self.config.train.ep.num_frames
        decay = (ep_start - ep_final) / ep_num_frames
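        # linear annealing: epsilon decays from ep_start to ep_final over ep_num_frames calls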
        if self.ep is None:
            self.ep = ep_start
        self.ep = max(self.ep - decay, ep_final)
        return self.ep

    def log(self, **kwargs):
        log = ""
        for name, value in kwargs.items():
            log += f"{name}: {value}, "
        print(log)

    def run_episode(self, episode=1, steps=0, is_play=True, debug=False):
        config = self.config

        self.env.reset()
        action = 1
        _, _, curr_state, is_done = self.env.step(action)
        total_reward = 0
        update_net = 0
        C = config.train.network_update_freq
        t = 0
        T = config.max_episode_length

        while not is_done and t < T:
            if t % config.action_repeat == 0:
                ep = self.get_epsilon(is_play)
                action = self.model.choose_action(curr_state, ep)
            prev_state, reward, curr_state, is_done = self.env.step(action)
            total_reward += reward
            t += 1

            if is_play:
                self.env.render("human")
                if debug and t % config.play.debug.time == 0:
                    self.log(ftype=self.env.get_frame_type(), action=action, reward=total_reward)
                continue

            self.memory.add((prev_state, action, reward, curr_state, is_done))
            if self.memory.get_size() > config.train.replay_start_size:
                for i in range(config.train.batch_run):
                    batch = self.memory.sample()
                    self.model.optimize(batch)
                    steps = (steps + 1) % C
                if steps % C == 0:
                    self.model.update_qhat()
                    update_net += 1

        if not is_play and debug and episode % config.train.debug.time == 0:
            self.log(ftype=self.env.get_frame_type(), total_reward=total_reward, network_update_steps=update_net, episode_time=t, ep=ep)

        return total_reward, steps

    def load_model(self):
        ftype = self.env.get_frame_type()
        in_size = self.env.get_in_size()
        num_actions = self.env.get_num_actions()
        self.model.load_model(ftype, in_size, num_actions)

    def play(self, debug=False):
        self.load_model()
        for ep in range(1):
            self.run_episode(is_play=True, debug=debug)

    def train(self, debug=False):
        self.load_model()
        optimize_steps = 0
        episodes = self.config.train.episodes
        for episode in range(1, episodes+1):
            reward, steps = self.run_episode(episode=episode, steps=optimize_steps, is_play=False, debug=debug)
            optimize_steps += steps
            if episode % self.config.train.save_model_episode == 0:
                self.model.save_model()
        self.model.update_qhat()
        self.model.save_model()

    def close(self):
        self.env.close()
        self.memory.close()
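
A driver for this class might look like the following sketch; the config path is a
placeholder, and Configuration, Environment, ReplayMemory, and Model are assumed to
come from the surrounding project.

# hypothetical usage sketch; 'config.yaml' is a placeholder path
executor = Execute('config.yaml')
executor.train(debug=True)    # or executor.play(debug=True) to watch a trained agent
executor.close()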
Example #5
    def train(self, config: TrainConfig):
        # experience replay memory
        replay_mem = ReplayMemory(config.memmory_capacity)
        # reward history
        reward = 0
        reward_history = []
        reward_avg = []
        # learning rate related
        alpha = config.lrn_rate
        eps = config.epsilon
        eps_delta = (config.epsilon -
                     config.epsilon_final) / config.warmup_episodes

        step = 0
        for epi in range(config.total_episodes):
            obs = self.env.reset()
            done = False
            traj = []
            reward = 0
            while not done:
                # choose an action with epsilon-greedy exploration
                action = self.act(obs, eps)
                obs_next, r, done, info = self.env.step(action)
                reward += r
                step += 1
                # record trajectories
                traj.append(
                    Transition(obs.flatten(), action, r, obs_next.flatten(),
                               done))
                obs = obs_next
                if replay_mem.size < self.batch_size:
                    continue
                # update q networks with mini-batch replay samples
                batch_data = replay_mem.sample(self.batch_size)
                feed_dict = {
                    self.learning_rate: alpha,
                    self.states: batch_data['s'],
                    self.actions: batch_data['a'],
                    self.rewards: batch_data['r'],
                    self.next_states: batch_data['s_next'],
                    self.dones: batch_data['done'],
                    self.epi_reward: reward_history[-1]
                }
                _, q, q_target, loss, summary = self.session.run([
                    self.optimizer, self.Q, self.Q_target, self.loss,
                    self.merged_summary
                ], feed_dict)
                # hard-update the target Q network every target_update_every_steps steps
                if step % config.target_update_every_steps == 0:
                    self._update_target_q_net()
                self.writer.add_summary(summary)

            replay_mem.add(traj)
            # one episode done
            reward_history.append(reward)
            reward_avg.append(np.mean(reward_history[-10:]))

            # update training param
            alpha *= config.lrn_rate_decay
            if eps > config.epsilon_final:
                eps -= eps_delta

            # report progress
            # if reward_history and config.log_every_episodes and epi % config.log_every_episodes == 0 :
            print(
                "[episodes:{}/step:{}], best:{}, avg:{:.2f}:{}, lrn_rate:{:.4f}, eps:{:.4f}"
                .format(epi, step, np.max(reward_history),
                        np.mean(reward_history[-10:]), reward_history[-5:],
                        alpha, eps))

        self.save_checkpoint(step=step)
        print(
            "[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
                len(reward_history), np.max(reward_history),
                np.mean(reward_history)))
        return {'rwd': reward_history, 'rwd_avg': reward_avg}
Example #6
class ActorCriticPolicy(BaseTFModel):
    def __init__(self,
                 env,
                 training,
                 name=None,
                 model_path=None,
                 gamma=0.9,
                 lr_a=0.01,
                 lr_a_decay=0.999,
                 lr_c=0.01,
                 lr_c_decay=0.999,
                 epsilon=1.0,
                 epsilon_final=0.05,
                 batch_size=16,
                 layer_sizes=[32],
                 grad_clip_norm=None,
                 act='bayesian',
                 seed=None,
                 **kwargs):
        """
        :param env:
        :param name:
        :param model_path:
        :param training:
        :param gamma:
        :param lr_a:
        :param lr_a_decay:
        :param lr_c:
        :param lr_c_decay:
        :param epsilon:
        :param epsilon_final:
        :param batch_size:
        :param layer_sizes:
        :param grad_clip_norm:
        :param act: bayesian or epsilon
        :param seed:
        """
        if name is None:
            self.name = self.__class__.__name__
        else:
            self.name = name
        if model_path is None:
            self.model_path = os.path.join('model', self.name)
        else:
            self.model_path = model_path
        self.env = env
        self.training = training
        self.gamma = gamma
        self.lr_a = lr_a
        self.lr_a_decay = lr_a_decay
        self.lr_c = lr_c
        self.lr_c_decay = lr_c_decay
        self.epsilon = epsilon
        self.epsilon_final = epsilon_final
        self.batch_size = batch_size
        self.layer_sizes = layer_sizes
        self.grad_clip_norm = grad_clip_norm
        self.seed = seed

        self.memory = ReplayMemory(tuple_class=Record)

        self.action_size = self.env.action_space.n
        self.state_size = np.prod(list(self.env.observation_space.shape))
        print('action_size: {a}, state_size: {s}'.format(a=self.action_size, s=self.state_size))

        if self.training:
            # clear existing model files
            if os.path.exists(self.model_path):
                print('deleting existing model files at {}'.format(self.model_path))
                if os.path.isdir(self.model_path):
                    shutil.rmtree(self.model_path)
                else:
                    os.remove(self.model_path)

        BaseTFModel.__init__(self, self.name, self.model_path, saver_max_to_keep=5)

        print('building graph ...')
        with self.graph.as_default():
            if self.seed is not None:
                np.random.seed(self.seed)
                tf.set_random_seed(int(self.seed/3))
            self.__build_graph()

        if act == 'bayesian':
            self.act = self.act_bayesian
        elif act == 'epsilon':
            self.act = self.act_epsilon
        else:
            raise Exception('unsupported act: {}'.format(act))

    def act_epsilon(self, state, **kwargs):
        """
        epsilon-greedy exploration is not effective in the case of large action spaces
        :param state:
        :param epsilon:
        :return:
        """
        if self.training and np.random.random() < kwargs['epsilon']:
            return self.env.action_space.sample()
        proba = self.sess.run(self.actor_proba, {self.states: state.reshape((1, -1))})[0]
        return np.argmax(proba)

    def act_bayesian(self, state, **kwargs):
        """
        :param state: 1d np.ndarray
        :return:
        """
        assert isinstance(state, np.ndarray) and state.ndim == 1
        # return self.sess.run(self.sampled_actions, {self.states: state.reshape((1, -1))})
        if self.training:
            return self.sess.run(self.sampled_actions, {self.states: state.reshape((1, -1))})
        else:
            return self.sess.run(self.selected_actions, {self.states: state.reshape((1, -1))})

    def __build_graph(self):
        # c: critic, a: actor
        self.learning_rate_c = tf.placeholder(tf.float32, shape=None, name='learning_rate_c')
        self.learning_rate_a = tf.placeholder(tf.float32, shape=None, name='learning_rate_a')

        # inputs
        self.states = tf.placeholder(tf.float32, shape=(None, self.state_size), name='state')
        self.states_next = tf.placeholder(tf.float32, shape=(None, self.state_size), name='state_next')
        self.actions = tf.placeholder(tf.int32, shape=(None,), name='action')
        self.rewards = tf.placeholder(tf.float32, shape=(None,), name='reward')

        # actor: action probabilities
        self.actor = dense_nn(self.states, self.layer_sizes + [self.action_size], training=self.training, name='actor')
        # integer tensor
        self.sampled_actions = tf.squeeze(tf.multinomial(self.actor, 1))
        self.selected_actions = tf.squeeze(tf.argmax(self.actor, axis=-1))
        self.actor_proba = tf.nn.softmax(self.actor)
        self.actor_vars = self.scope_vars('actor')

        # critic: action value (Q-value)
        self.critic = dense_nn(self.states, self.layer_sizes + [1], training=self.training, name='critic')
        self.critic_vars = self.scope_vars('critic')
        self.td_targets = self.rewards \
                          + self.gamma * tf.squeeze(dense_nn(self.states_next, self.layer_sizes + [1], training=self.training, name='critic', reuse=True))
        # print the shape of td_targets
        # self.td_targets = tf.Print(self.td_targets, [tf.shape(self.td_targets)], first_n=1)

        action_ohe = tf.one_hot(self.actions, self.action_size, dtype=tf.float32, name='action_one_hot')
        self.pred_value = tf.reduce_sum(self.critic * action_ohe, axis=-1, name='q_action')
        self.td_errors = tf.stop_gradient(self.td_targets) - self.pred_value
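        # TD error: delta = (r + gamma * V(s')) - V(s); used below as the advantage
        # signal for the actor and as the regression error for the critic.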

        with tf.variable_scope('critic_train'):
            # self.reg_c = tf.reduce_mean([tf.nn.l2_loss(x) for x in self.critic_vars])
            self.loss_c = tf.reduce_mean(tf.square(self.td_errors))  # + 0.001 * self.reg_c
            self.optim_c = tf.train.AdamOptimizer(self.learning_rate_c)
            self.grads_c = self.optim_c.compute_gradients(self.loss_c, self.critic_vars)
            if self.grad_clip_norm:
                self.grads_c = [(tf.clip_by_norm(grad, self.grad_clip_norm), var) for grad, var in self.grads_c]
            self.train_op_c = self.optim_c.apply_gradients(self.grads_c)

        with tf.variable_scope('actor_train'):
            # self.reg_a = tf.reduce_mean([tf.nn.l2_loss(x) for x in self.actor_vars])
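            # policy-gradient loss: mean over the batch of delta * (-log pi(a|s)),
            # with the TD error treated as a constant advantage via stop_gradient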
            self.loss_a = tf.reduce_mean(
                tf.stop_gradient(self.td_errors) * tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.actor, labels=self.actions),
                name='loss_actor')  # + 0.001 * self.reg_a
            self.optim_a = tf.train.AdamOptimizer(self.learning_rate_a)
            self.grads_a = self.optim_a.compute_gradients(self.loss_a, self.actor_vars)
            if self.grad_clip_norm:
                self.grads_a = [(tf.clip_by_norm(grad, self.grad_clip_norm), var) for grad, var in self.grads_a]
            self.train_op_a = self.optim_a.apply_gradients(self.grads_a)

        with tf.variable_scope('summary'):
            self.grads_a_summ = [tf.summary.scalar('grads/a_' + var.name, tf.norm(grad)) for
                                 grad, var in self.grads_a if grad is not None]
            self.grads_c_summ = [tf.summary.scalar('grads/c_' + var.name, tf.norm(grad)) for
                                 grad, var in self.grads_c if grad is not None]
            self.loss_c_summ = tf.summary.scalar('loss/critic', self.loss_c)
            self.loss_a_summ = tf.summary.scalar('loss/actor', self.loss_a)
            self.merged_summary = tf.summary.merge_all(key=tf.GraphKeys.SUMMARIES)

        self.train_ops = [self.train_op_a, self.train_op_c]
        self.init_vars = tf.global_variables_initializer()

    def train(self, n_episodes, annealing_episodes=None, every_episode=None, done_rewards=None, **kwargs):
        if not self.training:
            raise Exception('train() must not be called on a non-training model')

        step = 0
        reward_history = []
        reward_averaged = []
        lr_c = self.lr_c
        lr_a = self.lr_a
        eps = self.epsilon
        annealing_episodes = annealing_episodes or n_episodes
        eps_drop = (eps - self.epsilon_final) / annealing_episodes
        print "eps_drop: {}".format(eps_drop)

        self.sess.run(self.init_vars)
        for n_episode in range(n_episodes):
            ob = self.env.reset()

            episode_reward = 0.
            done = False
            while not done:
                a = self.act(ob, epsilon=eps)
                ob_next, r, done, _ = self.env.step(a)
                step += 1
                episode_reward += r
                if done:
                    r = done_rewards or 0.
                self.memory.add(Record(ob, a, r, ob_next))
                ob = ob_next

                while self.memory.size >= self.batch_size:
                    batch = self.memory.pop(self.batch_size)
                    _, summ_str = self.sess.run(
                        [self.train_ops, self.merged_summary], feed_dict={
                            self.learning_rate_c: lr_c,
                            self.learning_rate_a: lr_a,
                            self.states: batch['s'],
                            self.actions: batch['a'],
                            self.rewards: batch['r'],
                            self.states_next: batch['s_next']
                        })
                    self.writer.add_summary(summ_str, step)
            reward_history.append(episode_reward)
            reward_averaged.append(np.mean(reward_history[-10:]))

            lr_c *= self.lr_c_decay
            lr_a *= self.lr_a_decay
            if eps > self.epsilon_final:
                eps -= eps_drop

            if reward_history and every_episode and n_episode % every_episode == 0:
                print(
                    "[episodes: {}/step: {}], best: {}, avg10: {:.2f}: {}, lr: {:.4f} | {:.4f} eps: {:.4f}".format(
                        n_episode, step, np.max(reward_history),
                        np.mean(reward_history[-10:]), reward_history[-5:],
                        lr_c, lr_a, eps
                    ))
        self.save_model(step=step)
        print "[training completed] episodes: {}, Max reward: {}, Average reward: {}".format(
            len(reward_history), np.max(reward_history), np.mean(reward_history))

        fig_path = os.path.join(self.model_path, 'figs')
        makedirs(fig_path)
        fig_file = os.path.join(fig_path, '{n}-{t}.png'.format(n=self.name, t=int(time.time())))
        plot_learning_curve(fig_file, {'reward': reward_history, 'reward_avg': reward_averaged}, xlabel='episode')

    def evaluate(self, n_episodes):
        if self.training:
            raise Exception('evaluate() must not be called on a training model')

        reward_history = []
        for episode in range(n_episodes):
            state = self.env.reset()
            reward_episode = 0.
            while True:
                action = self.act(state)
                new_state, reward, done, _ = self.env.step(action)
                reward_episode += reward
                state = new_state
                if done:
                    break
            reward_history.append(reward_episode)
        return reward_history
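
As with the DQN policy above, a minimal driver might look like this sketch; the environment
name and hyperparameter values are illustrative assumptions, not part of the original example.

import gym

# hypothetical usage sketch; hyperparameters are assumptions
env = gym.make('CartPole-v0')
policy = ActorCriticPolicy(env, training=True, lr_a=0.01, lr_c=0.01,
                           act='bayesian', batch_size=16)
policy.train(n_episodes=800, annealing_episodes=720, every_episode=10)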