Example #1
def sac(env_fn,
        seed=0,
        gamma=.99,
        lam=.97,
        hidden_sizes=(200, 100),
        alpha=.0,
        v_lr=1e-3,
        q_lr=1e-3,
        pi_lr=1e-3,
        polyak=1e-2,
        epochs=50,
        steps_per_epoch=1000,
        batch_size=100,
        start_steps=1000,
        logger_kwargs=dict(),
        replay_size=int(1e6),
        max_ep_len=1000,
        save_freq=1):

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()

    # Dimensions
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n

    # act_limit = env.action_space.high[0]

    # Placeholders
    x_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32)
    a_ph = tf.placeholder(shape=[None, 1], dtype=tf.float32)
    x2_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32)
    r_ph = tf.placeholder(shape=[None], dtype=tf.float32)
    d_ph = tf.placeholder(shape=[None], dtype=tf.float32)

    # Networks
    def mlp(x,
            hidden_sizes=(32, ),
            activation=tf.tanh,
            output_activation=None):
        for h in hidden_sizes[:-1]:
            x = tf.layers.dense(x, units=h, activation=activation)
        return tf.layers.dense(x,
                               units=hidden_sizes[-1],
                               activation=output_activation)

    def mlp_categorical_policy(x, a, hidden_sizes, activation,
                               output_activation, action_space):
        act_dim = action_space.n
        logits = mlp(x, list(hidden_sizes) + [act_dim], activation, None)
        pi_all = tf.nn.softmax(logits)
        logpi_all = tf.nn.log_softmax(logits)
        # Sample one action per state; shape [batch, 1], dtype int64.
        pi = tf.random.categorical(logits, 1)
        # Log-probability of the sampled action under the current policy.
        logp_pi = tf.reduce_sum(
            tf.one_hot(tf.squeeze(pi, axis=1), depth=act_dim) * logpi_all,
            axis=1)

        return pi, pi_all, logpi_all, logp_pi

    LOG_STD_MIN = -20
    LOG_STD_MAX = 2

    with tf.variable_scope("main"):
        activation = tf.tanh
        with tf.variable_scope("pi"):
            pi, pi_all, logpi_all, logp_pi = mlp_categorical_policy(
                x_ph, a_ph, hidden_sizes, activation, None, env.action_space)

        with tf.variable_scope("q1"):
            q1 = tf.squeeze(mlp(tf.concat([x_ph, a_ph], -1),
                                hidden_sizes + (act_dim, ), activation, None),
                            axis=-1)

        with tf.variable_scope("q1", reuse=True):
            q1_pi = tf.squeeze(mlp(
                tf.concat([x_ph, tf.cast(pi, tf.float32)], axis=-1),
                hidden_sizes + (act_dim, ), activation, None),
                               axis=-1)

        with tf.variable_scope("q2"):
            q2 = tf.squeeze(mlp(tf.concat([x_ph, a_ph], -1),
                                hidden_sizes + (act_dim, ), activation, None),
                            axis=-1)

        with tf.variable_scope("q2", reuse=True):
            q2_pi = tf.squeeze(mlp(
                tf.concat([x_ph, tf.cast(pi, tf.float32)], -1),
                hidden_sizes + (act_dim, ), activation, None),
                               axis=-1)

        with tf.variable_scope("v"):
            # v = mlp( x_ph, hidden_sizes+(1,), activation, None)
            v = tf.squeeze(mlp(x_ph, hidden_sizes + (1, ), activation, None),
                           axis=-1)

    with tf.variable_scope("target"):

        with tf.variable_scope("v"):
            v_targ = tf.squeeze(mlp(x2_ph, hidden_sizes + (1, ), activation,
                                    None),
                                axis=-1)

    # helpers for var count
    def get_vars(scope=''):
        return [x for x in tf.trainable_variables() if scope in x.name]

    def count_vars(scope=''):
        v = get_vars(scope)
        return sum([np.prod(var.shape.as_list()) for var in v])

    # Count variables
    var_counts = tuple(
        count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main'])
    print(
        '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t v: %d, \t total: %d\n'
        % var_counts)

    # Targets
    q_backup_prestop = r_ph + gamma * (1 - d_ph) * v_targ
    v_backup_prestop = tf.minimum(q1_pi, q2_pi) - alpha * logp_pi
    q_backup, v_backup = tf.stop_gradient(q_backup_prestop), tf.stop_gradient(
        v_backup_prestop)

    # Q Loss
    q1_loss = tf.reduce_mean((q1 - q_backup)**2)
    q2_loss = tf.reduce_mean((q2 - q_backup)**2)
    q_loss = q1_loss + q2_loss

    # V Loss
    v_loss = tf.reduce_mean((v - v_backup)**2)

    # Pol loss
    pi_loss = tf.reduce_mean(-q1_pi + alpha * logp_pi)

    # Training ops
    v_trainop = tf.train.AdamOptimizer(v_lr).minimize(
        v_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope="main/v"))
    q_trainop = tf.train.AdamOptimizer(q_lr).minimize(
        q_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope="main/q"))
    pi_trainop = tf.train.AdamOptimizer(pi_lr).minimize(
        pi_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope="main/pi"))

    assert polyak <= .5
    # Target update op
    init_v_target = tf.group([
        tf.assign(v_target, v_main) for v_main, v_target in zip(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"),
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target/v"))
    ])

    update_v_target = tf.group([
        tf.assign(v_target, (1 - polyak) * v_target + polyak * v_main)
        for v_main, v_target in zip(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"),
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target/v"))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(init_v_target)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'pi': pi,
                              'q1': q1,
                              'q2': q2,
                              'v': v
                          })

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            # print( o.reshape(-1, 1))
            # input()
            while not (d or (ep_len == max_ep_len)):
                o, r, d, _ = test_env.step(
                    sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0][0])
                ep_ret += r
                ep_len += 1

            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    #Buffer init
    buffer = ReplayBuffer(obs_dim, 1, replay_size)

    # Main loop
    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    for t in range(total_steps):
        if t > start_steps:
            a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0][0]
        else:
            a = env.action_space.sample()

        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        d = False if ep_len == max_ep_len else d

        # Still needed ?
        o2 = np.squeeze(o2)

        buffer.store(o, a, r, o2, d)

        o = o2

        if d or (ep_len == max_ep_len):
            for j in range(ep_len):
                batch = buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }
                # DEBUG:
                # v_backup_prestop_out = sess.run( v_backup_prestop, feed_dict=feed_dict)
                # print( v_backup_prestop_out.shape)
                # print( v_backup_prestop_out)
                # input()

                # Value gradient steps
                v_step_ops = [v_loss, v, v_trainop]
                outs = sess.run(v_step_ops, feed_dict)
                logger.store(LossV=outs[0], VVals=outs[1])

                # Q Gradient steps
                q_step_ops = [q_loss, q1, q2, q_trainop]
                outs = sess.run(q_step_ops, feed_dict)
                logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                # Policy gradient steps
                # TODO Add entropy logging
                pi_step_ops = [pi_loss, pi_trainop, update_v_target]
                outs = sess.run(pi_step_ops, feed_dict=feed_dict)
                logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0., 0

        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Saving the model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
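
Every example on this page leans on a few names that the snippets themselves do not define: `tensorflow as tf`, `numpy as np`, `time`, a SpinningUp-style `EpochLogger`, and a `ReplayBuffer` with `store` and `sample_batch` methods. The buffer is the piece most worth sketching; the following is a minimal NumPy version written to match the batch keys used in Example #1 (`obs`, `obs2`, `acts`, `rews`, `done`). Note that Examples #4 and #7 instead expect `obs1` for the current observation, so the key names here are an assumption, not part of the original code.

import numpy as np

class ReplayBuffer:
    """Minimal FIFO experience replay buffer (sketch, not the original class)."""

    def __init__(self, obs_dim, act_dim, size):
        self.obs_buf = np.zeros((size, obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros((size, obs_dim), dtype=np.float32)
        self.acts_buf = np.zeros((size, act_dim), dtype=np.float32)
        self.rews_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        # Overwrite the oldest transition once the buffer is full.
        self.obs_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.acts_buf[self.ptr] = act
        self.rews_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=32):
        idxs = np.random.randint(0, self.size, size=batch_size)
        return dict(obs=self.obs_buf[idxs],
                    obs2=self.obs2_buf[idxs],
                    acts=self.acts_buf[idxs],
                    rews=self.rews_buf[idxs],
                    done=self.done_buf[idxs])
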
Example #2
class DQNAgent:
    def __init__(self, env, state_size, action_size, batch_size, gamma, lr,
                 update_every, tau, eps_start, eps_end, eps_decay, seed):

        for key, value in locals().items():
            if key != 'self':
                setattr(self, key, value)

        random.seed(seed)
        torch.manual_seed(seed)
        np.random.seed(seed)

        self.Q_target = LinearModel(state_size, action_size)
        self.Q_local = LinearModel(state_size, action_size)

        self.memory = ReplayBuffer(batch_size=batch_size)
        self.optim = torch.optim.Adam(self.Q_local.parameters(), lr=lr)

        self.update_counter = 0

    def env_reset(self, train_mode=True):
        return self.env.reset()

    def env_step(self, action):
        return self.env.step(action)

    def env_render(self, train_mode=False):
        return self.env.render()

    def env_close(self, train_mode=True):
        if not train_mode:
            return self.env.close()

    def get_action(self, state, epsilon=0.):
        if random.random() < epsilon:
            return np.random.choice(np.arange(self.action_size))

        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        self.Q_local.eval()
        with torch.no_grad():
            action = np.argmax(self.Q_local(state).data.numpy())
        return action

    def step(self, state, action, reward, next_state, done):
        self.memory.store(
            (state, action, reward, next_state, 1 if done else 0))

        self.update_counter = (self.update_counter + 1) % self.update_every
        if self.update_counter == 0:
            self.update_Q()

    def update_Q(self):
        states, actions, rewards, next_states, dones = self.memory.sample()

        Q_target_next = self.Q_target(next_states).detach().max(
            dim=1, keepdim=True)[0]
        Q_target_pred = rewards + self.gamma * Q_target_next * (1.0 - dones)
        self.Q_local.eval()
        Q = self.Q_local(states).gather(1, actions)

        loss = F.mse_loss(Q, Q_target_pred)
        self.Q_local.train()
        self.Q_local.zero_grad()
        loss.backward()
        self.optim.step()

        for t_param, l_param in zip(self.Q_target.parameters(),
                                    self.Q_local.parameters()):
            t_param.data.copy_(self.tau * l_param.data +
                               (1.0 - self.tau) * t_param.data)

    def train(self, num_episodes, max_t=1000, is_finished=None, render=False):
        scores = []
        eps = self.eps_start

        for i in range(num_episodes):
            state = self.env_reset(train_mode=True)
            score = 0
            for _ in range(max_t):
                action = self.get_action(state, eps)
                if render: self.env_render(train_mode=True)
                next_state, reward, done, _ = self.env_step(action)
                self.step(state, action, reward, next_state, done)
                score += reward
                state = next_state
                if done: break

            eps = max(self.eps_end, eps * self.eps_decay)
            scores.append(score)
            if is_finished and is_finished(scores, num_episodes):
                break
        if render: self.env_close(train_mode=False)
        return scores

    def run(self, num_episodes=1, max_t=1000, render=None):
        if render is None: render = num_episodes == 1
        scores = []
        for i in range(num_episodes):
            state = self.env_reset(train_mode=False)
            score = 0
            for _ in range(max_t):
                action = self.get_action(state)
                if render: self.env_render(train_mode=False)
                next_state, reward, done, _ = self.env_step(action)
                score += reward
                state = next_state
                if done: break

            scores.append(score)
            if render: self.env_close(train_mode=False)
        return scores
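
`DQNAgent` builds its local and target Q functions from a `LinearModel` class that is not shown on this page. Below is a plausible PyTorch sketch; the hidden layer and its size are assumptions (the name suggests the original may even be a single `nn.Linear`), so treat it as illustrative only.

import torch.nn as nn

class LinearModel(nn.Module):
    """Sketch of the Q-network assumed by DQNAgent; not the original definition."""

    def __init__(self, state_size, action_size, hidden_size=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, action_size),
        )

    def forward(self, state):
        # Returns one Q-value per discrete action.
        return self.net(state)
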
class BaseStudent(SerializableAgent):
    def __init__(self,
        env                  : gym.Env  ,
        trajs_path           : str      ,
        model_path           : str      ,
        run_seed             : int      ,
        batch_size           : int      ,
        buffer_size_in_trajs : int      ,
        teacher              : BaseAgent,
    ):
        super(BaseStudent, self).__init__(
            env        = env       ,
            trajs_path = trajs_path,
            model_path = model_path,
        )

        self.run_seed             = run_seed
        self.batch_size           = batch_size
        self.buffer_size_in_trajs = buffer_size_in_trajs
        self.teacher              = teacher

        self._fill_buffer()

    def matchup(self) -> np.ndarray:
        samples = self.buffer.sample_all()
        state   = samples['state' ]
        action  = samples['action']

        action_hat = np.array([self.select_action(s) for s in state])
        match_samp =  np.equal(action, action_hat)

        return match_samp

    def rollout(self) -> Tuple[List[Tuple[np.ndarray, np.ndarray]], List[float], float]:
        state = self.env.reset()

        traj = []
        match = []
        retvrn = 0

        done = False

        while not done:
            action = self.select_action(state)
            reward, next_state, done = self.perform_action(action)

            traj += [(state, action)]
            match += [action == self.teacher.select_action(state)]
            retvrn += reward

            state = next_state

        return traj, match, retvrn

    def test(self,
        num_episodes : int,
    ) -> Tuple[float, float, float]:
        self.test_mode = True

        trajs = []
        matches = []
        returns = []

        for episode_index in range(num_episodes):
            traj, match, retvrn = self.rollout()

            trajs += [traj]
            matches += match
            returns += [retvrn]

        np.save(self.trajs_path, {'trajs': trajs, 'returns': returns})

        return np.sum(matches) / len(matches), np.mean(returns), np.std(returns)

    def serialize(self):
        raise NotImplementedError

    def deserialize(self):
        raise NotImplementedError

    def _fill_buffer(self):
        trajs = np.load(self.teacher.trajs_path, allow_pickle = True)[()] \
            ['trajs'][self.run_seed:self.run_seed + self.buffer_size_in_trajs]

        pairs = [pair for traj in trajs for i, pair in enumerate(traj) if i % 20 == 0]

        if len(pairs) < self.batch_size:
            self.batch_size = len(pairs)

        self.buffer = ReplayBuffer(
            state_dim  = self.env.observation_space.shape[0],
            total_size = len(pairs)                         ,
            batch_size = self.batch_size                    ,
        )

        for pair in pairs:
            self.buffer.store(
                state      = pair[0],
                action     = pair[1],
                reward     = None   ,
                next_state = None   ,
                done       = None   ,
            )
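
`matchup` returns one boolean per buffered state, marking whether the student picks the same action as its teacher, and `test` reports the mean of those matches alongside return statistics. A hypothetical usage, assuming `student` is an instance of a concrete `BaseStudent` subclass:

# `student` is a hypothetical instance of a concrete BaseStudent subclass.
match_samp = student.matchup()          # boolean array, one entry per buffered state
imitation_accuracy = match_samp.mean()  # fraction of states where student == teacher

match_rate, mean_return, std_return = student.test(num_episodes=10)
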
Example #4
def ddpg(env_fn,
         actor_critic=a2c,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=5000,
         epochs=100,
         replay_size=int(1e6),
         gamma=.99,
         polyak=.995,
         pi_lr=1e-3,
         q_lr=1e-3,
         batch_size=100,
         start_steps=10000,
         act_noise=.1,
         max_ep_len=1000,
         logger_kwargs=dict(),
         save_freq=1):

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    act_limit = env.action_space.high[0]

    ac_kwargs['action_space'] = env.action_space

    x_ph, a_ph, x2_ph, r_ph, d_ph = \
        tf.placeholder( name='x_ph', shape=[None, obs_dim], dtype=tf.float32), \
        tf.placeholder( name='a_ph', shape=[None, act_dim], dtype=tf.float32), \
        tf.placeholder( name='x2_ph', shape=[None, obs_dim], dtype=tf.float32), \
        tf.placeholder( name='r_ph', shape=[None], dtype=tf.float32), \
        tf.placeholder( name='d_ph', shape=[None], dtype=tf.float32)

    # Main networks
    with tf.variable_scope('main'):
        pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target networks
    with tf.variable_scope('target'):
        pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    replaybuffer = ReplayBuffer(obs_dim, act_dim, replay_size)

    # helpers for var count
    def get_vars(scope=''):
        return [x for x in tf.trainable_variables() if scope in x.name]

    def count_vars(scope=''):
        v = get_vars(scope)
        return sum([np.prod(var.shape.as_list()) for var in v])

    var_counts = tuple(
        count_vars(scope) for scope in ['main/pi', 'main/q', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' %
          var_counts)

    # Bellman backup for Q function
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ)

    # Losses
    pi_loss = -tf.reduce_mean(q_pi)
    q_loss = tf.reduce_mean((q - backup)**2)

    # Optimizer and train ops
    train_pi_op = tf.train.AdamOptimizer(pi_lr).minimize(
        pi_loss, var_list=get_vars('main/pi'))
    train_q_op = tf.train.AdamOptimizer(q_lr).minimize(
        q_loss, var_list=get_vars('main/q'))

    # Update target networks
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Init targets
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'pi': pi,
                              'q': q
                          })

    def get_actions(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        a += noise_scale * np.random.randn(act_dim)

        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_actions(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop:
    for t in range(total_steps):
        if t > start_steps:
            a = get_actions(o, act_noise)
        else:
            a = env.action_space.sample()

        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        d = False if ep_len == max_ep_len else d

        # Storing experience
        replaybuffer.store(o, a, r, o2, d)

        o = o2

        if d or (ep_len == max_ep_len):
            for _ in range(ep_len):
                batch = replaybuffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }

                # Q-learning update
                outs = sess.run([q_loss, q, train_q_op], feed_dict)
                logger.store(LossQ=outs[0], QVals=outs[1])

                # Policy update
                outs = sess.run([pi_loss, train_pi_op, target_update],
                                feed_dict)
                logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
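
Examples #4 and #7 pass a graph-building function `a2c` as `actor_critic` but never show it. For the DDPG case it has to return `(pi, q, q_pi)` given the observation and action placeholders. The sketch below is a minimal TF1-style guess under that assumption: the hidden sizes and tanh output scaling are not from the original code, and the TD3 example needs a two-critic variant returning `(pi, q1, q2, q1_pi)` instead.

import tensorflow as tf

def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None):
    for h in hidden_sizes[:-1]:
        x = tf.layers.dense(x, units=h, activation=activation)
    return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation)

def a2c(x, a, hidden_sizes=(400, 300), activation=tf.nn.relu,
        output_activation=tf.tanh, action_space=None):
    """Sketch of the actor-critic builder assumed by Example #4 (not the original a2c)."""
    act_dim = a.shape.as_list()[-1]
    act_limit = action_space.high[0]
    with tf.variable_scope('pi'):
        # Deterministic policy, squashed to [-act_limit, act_limit].
        pi = act_limit * mlp(x, list(hidden_sizes) + [act_dim],
                             activation, output_activation)
    with tf.variable_scope('q'):
        q = tf.squeeze(mlp(tf.concat([x, a], axis=-1),
                           list(hidden_sizes) + [1], activation, None), axis=1)
    with tf.variable_scope('q', reuse=True):
        q_pi = tf.squeeze(mlp(tf.concat([x, pi], axis=-1),
                              list(hidden_sizes) + [1], activation, None), axis=1)
    return pi, q, q_pi
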
Example #5
 def train(self):
     replay_buffer=ReplayBuffer(self.state_dim, self.act_dim, self.replay_size)
     
     total_steps = self.steps_per_epoch * self.epochs
     
     state = self.env.reset()
     state = state.astype(np.float32)
     ep_len, ep_rew, ep_count = 0,0,0
     all_ep_rew= []
     for t in range(total_steps):
         # randomly sample actions until start_steps have elapsed
         if t > self.start_steps:
             act = self.sample_action(state)
             act = act.numpy()
         else:
             act = self.env.action_space.sample()
         
         state_, r, d, _ = self.env.step(act)
         state_ = state_.astype(np.float32)
         d = False if ep_len==self.max_ep_len else d
         ep_len+=1
         ep_rew+=r
         
         # Store transitions
         replay_buffer.store(state,act,r,state_,d)
         
         state = state_
         
         # End of trajectory
         if d or (ep_len==self.max_ep_len):
             state = self.env.reset()
             state = state.astype(np.float32)
             
             if len(all_ep_rew) < 5:
                 all_ep_rew.append(ep_rew)
             else:
                 all_ep_rew.append(ep_rew)
                 all_ep_rew[-1] = (np.mean(all_ep_rew[-5:]))  # smoothing
             epoch=(t+1)//self.steps_per_epoch
             print("Training | Epoch:{} | Episode:{}  | Steps: {}/{} | Episode Reward: {:.4f}".format(epoch, ep_count, t, total_steps, ep_rew))
             
             ep_len, ep_rew = 0,0
             ep_count += 1
         
         # Update
         if t>self.update_after and t%self.update_every==0:
             for j in range(self.update_every):
                 batch=replay_buffer.sample_batch(self.batch_size)
                 self.update_critic(batch)
                 
                 if j%self.policy_delay==0:
                     self.update_actor(batch)
                     self.update_targets()
         
         # End of epoch
         if (t+1) % self.steps_per_epoch==0:
             epoch=(t+1)//self.steps_per_epoch
             
             # save model
             if (epoch % self.save_freq == 0) or (epoch == self.epochs):
                 self.actor.save_weights(self.save_path+'actor_checkpoint'+str(epoch))
                 self.critic_net1.save_weights(self.save_path+'critic_net1_checkpoint'+str(epoch))
                 self.critic_net2.save_weights(self.save_path+'critic_net2_checkpoint'+str(epoch))
     plt.figure()
     plt.plot(all_ep_rew)
     plt.xlabel('episodes')
     plt.ylabel('total reward per episode')
     plt.show()
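
`update_targets` is called above but not shown; in TD3-style code it normally performs a Polyak (soft) update of the target networks toward the online actor and critics. Below is a standalone TF2/Keras sketch of that single operation, assuming the agent keeps target copies of `self.actor`, `self.critic_net1` and `self.critic_net2` (those target attributes are not confirmed by the snippet).

def polyak_update(target_net, online_net, polyak=0.995):
    """Soft-update a tf.keras target network: w_targ <- polyak * w_targ + (1 - polyak) * w."""
    for w_targ, w in zip(target_net.weights, online_net.weights):
        w_targ.assign(polyak * w_targ + (1.0 - polyak) * w)
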
Example #6
def train(env_fn,
          env_name,
          ac_kwargs=dict(),
          seed=0,
          steps_per_epoch=1000,
          epochs=3000,
          replay_size=int(1e6),
          gamma=0.99,
          polyak=0.995,
          lr=3e-4,
          batch_size=64,
          start_steps=10000,
          update_after=10000,
          update_every=1,
          num_test_episodes=10,
          value_coef=0.5,
          entropy_coef=0.02,
          max_ep_len=1000,
          logger_kwargs=dict(),
          save_freq=10,
          device=torch.device('cpu')):

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    env.seed(seed)
    test_env.seed(seed)
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    actor_critic = MLPActorCritic(env.observation_space, env.action_space,
                                  **ac_kwargs).to(device)
    sql = SQL(actor_critic, lr, batch_size, update_every, gamma, polyak,
              value_coef, entropy_coef)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size,
                                 device=device)

    rewards_log = []
    episode_rewards = deque(maxlen=10)

    # Set up model saving
    logger.setup_pytorch_saver(sql.actor_critic)

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                action = sql.actor_critic.act(
                    torch.as_tensor(o, dtype=torch.float32).to(device))
                o, r, d, _ = test_env.step(action)
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
            episode_rewards.append(ep_ret)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy (with some noise, via act_noise).
        if t > start_steps:
            a = sql.actor_critic.act(
                torch.as_tensor(o, dtype=torch.float32).to(device))
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after:
            for _ in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                loss = sql.update(data=batch)
                logger.store(Loss=loss)
        else:
            logger.store(Loss=0.)

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Save model
            if save_freq != 0 and ((epoch % save_freq == 0) or
                                   (epoch == epochs)):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            rewards_log.append(np.mean(episode_rewards))

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Time', time.time() - start_time)
            logger.log_tabular('Loss', average_only=True)
            logger.dump_tabular()

    rewards_log = np.array(rewards_log)
    save_path = '../../log/modified_sql/' + env_name + '/' + str(seed) + '.npy'
    np.save(save_path, rewards_log)
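
The epoch-level rewards (the running mean of the last ten test episodes) end up in a `.npy` file under `../../log/modified_sql/<env_name>/<seed>.npy`. Loading and plotting that log afterwards is straightforward; the snippet below only illustrates the file format, with the path placeholders left for the reader to fill in.

import numpy as np
import matplotlib.pyplot as plt

rewards = np.load('../../log/modified_sql/<env_name>/<seed>.npy')  # fill in env_name and seed
plt.plot(np.arange(1, len(rewards) + 1), rewards)
plt.xlabel('epoch')
plt.ylabel('mean test return (last 10 episodes)')
plt.show()
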
Example #7
def td3( env_fn, actor_critic=a2c, ac_kwargs=dict(), seed=0, steps_per_epoch=5000,
    epochs=100, replay_size=int(1e6), gamma=.99, polyak=.995, pi_lr=1e-3, q_lr=1e-3,
    batch_size=100, start_steps=10000, act_noise=.1, target_noise=.2, noise_clip=.5,
    policy_delay=2, max_ep_len=1000, logger_kwargs=dict(), save_freq=1):

    logger = EpochLogger( **logger_kwargs)
    logger.save_config( locals())

    tf.set_random_seed(seed)
    np.random.seed( seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping
    act_limit = env.action_space.high[0]

    # Share action space info with A2C
    ac_kwargs['action_space'] = env.action_space

    x_ph, a_ph, x2_ph, r_ph, d_ph = \
        tf.placeholder( name='x_ph', shape=(None, obs_dim), dtype=tf.float32), \
        tf.placeholder( name='a_ph', shape=(None, act_dim), dtype=tf.float32), \
        tf.placeholder( name='x2_ph', shape=(None, obs_dim), dtype=tf.float32),\
        tf.placeholder( name='r_ph', shape=(None), dtype=tf.float32), \
        tf.placeholder( name='d_ph', shape=(None), dtype=tf.float32)

    # Actor policy and value
    with tf.variable_scope('main'):
        pi, q1, q2, q1_pi = actor_critic( x_ph, a_ph, **ac_kwargs)

    # This seems a bit memory inefficient: what happens to the Q values created
    # along with the target policy? And to the policy created along with the target
    # Q networks? They are never referenced, yet still declared, at the cost of GPU memory.
    # Target policy
    with tf.variable_scope( 'target'):
        pi_targ, _, _, _  = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Target Q networks
    with tf.variable_scope( 'target', reuse=True):
        epsilon = tf.random_normal( tf.shape( pi_targ), stddev=target_noise)
        epsilon = tf.clip_by_value( epsilon, -noise_clip, noise_clip)
        a2 = pi_targ + epsilon
        a2 = tf.clip_by_value( a2, -act_limit, act_limit)

        # Target Q-Values using actions from target policy
        _, q1_targ, q2_targ, _ = actor_critic(x2_ph, a2, **ac_kwargs)

    replaybuffer = ReplayBuffer( obs_dim, act_dim, size=replay_size)

    # helpers for var count
    def get_vars(scope=''):
        return [x for x in tf.trainable_variables() if scope in x.name]

    def count_vars(scope=''):
        v = get_vars(scope)
        return sum([np.prod(var.shape.as_list()) for var in v])

    # Count variables
    var_counts = tuple( count_vars( scope) for scope in ['main/pi',
        'main/q1', 'main/q2', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts)

    # Clipped Double Q-Learning with Bellman backup
    min_q_targ = tf.minimum( q1_targ, q2_targ)
    backup = tf.stop_gradient( r_ph + gamma * (1 -d_ph) * min_q_targ)

    # TD3 Losses
    pi_loss = - tf.reduce_mean( q1_pi)
    q1_loss = tf.reduce_mean( (q1 - backup)**2)
    q2_loss = tf.reduce_mean( (q2 - backup)**2)
    q_loss = q1_loss + q2_loss

    # Training ops (each optimizer restricted to its own sub-network's variables)
    pi_train = tf.train.AdamOptimizer(pi_lr).minimize( pi_loss, var_list=get_vars('main/pi'))
    q_train = tf.train.AdamOptimizer(q_lr).minimize( q_loss, var_list=get_vars('main/q'))

    # Polyak wise target update
    target_update = tf.group( [ tf.assign( v_targ, polyak * v_targ + (1-polyak)
        * v_main) for v_main, v_targ in zip( get_vars('main'), get_vars('target'))])

    target_init = tf.group( [ tf.assign( v_targ, v_main) for v_targ, v_main in
        zip( get_vars('target'), get_vars('main'))])

    sess = tf.Session()
    sess.run( tf.global_variables_initializer())
    sess.run( target_init)

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'pi': pi, 'q1': q1, 'q2': q2})

    def get_action( o, noise_scale):
        a = sess.run( pi, feed_dict={ x_ph: o.reshape(1,-1)})
        a += noise_scale * np.random.randn( act_dim)

        return np.clip( a, -act_limit, act_limit)

    def test_agent( n=10):
        for j in range( n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0 ,0
            while not ( d or (ep_len == max_ep_len)):
                o, r, d, _ = test_env.step( get_action( o, 0))
                ep_ret += r
                ep_len += 1

            logger.store( TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0 , 0
    total_steps = steps_per_epoch * epochs

    # Main loop
    for t in range( total_steps):
        if t > start_steps:
            a = get_action( o, act_noise)
        else:
            a = env.action_space.sample()

        o2, r, d, _ = env.step( a)
        ep_ret += r
        ep_len += 1

        d = False if ep_len == max_ep_len else d

        o2 = np.squeeze( o2)

        # print( "O2: ", o2)
        replaybuffer.store( o, a, r, o2, d)

        o = o2

        if d or ( ep_len == max_ep_len):
            for j in range( ep_len):
                batch = replaybuffer.sample_batch( batch_size)
                feed_dict = {x_ph: batch['obs1'],
                                 x2_ph: batch['obs2'],
                                 a_ph: batch['acts'],
                                 r_ph: batch['rews'],
                                 d_ph: batch['done']
                                }
                q_step_ops = [q_loss, q1, q2, q_train]
                outs = sess.run( q_step_ops, feed_dict)
                logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                if j % policy_delay == 0:
                    outs = sess.run( [pi_loss, pi_train, target_update], feed_dict)
                    logger.store( LossPi=outs[0])

            logger.store( EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Saving the model
            if (epoch % save_freq == 0) or ( epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()
Example #8
               target_update_freq, gamma, explor_period, seed, env)

env.reset()

#######
#prefill buffer

prefill_buffer_size = 50000
buffer.reset()

for _ in range(prefill_buffer_size):

    action = np.random.randint(0, len(env.action_space))
    current_state = np.copy(env.state)
    next_state, reward, done = env.step(action)
    buffer.store(current_state, action, reward, done, prefill=True)

    if done:
        env.reset()

#reset when prefilling is done
env.reset()

###########

#train the network
dqn.training = True

training_steps = 10000
total_reward = 0
Example #9
def sac(env_fn,
        seed=0,
        gamma=.99,
        lam=.97,
        hidden_sizes=(200, 100),
        alpha=.5,
        v_lr=1e-3,
        q_lr=1e-3,
        pi_lr=1e-3,
        polyak=1e-2,
        epochs=50,
        steps_per_epoch=1000,
        batch_size=100,
        start_steps=10000,
        logger_kwargs=dict(),
        replay_size=int(1e6),
        max_ep_len=1000,
        save_freq=1):

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()

    # Dimensions
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    act_limit = env.action_space.high[0]

    # Placeholders
    x_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32)
    a_ph = tf.placeholder(shape=[None, act_dim], dtype=tf.float32)
    x2_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32)
    r_ph = tf.placeholder(shape=[None], dtype=tf.float32)
    d_ph = tf.placeholder(shape=[None], dtype=tf.float32)

    # Networks
    def mlp(x,
            hidden_sizes=(32, ),
            activation=tf.tanh,
            output_activation=None):
        for h in hidden_sizes[:-1]:
            x = tf.layers.dense(x, units=h, activation=activation)
        return tf.layers.dense(x,
                               units=hidden_sizes[-1],
                               activation=output_activation)

    # Why isn't the k used here ?
    def gaussian_likelihood(x, mu, log_std):
        EPS = 1e-8
        pre_sum = -0.5 * (
            ((x - mu) /
             (tf.exp(log_std) + EPS))**2 + 2 * log_std + np.log(2 * np.pi))
        return tf.reduce_sum(pre_sum, axis=1)

    def clip_but_pass_gradient(x, l=-1., u=1.):
        clip_up = tf.cast(x > u, tf.float32)
        clip_low = tf.cast(x < l, tf.float32)
        return x + tf.stop_gradient((u - x) * clip_up + (l - x) * clip_low)

    LOG_STD_MIN = -20
    LOG_STD_MAX = 2

    def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation):
        act_dim = a.shape.as_list()[-1]
        net = mlp(x, list(hidden_sizes), activation, activation)
        mu = tf.layers.dense(net, act_dim, activation=output_activation)
        """
        Because algorithm maximizes trade-off of reward and entropy,
        entropy must be unique to state---and therefore log_stds need
        to be a neural network output instead of a shared-across-states
        learnable parameter vector. But for deep Relu and other nets,
        simply sticking an activationless dense layer at the end would
        be quite bad---at the beginning of training, a randomly initialized
        net could produce extremely large values for the log_stds, which
        would result in some actions being either entirely deterministic
        or too random to come back to earth. Either of these introduces
        numerical instability which could break the algorithm. To
        protect against that, we'll constrain the output range of the
        log_stds, to lie within [LOG_STD_MIN, LOG_STD_MAX]. This is
        slightly different from the trick used by the original authors of
        SAC---they used tf.clip_by_value instead of squashing and rescaling.
        I prefer this approach because it allows gradient propagation
        through log_std where clipping wouldn't, but I don't know if
        it makes much of a difference.
        """
        log_std = tf.layers.dense(net, act_dim, activation=tf.tanh)
        log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std +
                                                                     1)

        std = tf.exp(log_std)
        pi = mu + tf.random_normal(tf.shape(mu)) * std
        logp_pi = gaussian_likelihood(pi, mu, log_std)
        return mu, pi, logp_pi

    def apply_squashing_func(mu, pi, logp_pi):
        mu = tf.tanh(mu)
        pi = tf.tanh(pi)
        # To avoid evil machine precision error, strictly clip 1-pi**2 to [0,1] range.
        logp_pi -= tf.reduce_sum(
            tf.log(clip_but_pass_gradient(1 - pi**2, l=0, u=1) + 1e-6), axis=1)
        return mu, pi, logp_pi

    with tf.variable_scope("main"):
        activation = tf.tanh
        with tf.variable_scope("pi"):
            # mu = mlp( x_ph, hidden_sizes, activation, None)
            # log_std = mlp( mu, (act_dim,), activation, None)
            # # Avoid out of range log_std. Refer to Github for explanation.
            # log_std = LOG_STD_MIN + .5 * ( LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1)
            #
            # mu = mlp( mu, (act_dim,), activation, None)
            #
            # pi = mu + tf.exp( log_std) * tf.random_normal( tf.shape(mu))
            # logp_pi = gaussian_likelihood( pi, mu, log_std)
            #
            # # Follow SpinningUp Implementation
            # mu = tf.tanh(mu)
            # pi = tf.tanh(pi)
            #
            # def clip_but_pass_gradient(x, l=-1., u=1.):
            #     clip_up = tf.cast(x > u, tf.float32)
            #     clip_low = tf.cast(x < l, tf.float32)
            #     # What is this supposed to mean even ?
            #     return x + tf.stop_gradient((u - x)*clip_up + (l - x)*clip_low)
            #
            # # Shameless copy paste
            # logp_pi -= tf.reduce_sum(tf.log(clip_but_pass_gradient(1 - pi**2, l=0, u=1) + 1e-6), axis=1)

            # Not-working version (kept as a backup)
            # squashed_pi = tf.tanh( pi)
            #
            # # To be sure
            # pi = tf.clip_by_value( pi, -act_limit, act_limit)
            #
            # # Must take in the squashed policy
            # log_squash_pi = gaussian_likelihood( squashed_pi, mu, log_std)

            # Shameless copy-paste of the SpinningUp-style helpers above
            mu, pi, logp_pi = mlp_gaussian_policy(x_ph, a_ph, hidden_sizes,
                                                  tf.tanh, None)
            mu, pi, logp_pi = apply_squashing_func(mu, pi, logp_pi)

        with tf.variable_scope("q1"):
            q1 = tf.squeeze(mlp(tf.concat([x_ph, a_ph], -1),
                                hidden_sizes + (1, ), activation, None),
                            axis=-1)

        with tf.variable_scope("q1", reuse=True):
            q1_pi = tf.squeeze(mlp(tf.concat([x_ph, pi], -1),
                                   hidden_sizes + (1, ), activation, None),
                               axis=-1)

        with tf.variable_scope("q2"):
            q2 = tf.squeeze(mlp(tf.concat([x_ph, a_ph], -1),
                                hidden_sizes + (1, ), activation, None),
                            axis=-1)

        with tf.variable_scope("q2", reuse=True):
            q2_pi = tf.squeeze(mlp(tf.concat([x_ph, pi], -1),
                                   hidden_sizes + (1, ), activation, None),
                               axis=-1)

        with tf.variable_scope("v"):
            # v = mlp( x_ph, hidden_sizes+(1,), activation, None)
            v = tf.squeeze(mlp(x_ph, hidden_sizes + (1, ), activation, None),
                           axis=-1)

    with tf.variable_scope("target"):

        with tf.variable_scope("v"):
            v_targ = tf.squeeze(mlp(x2_ph, hidden_sizes + (1, ), activation,
                                    None),
                                axis=-1)

    # helpers for var count
    def get_vars(scope=''):
        return [x for x in tf.trainable_variables() if scope in x.name]

    def count_vars(scope=''):
        v = get_vars(scope)
        return sum([np.prod(var.shape.as_list()) for var in v])

    # Count variables
    var_counts = tuple(
        count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main'])
    print(
        '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t v: %d, \t total: %d\n'
        % var_counts)

    # Targets
    q_backup_prestop = r_ph + gamma * (1 - d_ph) * v_targ
    v_backup_prestop = tf.minimum(q1_pi, q2_pi) - alpha * logp_pi
    q_backup, v_backup = tf.stop_gradient(q_backup_prestop), tf.stop_gradient(
        v_backup_prestop)

    # Q Loss
    q1_loss = tf.reduce_mean((q1 - q_backup)**2)
    q2_loss = tf.reduce_mean((q2 - q_backup)**2)
    q_loss = q1_loss + q2_loss

    # V Loss
    v_loss = tf.reduce_mean((v - v_backup)**2)

    # Pol loss
    pi_loss = tf.reduce_mean(-q1_pi + alpha * logp_pi)

    # Training ops
    v_trainop = tf.train.AdamOptimizer(v_lr).minimize(
        v_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope="main/v"))
    q_trainop = tf.train.AdamOptimizer(q_lr).minimize(
        q_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope="main/q"))
    pi_trainop = tf.train.AdamOptimizer(pi_lr).minimize(
        pi_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope="main/pi"))

    assert polyak <= .5
    # Target update op
    init_v_target = tf.group([
        tf.assign(v_target, v_main) for v_main, v_target in zip(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"),
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target/v"))
    ])

    update_v_target = tf.group([
        tf.assign(v_target, (1 - polyak) * v_target + polyak * v_main)
        for v_main, v_target in zip(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"),
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target/v"))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(init_v_target)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'pi': pi,
                              'q1': q1,
                              'q2': q2,
                              'v': v
                          })

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            # print( o.reshape(-1, 1))
            # input()
            while not (d or (ep_len == max_ep_len)):
                o, r, d, _ = test_env.step(
                    sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0])
                ep_ret += r
                ep_len += 1

            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    #Buffer init
    buffer = ReplayBuffer(obs_dim, act_dim, replay_size)

    # Main loop
    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    for t in range(total_steps):
        if t > start_steps:
            a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        else:
            a = env.action_space.sample()

        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        d = False if ep_len == max_ep_len else d

        # Still needed ?
        o2 = np.squeeze(o2)

        buffer.store(o, a, r, o2, d)

        o = o2

        if d or (ep_len == max_ep_len):
            for j in range(ep_len):
                batch = buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }
                # DEBUG:
                # v_backup_prestop_out = sess.run( v_backup_prestop, feed_dict=feed_dict)
                # print( v_backup_prestop_out.shape)
                # print( v_backup_prestop_out)
                # input()

                # Value gradient steps
                v_step_ops = [v_loss, v, v_trainop]
                outs = sess.run(v_step_ops, feed_dict)
                logger.store(LossV=outs[0], VVals=outs[1])

                # Q Gradient steps
                q_step_ops = [q_loss, q1, q2, q_trainop]
                outs = sess.run(q_step_ops, feed_dict)
                logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                # Policy gradient steps
                # TODO Add entropy logging
                pi_step_ops = [pi_loss, pi_trainop, update_v_target]
                outs = sess.run(pi_step_ops, feed_dict=feed_dict)
                logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0., 0

        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Saving the model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
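
As in the other SpinningUp-style examples, `env_fn` is called with no arguments to construct both the training and test environments, so callers pass a zero-argument constructor. A hypothetical invocation of the `sac` function above (the environment name and logger directory are illustrative, not from the original repository, and assume the SpinningUp `EpochLogger`):

import gym

sac(lambda: gym.make('Pendulum-v0'),
    epochs=50,
    steps_per_epoch=1000,
    logger_kwargs=dict(output_dir='./sac_pendulum', exp_name='sac'))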