예제 #1
0
def main():
    """Train a ``direct_policy_search`` agent online in a Gym environment.

    Command-line options select the environment, the number of environment
    steps, the replay-memory capacity and the batch size. After every
    environment step the transition is stored in replay memory and the agent
    is trained on the states of a batch sampled from that memory. A summary
    line is printed whenever an episode terminates.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default='Pendulum-v0')
    parser.add_argument("--time-steps", type=int, default=30000)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    parser.add_argument("--batch-size", type=int, default=64)
    # NOTE(review): --learning-rate is parsed but never read below; it is
    # kept so that existing command lines still parse.
    parser.add_argument("--learning-rate", type=float, default=.9)
    args = parser.parse_args()

    env = gym.make(args.environment)
    unroll = 20

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_bound_high = env.action_space.high
    action_bound_low = env.action_space.low

    agent = direct_policy_search(state_dim, action_dim, action_bound_high,
                                 action_bound_low, unroll, .9, 5,
                                 'direct_policy_search')

    # Replay memory
    memory = Memory(args.replay_mem_size)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        state = env.reset()
        total_rewards = 0.0
        epoch = 1
        for time_steps in range(args.time_steps):
            action = agent.act(sess, state)
            next_state, reward, done, _ = env.step(action)
            total_rewards += float(reward)

            # Store the transition; states/actions are kept 2-D so sampled
            # rows can be concatenated along axis 0 below.
            memory.add([
                np.atleast_2d(state),
                np.atleast_2d(action), reward,
                np.atleast_2d(next_state), done
            ])

            # Training step: only the state column of the batch is used.
            batch = np.array(memory.sample(args.batch_size))
            assert len(batch) > 0
            states = np.concatenate(batch[:, 0], axis=0)
            agent.train(sess, states)

            # s <- s'
            state = np.copy(next_state)

            if done:
                # A single pre-formatted string prints identically under
                # Python 2 and Python 3.
                print('time steps {} epoch {} total rewards {} unroll {}'
                      .format(time_steps, epoch, total_rewards, unroll))
                epoch += 1
                total_rewards = 0.
                state = env.reset()
예제 #2
0
class DDPG:
    """Actor-critic agent that owns a TF session, an actor, a critic and a
    replay memory, and trains both networks from sampled experience."""

    def __init__(self, env, batch_size=32, gamma=0.99,
                 hidden_units=32, maxlen=10000,
                 tau=0.1, actor_lr=0.001, critic_lr=0.001):
        """Build the session, the two networks and the replay memory.

        ``hidden_units``, ``tau`` and the learning rates are forwarded to the
        ``Actor`` / ``Critic`` constructors; ``maxlen`` sizes the ``Memory``.
        """
        self.env = env
        self.batch_size = batch_size
        self.gamma = gamma
        self.maxlen = maxlen

        self.sess = tf.Session()

        self.actor = Actor(env, self.sess, hidden_units, tau, actor_lr)
        self.critic = Critic(env, self.sess, hidden_units, tau, critic_lr)
        self.memory = Memory(maxlen)

        # Variables are initialised only after both networks exist.
        self.sess.run(tf.global_variables_initializer())

        self.step = 0

    def store(self, exp):
        """Append one experience to the replay memory."""
        self.memory.add(exp)

    def update(self):
        """Run one training step once the memory holds at least 1000 items.

        Fields 0..3 of each sampled experience are used as state, action,
        reward and next state. The target-network weights are refreshed on
        every 10th training step.
        """
        if len(self.memory.buffer) < 1000:
            return

        self.step += 1

        transitions = self.memory.sample(self.batch_size)
        states, actions, rewards, next_states = (
            np.array([item[field] for item in transitions])
            for field in range(4))

        # Critic regression target: r + gamma * Q_target(s', actor_target(s'))
        next_actions = self.actor.target_model.predict(next_states)
        next_q = self.critic.target_model.predict([next_states, next_actions])
        targets = rewards[:, np.newaxis] + self.gamma * next_q
        self.critic.model.train_on_batch([states, actions], targets)

        # Actor step driven by the critic's gradients at the actor's actions.
        policy_actions = self.actor.model.predict(states)
        action_grads = self.critic.get_grads(states, policy_actions)
        self.actor.train(states, action_grads)

        if self.step % 10 != 0:
            return
        self.actor.update_weights()
        self.critic.update_weights()

    def get_action(self, s):
        """Delegate action selection for state ``s`` to the actor."""
        return self.actor.get_action(s)
예제 #3
0
def main():
    """Train the model selected by ``--model`` (built by ``init_model``).

    The script interacts with a Gym environment for ``--time-steps`` steps,
    using an ``OUStrategy`` for exploration whose scale decays as
    ``((T - t) / T) ** 4``. After every step the transition is stored in
    replay memory and two independent batches are sampled and handed to
    ``jointddpg.train``.

    ``--mode`` controls environment seeding:
      * ``none``     - no explicit seeding.
      * ``test``     - seed 1 before the first and after every episode.
      * ``transfer`` - seed 1 for the first third of the run and seed 0
        afterwards; the replay memory is emptied at the switch point.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default='Pendulum-v0')
    parser.add_argument("--action-dim", type=int, default=1)
    parser.add_argument("--state-dim", type=int, default=1)
    parser.add_argument("--time-steps", type=int, default=30000)
    parser.add_argument('--tau',
                        type=float,
                        help='soft target update parameter',
                        default=0.01)
    parser.add_argument("--action-bound", type=float, default=1.)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--learning-rate", type=float, default=.9)

    parser.add_argument("--latent-size",
                        type=int,
                        default=4,
                        help='Size of vector for Z')

    parser.add_argument("--model", type=str, default='gan')

    parser.add_argument("--mode", type=str, default='none')
    args = parser.parse_args()

    assert args.mode in ['none', 'test', 'transfer']
    assert args.model in [
        'mlp', 'gan', 'gated', 'dmlac_mlp', 'dmlac_gan', 'dmlac_gated',
        'ddpg_unrolled_pg_mlp', 'dmlac_gp', 'dmlac_truth', 'mpc'
    ]
    if args.model == 'dmlac_truth':
        assert args.environment == 'Pendulum-v0'

    # Initialize environment; the parsed --state-dim/--action-dim values are
    # overwritten with the dimensions reported by the environment.
    env = gym.make(args.environment)
    args.state_dim = env.observation_space.shape[0]
    args.action_dim = env.action_space.shape[0]
    args.action_bound_high = env.action_space.high
    args.action_bound_low = env.action_space.low

    # Action bounds must be symmetric around zero in every dimension.
    assert len(args.action_bound_high) == len(args.action_bound_low)
    for high, low in zip(args.action_bound_high, args.action_bound_low):
        assert high == -low
    print(args)

    # NOTE(review): the target update/copy ops returned here are never run
    # in this function (the corresponding sess.run calls were disabled in
    # the original) - confirm that jointddpg.train handles target updates.
    jointddpg, update_target_actor, update_target_critic, copy_target_actor, copy_target_critic = init_model(
        [None, args.state_dim], args.action_dim, args.latent_size,
        args.learning_rate, args.action_bound_low, args.action_bound_high,
        args.tau, args.model)

    # Replay memory
    memory = Memory(args.replay_mem_size)

    # Actor noise
    exploration_strategy = OUStrategy(jointddpg, env)

    # Integer division keeps the switch point an int on Python 3 as well, so
    # the equality test below can fire for any --time-steps value.
    transfer_step = args.time_steps // 3

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        if args.mode in ['test', 'transfer']:
            env.seed(1)
        state = env.reset()
        total_rewards = 0.0
        epoch = 1
        for time_steps in range(args.time_steps):
            env.render()
            # Choose an action; exploration decays from 1 towards 0.
            exploration = (float(args.time_steps - time_steps) /
                           float(args.time_steps))**4
            action = exploration_strategy.action(sess, state[np.newaxis, ...],
                                                 exploration)
            # Execute action
            state1, reward, done, _ = env.step(action)

            total_rewards += float(reward)
            # Store tuple in replay memory
            memory.add([
                state[np.newaxis, ...], action[np.newaxis, ...], reward,
                state1[np.newaxis, ...], done
            ])

            # Training step: first batch (full transitions)
            batch_B = np.array(memory.sample(args.batch_size))
            assert len(batch_B) > 0
            states_B = np.concatenate(batch_B[:, 0], axis=0)
            actions_B = np.concatenate(batch_B[:, 1], axis=0)
            rewards_B = batch_B[:, 2]
            states1_B = np.concatenate(batch_B[:, 3], axis=0)
            dones_B = batch_B[:, 4]

            # Second, independently sampled batch (states and actions only)
            batch_M = np.array(memory.sample(args.batch_size))
            assert len(batch_M) > 0
            states_M = np.vstack(batch_M[:, 0])
            actions_M = np.concatenate(batch_M[:, 1], axis=0)

            if args.model == 'dmlac_gp':
                jointddpg.update_hist(memory)

            jointddpg.train(sess, states_B, actions_B, rewards_B,
                            states1_B, dones_B, states_M, actions_M,
                            len(batch_M), args.latent_size)

            state = np.copy(state1)
            if done:
                # A single pre-formatted string prints identically under
                # Python 2 and Python 3.
                print('time steps {} epoch {} total rewards {}'.format(
                    time_steps, epoch, total_rewards))
                epoch += 1
                total_rewards = 0.
                if args.mode == 'transfer':
                    env.seed(0 if time_steps >= transfer_step else 1)
                elif args.mode == 'test':
                    env.seed(1)
                state = env.reset()
            if args.mode == 'transfer' and time_steps == transfer_step:
                # Start the second phase with an empty replay memory.
                memory = Memory(args.replay_mem_size)
class DDPGagent:
    """DDPG agent with online/target actor and critic networks (PyTorch).

    Args:
        hidden_size: hidden width forwarded to the ``Actor``/``Critic``
            constructors.
        env: environment exposing ``observation_space.shape`` and
            ``action_space.shape``.
        gamma: discount factor used in the critic target.
        tau: interpolation factor of the soft target-network update.
        actor_lr: Adam learning rate for the actor.
        critic_lr: Adam learning rate for the critic.
        memory_size: capacity passed to ``Memory``.

    The networks are placed on CUDA when it is available and on the CPU
    otherwise.
    """

    def __init__(self, hidden_size, env, gamma=0.99, tau=1e-2,
                 actor_lr=1e-2, critic_lr=1e-1, memory_size=30000):
        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]
        self.gamma = gamma
        self.tau = tau
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.Actor = Actor(input_size=self.num_states,
                           hidden_size=hidden_size,
                           output_size=self.num_actions).to(self.device)

        self.Actor_target = Actor(input_size=self.num_states,
                                  hidden_size=hidden_size,
                                  output_size=self.num_actions).to(self.device)

        self.Critic = Critic(input_size=self.num_states,
                             hidden_size=hidden_size,
                             output_size=self.num_actions).to(self.device)

        self.Critic_target = Critic(input_size=self.num_states,
                                    hidden_size=hidden_size,
                                    output_size=self.num_actions).to(
                                        self.device)

        # Start the targets as independent copies of the online networks.
        self._hard_update(self.Actor_target, self.Actor)
        self._hard_update(self.Critic_target, self.Critic)

        self.Memory = Memory(memory_size)
        self.criterion = nn.MSELoss().to(self.device)
        self.actor_optimizer = torch.optim.Adam(self.Actor.parameters(),
                                                lr=actor_lr)
        self.critic_optimizer = torch.optim.Adam(self.Critic.parameters(),
                                                 lr=critic_lr)

    @staticmethod
    def _hard_update(target, source):
        """Copy every parameter value of ``source`` into ``target``.

        ``copy_`` writes into the target's own storage. Assigning
        ``target_param.data = param.data`` instead would make both
        parameters share one tensor, so the target would silently follow
        every update of the online network.
        """
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)

    @staticmethod
    def _soft_update(target, source, tau):
        """In place: ``target <- tau * source + (1 - tau) * target``."""
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data * tau +
                                    target_param.data * (1.0 - tau))

    def get_action(self, state):
        """Return the actor's action for one NumPy ``state`` as an array.

        A leading batch axis is added before the forward pass, so the
        returned array keeps that leading axis.
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        action = self.Actor.forward(state)
        action = action.detach().cpu().numpy()
        return action

    def update(self, batch_size):
        """Sample ``batch_size`` transitions and update actor, critic, targets.

        NOTE(review): the fifth field returned by ``Memory.sample``
        (presumably the done flag) is ignored, so terminal transitions are
        bootstrapped as well - confirm this is intended.
        """
        states, actions, rewards, next_states, _ = self.Memory.sample(
            batch_size)
        states = torch.tensor(states).to(self.device)
        actions = torch.tensor(actions).to(self.device)
        rewards = torch.tensor(rewards).to(self.device)
        next_states = torch.tensor(next_states).to(self.device)

        Q_Value = self.Critic.forward(states, action=actions)
        # The bootstrap target is a constant for the critic regression, so
        # no graph is built through the target networks.
        with torch.no_grad():
            next_actions = self.Actor_target(next_states)
            next_Q = self.Critic_target.forward(next_states, next_actions)
            Q_prime = rewards + self.gamma * next_Q
        critic_loss = self.criterion(Q_Value, Q_prime)
        policy_loss = -self.Critic.forward(states,
                                           self.Actor.forward(states)).mean()

        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # zero_grad also clears the critic gradients left behind by the
        # policy-loss backward pass above.
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        self._soft_update(self.Actor_target, self.Actor, self.tau)
        self._soft_update(self.Critic_target, self.Critic, self.tau)
예제 #5
0
def _train_cgan(sess, cgan, states, actions, targets, batch_size,
                latent_size):
    """Run one discriminator update, then one generator update, on ``cgan``.

    Fresh latent noise is drawn with ``sample_z`` for each of the two
    updates. Returns the fetched ``(D_loss, G_loss)`` values.
    """
    _, d_loss = sess.run(
        [cgan.D_solver, cgan.D_loss],
        feed_dict={
            cgan.states: states,
            cgan.actions: actions,
            cgan.Z: sample_z(batch_size, latent_size),
            cgan.X: targets
        })
    _, g_loss = sess.run(
        [cgan.G_solver, cgan.G_loss],
        feed_dict={
            cgan.states: states,
            cgan.actions: actions,
            cgan.Z: sample_z(batch_size, latent_size)
        })
    return d_loss, g_loss


def _sample_cgan(sess, cgan, states, actions, batch_size, latent_size):
    """Return ``cgan.G_sample`` evaluated for the given states and actions."""
    return sess.run(cgan.G_sample,
                    feed_dict={
                        cgan.states: states,
                        cgan.actions: actions,
                        cgan.Z: sample_z(batch_size, latent_size)
                    })


def main():
    """Train a Q-network together with two conditional GANs.

    One CGAN is trained on observed next states and one on observed
    rewards. On every environment step the Q-network is trained on a replay
    batch, both CGANs are trained for ``--K`` iterations, and the Q-network
    is additionally trained for ``--L`` iterations on transitions generated
    by the CGANs for randomly chosen actions ("imagination rollouts").
    Actions are epsilon-greedy with an exponentially decaying epsilon, and
    the target Q-network is refreshed every ``--target-update-freq`` steps.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--env-interface", type=str, default='gym!atari')
    parser.add_argument("--environment", type=str, default='CartPole-v0')
    parser.add_argument("--action-size", type=int, default=2)
    # NOTE(review): type=list would split a command-line string into
    # characters; the value is overwritten from the environment below.
    parser.add_argument("--input-shape", type=list, default=[None, 4])
    parser.add_argument("--target-update-freq", type=int, default=200)
    parser.add_argument("--epsilon-max", type=float, default=1.)
    parser.add_argument("--epsilon-min", type=float, default=.01)
    parser.add_argument("--epsilon-decay", type=float, default=.001)

    parser.add_argument("--learning-rate", type=float, default=.99)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--epochs", type=int, default=30000)

    parser.add_argument("--replay-mem-size", type=int, default=1000000)

    parser.add_argument("--K",
                        type=int,
                        default=1,
                        help='The number of steps to train the environment')
    parser.add_argument(
        "--L",
        type=int,
        default=1,
        help='The number of Q-learning steps for hypothetical rollouts')
    parser.add_argument("--latent-size",
                        type=int,
                        default=4,
                        help='Size of vector for Z')

    args = parser.parse_args()

    env = env_interface(args.env_interface,
                        args.environment,
                        pixel_feature=False,
                        render=True)

    # The environment wrapper is the source of truth for these two values.
    args.action_size = env.action_size
    args.input_shape = [None] + list(env.obs_space_shape)

    print(args)

    # Other parameters
    epsilon = args.epsilon_max

    # Replay memory
    memory = Memory(args.replay_mem_size)

    # Global step counter across all episodes (kept as a float, as before).
    time_step = 0.

    # Initialize the GANs: one generates next states, one generates rewards.
    cgan_state = CGAN(input_shape=args.input_shape,
                      action_size=args.action_size,
                      latent_size=args.latent_size,
                      gen_input_shape=args.input_shape)
    cgan_reward = CGAN(input_shape=args.input_shape,
                       action_size=args.action_size,
                       latent_size=args.latent_size,
                       gen_input_shape=[None, 1])

    qnet = qnetwork(input_shape=args.input_shape,
                    action_size=args.action_size,
                    scope='qnet')
    target_qnet = qnetwork(input_shape=args.input_shape,
                           action_size=args.action_size,
                           scope='target_qnet')
    update_ops = update_target_graph('qnet', 'target_qnet')

    # tf.initialize_all_variables is deprecated in favour of this call.
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(args.epochs):
            total_reward = 0
            observation = env.reset()
            for t in range(1000000):
                # Epsilon-greedy action selection; the greedy action is
                # always evaluated first, exactly as before.
                action = qnet.get_action(sess, observation)
                if np.random.rand() < epsilon:
                    action = np.random.randint(args.action_size)
                observation1, reward, done, info = env.step(action)
                total_reward += reward

                # Add to memory
                memory.add([observation, action, reward, observation1, done])

                # Reduce epsilon
                time_step += 1.
                epsilon = args.epsilon_min + (
                    args.epsilon_max - args.epsilon_min) * np.exp(
                        -args.epsilon_decay * time_step)

                # Training step: Q-network on real transitions
                batch = np.array(memory.sample(args.batch_size))
                qnet.train(sess, batch, args.learning_rate, target_qnet)

                # Training step: environment model
                for _ in range(args.K):
                    batch = np.array(memory.sample(args.batch_size))

                    states = np.vstack(batch[:, 0])
                    actions = np.array(batch[:, 1])
                    rewards = batch[:, 2]
                    states1 = np.vstack(batch[:, 3])

                    _train_cgan(sess, cgan_state, states, actions, states1,
                                len(batch), args.latent_size)
                    _train_cgan(sess, cgan_reward, states, actions,
                                rewards[..., np.newaxis], len(batch),
                                args.latent_size)

                # Training step: imagination rollouts. time_step is already
                # >= 1 here, so the former "time_step == 0." branch could
                # never run and the ">= 0." guard was always true.
                for _ in range(args.L):
                    batch = np.array(memory.sample(args.batch_size))
                    assert len(batch) > 0

                    # Roll forward from sampled next states with uniformly
                    # random actions; generated transitions never terminate.
                    states1 = np.vstack(batch[:, 3])
                    actions = np.random.randint(args.action_size,
                                                size=len(batch))
                    dones = np.array([False] * len(batch))

                    G_sample_state = _sample_cgan(sess, cgan_state, states1,
                                                  actions, len(batch),
                                                  args.latent_size)
                    G_sample_reward = _sample_cgan(sess, cgan_reward,
                                                   states1, actions,
                                                   len(batch),
                                                   args.latent_size)
                    qnet.train(sess, None, args.learning_rate, target_qnet,
                               states1, actions, G_sample_reward,
                               G_sample_state, dones)

                # Set observation
                observation = observation1

                # Periodically refresh the target network.
                if int(time_step) % args.target_update_freq == 0:
                    sess.run(update_ops)

                if done:
                    # A single pre-formatted string prints identically
                    # under Python 2 and Python 3.
                    print('Episode finished after {} timesteps epoch {} '
                          'total_rewards {}'.format(t + 1, epoch,
                                                    total_reward))
                    break
예제 #6
0
def main():
    """Train a ``direct_policy_search`` agent with a pre-trained state model.

    Next-state model weights are loaded from a pickle file and assigned to
    the agent through ``agent.assign_ops1`` before interaction starts. After
    every environment step the transition is stored in replay memory and the
    agent is trained on the states of a sampled batch. A summary line is
    printed whenever an episode terminates.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment",
                        type=str,
                        default='MountainCarContinuous-v0')
    parser.add_argument("--unroll-steps", type=int, default=20)
    parser.add_argument("--time-steps", type=int, default=30000)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--discount-factor", type=float, default=1.)
    parser.add_argument("--goal-position", type=float, default=.45)
    args = parser.parse_args()

    env = gym.make(args.environment)
    # NOTE(review): the goal position (a float) is passed as the seed;
    # presumably a custom environment interprets it that way - confirm.
    env.seed(seed=args.goal_position)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_bound_high = env.action_space.high
    action_bound_low = env.action_space.low

    agent = direct_policy_search(state_dim, action_dim, action_bound_high,
                                 action_bound_low, args.unroll_steps,
                                 args.discount_factor, 1,
                                 'direct_policy_search')

    # Replay memory
    memory = Memory(args.replay_mem_size)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Load the next-state model weights; the context manager closes the
        # file handle, which the bare open() call used to leak.
        weights_path = (
            '../custom_environments/weights/mountain_car_continuous_next_state.p'
        )
        with open(weights_path, 'rb') as weights_file:
            weights = pickle.load(weights_file)
        sess.run(agent.assign_ops1,
                 feed_dict=dict(zip(agent.placeholders_state, weights)))

        state = env.reset()
        total_rewards = 0.0
        epoch = 1
        for time_steps in range(args.time_steps):
            env.render()
            action = agent.act(sess, state)
            next_state, reward, done, _ = env.step(action)
            total_rewards += float(reward)

            # Store the transition; states/actions are kept 2-D so sampled
            # rows can be concatenated along axis 0 below.
            memory.add([
                np.atleast_2d(state),
                np.atleast_2d(action), reward,
                np.atleast_2d(next_state), done
            ])

            # Training step: only the state column of the batch is used.
            batch = np.array(memory.sample(args.batch_size))
            assert len(batch) > 0
            states = np.concatenate(batch[:, 0], axis=0)
            agent.train(sess, states)

            # s <- s'
            state = np.copy(next_state)

            if done:
                # A single pre-formatted string prints identically under
                # Python 2 and Python 3.
                print('time steps {} epoch {} total rewards {} unroll {}'
                      .format(time_steps, epoch, total_rewards,
                              args.unroll_steps))
                epoch += 1
                total_rewards = 0.
                state = env.reset()
예제 #7
0
파일: ddpg2.py 프로젝트: tsetimmy/new_drl
def main():
    """Train an ``actorcritic`` (DDPG) agent with OU exploration noise.

    The script runs ``--epochs`` episodes. The exploration scale decays as
    ``((T - t) / T) ** 4`` over the first ``--time-steps`` global steps and
    is zero afterwards. After every environment step the transition is
    stored in replay memory, the agent is trained on a sampled batch and the
    target networks are updated. A summary line is printed per episode.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default='Pendulum-v0')
    parser.add_argument("--action-dim", type=int, default=1)
    parser.add_argument("--state-dim", type=int, default=1)
    parser.add_argument("--epochs", type=int, default=30000)
    parser.add_argument("--time-steps", type=int, default=30000)
    parser.add_argument('--tau',
                        type=float,
                        help='soft target update parameter',
                        default=0.01)
    parser.add_argument("--action-bound", type=float, default=1.)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--learning-rate", type=float, default=.9)

    # NOTE(review): --mode is validated but not otherwise read in this
    # function.
    parser.add_argument("--mode", type=str, default='none')
    args = parser.parse_args()
    assert args.mode in ['none', 'test', 'transfer']

    # Initialize environment; the parsed --state-dim/--action-dim values are
    # overwritten with the dimensions reported by the environment.
    env = gym.make(args.environment)
    args.state_dim = env.observation_space.shape[0]
    args.action_dim = env.action_space.shape[0]
    args.action_bound_high = env.action_space.high
    args.action_bound_low = env.action_space.low

    # Action bounds must be symmetric around zero in every dimension.
    assert len(args.action_bound_high) == len(args.action_bound_low)
    for high, low in zip(args.action_bound_high, args.action_bound_low):
        assert high == -low
    print(args)
    print(sys.argv)

    # Networks
    ddpg = actorcritic(state_shape=[None, args.state_dim],
                       action_shape=[None, args.action_dim],
                       output_bound_low=args.action_bound_low,
                       output_bound_high=args.action_bound_high,
                       learning_rate=args.learning_rate,
                       tau=args.tau)

    # Replay memory
    memory = Memory(args.replay_mem_size)

    # Actor noise
    exploration_strategy = OUStrategy(ddpg, env)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        ddpg.copy_target(sess)

        # Global step counter across episodes (a float, as before, so the
        # printed value keeps its ".0" suffix).
        time_steps = 0.
        for epoch in range(args.epochs):
            state = env.reset()
            total_rewards = 0.0
            ts = 0
            while True:
                # Choose an action
                time_steps += 1.
                ts += 1
                if time_steps >= args.time_steps:
                    exploration = 0.
                else:
                    exploration = (float(args.time_steps - time_steps) /
                                   float(args.time_steps))**4
                action = exploration_strategy.action(
                    sess, state[np.newaxis, ...], exploration)
                # Execute action
                state1, reward, done, _ = env.step(action)
                total_rewards += float(reward)
                # Store tuple in replay memory
                memory.add([
                    state[np.newaxis, ...], action[np.newaxis, ...], reward,
                    state1[np.newaxis, ...], done
                ])

                # Training step
                batch = np.array(memory.sample(args.batch_size))
                assert len(batch) > 0
                states = np.concatenate(batch[:, 0], axis=0)
                actions = np.concatenate(batch[:, 1], axis=0)
                rewards = batch[:, 2]
                states1 = np.concatenate(batch[:, 3], axis=0)
                dones = batch[:, 4]

                ddpg.train(sess, states, actions, rewards, states1, dones)

                # Update target networks
                ddpg.update_target(sess)

                state = state1.copy()
                if done:
                    # A single pre-formatted string prints identically
                    # under Python 2 and Python 3.
                    print('time steps {} epoch {} total rewards {} '
                          'epoch ts: {}'.format(time_steps, epoch,
                                                total_rewards, ts))
                    break
예제 #8
0
class Agent():
    """DDPG agent: local/target actor and critic, OU noise, replay memory."""

    def __init__(self, state_size, action_size, random_seed):
        """
        Args:
        ======
            state_size (int): state dim
            action_size (int): action dim
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the generator first and then
        # store the seed value itself.
        random.seed(random_seed)
        self.seed = random_seed

        # actor net initialization
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # critic net initialization
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Ornstein-Uhlenbeck Exploration Noise Process
        self.noise = OUNoise(action_space=action_size, seed=random_seed)

        # Replay memory init
        self.memory = Memory(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, states, actions, rewards, next_states, dones, is_learning_step, saving_wrong_step_prob = 0.9):
        """Save experiences in replay memory and optionally learn from them.

        A transition is always stored when its reward is positive; otherwise
        it is stored with probability ``saving_wrong_step_prob``. When
        ``is_learning_step`` is true and the memory holds more than
        ``BATCH_SIZE`` items, ten sample-and-learn iterations are run.
        """
        # Save experience / reward
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
            if reward > 0 or random.uniform(0, 1) <= saving_wrong_step_prob:
                self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and is_learning_step:
            for _ in range(10):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Return the local actor's action for ``state``, clipped to [-1, 1].

        When ``add_noise`` is true, the next value of the noise process is
        added before clipping.
        """
        state = torch.from_numpy(state).float().to(device)
        # Evaluate in eval mode without gradients, then restore train mode.
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.evolve_state()
        return np.clip(action, -1, 1)

    def act_on_all_agents(self, states):
        """Apply ``act`` (with noise) to each state vector in ``states``.

        ``np.vectorize`` loops over every leading axis of ``states`` and
        calls ``act`` on each last-axis vector. ``self.act`` is already
        bound, so no ``excluded`` argument is needed.
        """
        vectorized_act = np.vectorize(self.act, signature='(n),()->(k)')
        return vectorized_act(states, True)

    def reset(self):
        """Reset the exploration-noise process."""
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update actor and critic nets parameters

        Args:
        ======
            experiences (Tuple[torch.Tensor]): experience tuples
            gamma (float): bellman discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # The Bellman target is a constant for the critic regression, so it
        # is computed without building a graph through the target networks.
        with torch.no_grad():
            # Get predicted next-state actions and Q values from target models
            actions_next = self.actor_target(next_states)
            Q_targets_next = self.critic_target(next_states, actions_next)
            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """In place: ``target <- tau * local + (1 - tau) * target``."""
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
예제 #9
0
def DQN():
    """Train a DQN agent on ``Environment``, plot the rewards, then replay.

    Hyper-parameters come from the command line; ``--action-size`` and
    ``--input-shape`` are overwritten with the values reported by the
    environment. Training runs for ``--epochs`` episodes with a fixed
    exploration rate of 0.1, copying the online network into the target
    network every ``--target-update-freq`` steps. Afterwards the per-episode
    reward curve is saved with ``plt.savefig("DQN")`` and shown, and ten
    greedy episodes are rendered.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default='CartPole-v0')
    parser.add_argument("--action-size", type=int, default=2)
    parser.add_argument("--input-shape", type=list, default=[None, 4])
    parser.add_argument("--target-update-freq", type=int, default=200)
    parser.add_argument("--epsilon-max", type=float, default=1.)
    parser.add_argument("--epsilon-min", type=float, default=.01)
    parser.add_argument("--epsilon-decay", type=float, default=.001)

    parser.add_argument("--discount-factor", type=float, default=.99)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--epochs", type=int, default=1000)

    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    args = parser.parse_args()

    env = Environment()
    # The environment is authoritative for the network dimensions.
    args.action_size = env.nActions
    args.input_shape = [None, env.stateShape]

    print(args)

    # Exploration rate is held constant; the decay schedule is disabled.
    epsilon = 0.1  # args.epsilon_max

    # Replay memory
    memory = Memory(args.replay_mem_size)

    # Global step counter, used only to schedule target-network updates.
    time_step = 0.

    # Online network and target network, plus the ops copying one to the other.
    qnet = qnetwork(input_shape=args.input_shape,
                    action_size=args.action_size,
                    scope='qnet')
    tnet = qnetwork(input_shape=args.input_shape,
                    action_size=args.action_size,
                    scope='tnet')
    update_ops = update_target_graph('qnet', 'tnet')

    rewardHistory = np.zeros(args.epochs)
    env.render()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # ---- training ----
        for epoch in range(args.epochs):
            total_reward = 0
            state = env.reset()
            while True:
                # Epsilon-greedy action selection.
                if np.random.rand() < epsilon:
                    action = np.random.randint(args.action_size)
                else:
                    action = qnet.act(sess, state)
                [next_state, reward, done] = env.step(action)
                total_reward += reward
                rewardHistory[epoch] += reward

                # Add to memory
                memory.add([state, action, reward, next_state, done])

                time_step += 1.
                # Disabled decay schedule:
                # epsilon = args.epsilon_min + (args.epsilon_max - args.epsilon_min) * np.exp(-args.epsilon_decay * time_step)

                # Training step
                batch = np.array(memory.sample(args.batch_size))
                qnet.train(sess, batch, args.discount_factor, tnet)

                # s <- s'
                state = np.copy(next_state)

                # Update target network
                if int(time_step) % args.target_update_freq == 0:
                    sess.run(update_ops)

                if done:
                    # Single-argument print: same output on Python 2 and 3.
                    print('epoch: {} total_rewards: {}'.format(
                        epoch, total_reward))
                    break

        # Disabled debugging aid that dumped the greedy policy per velocity:
        # np.set_printoptions(threshold=np.nan)
        # for v in range(-5, 5):
        #     policy = np.zeros((env.W, env.W), dtype='int')
        #     for x in range(env.W):
        #         for y in range(env.W):
        #             policy[x,y] = qnet.act(sess, np.array([x,y,1,v]))
        #     print(policy)

        plt.xlabel('episode #')
        plt.ylabel('reward')
        plt.plot(rewardHistory)
        plt.savefig("DQN")
        plt.show()

        # ---- greedy replay ----
        # The replay no longer writes into ``rewardHistory``: that array has
        # ``args.epochs`` entries, so indexing it with 0..9 raised IndexError
        # for runs shorter than ten epochs and otherwise polluted the
        # training history.
        for epoch in range(10):
            total_reward = 0
            state = env.reset()
            while True:
                env.render()
                action = qnet.act(sess, state)
                [next_state, reward, done] = env.step(action)
                total_reward += reward

                # s <- s'
                state = np.copy(next_state)

                if done:
                    print('epoch: {} total_rewards: {}'.format(
                        epoch, total_reward))
                    break
예제 #10
0
class SACagent:
    """Soft actor-critic agent with a value net, twin soft-Q nets and a policy.

    The entropy temperature is not learned: ``update`` anneals it linearly
    from ``maxTemp`` over ``tempTimeScale`` steps and clamps it at ``minTemp``.
    """

    def __init__(self,
                 state_dim,
                 action_dim=2,
                 hidden_dim=256,
                 lr=1e-3,
                 gamma=0.99,
                 tau=1e-2,
                 max_memory_size=1000000,
                 action_scales=None,
                 maxTemp=0.5,
                 minTemp=0.1,
                 tempTimeScale=500000.0):
        """Build the networks, optimizers and replay memory.

        Args:
            state_dim: size of the state vector fed to every network.
            action_dim: size of the action vector.
            hidden_dim: hidden width passed to every network.
            lr: learning rate shared by all four Adam optimizers.
            gamma: discount factor used in the soft-Q target.
            tau: interpolation weight of the target value net soft update.
            max_memory_size: capacity passed to ``Memory``.
            action_scales: per-action scales handed to ``PolicyNetwork``;
                ``None`` selects ``[.22, .5]``.
            maxTemp: initial entropy temperature.
            minTemp: lower bound of the entropy temperature.
            tempTimeScale: number of steps over which the temperature would
                decay linearly from ``maxTemp`` to zero (before clamping).
        """
        # A fresh list per instance; a list literal as the default would be
        # shared by every agent created without this argument.
        if action_scales is None:
            action_scales = [.22, .5]

        # use cuda?
        use_cuda = torch.cuda.is_available()
        device = torch.device("cuda" if use_cuda else "cpu")
        self.device = device
        # Params
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.hidden_dim = hidden_dim
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.maxTemp = maxTemp
        self.minTemp = minTemp
        self.tempTimeScale = tempTimeScale
        self.action_scales = action_scales

        # Memory replay buffer
        self.memory = Memory(max_memory_size)

        # Initialize all networks
        self.value_net = ValueNetwork(self.state_dim,
                                      self.hidden_dim).to(device)
        self.target_value_net = ValueNetwork(self.state_dim,
                                             self.hidden_dim).to(device)
        self.soft_q_net1 = SoftQNetwork(self.state_dim, self.action_dim,
                                        self.hidden_dim).to(device)
        self.soft_q_net2 = SoftQNetwork(self.state_dim, self.action_dim,
                                        self.hidden_dim).to(device)
        self.policy_net = PolicyNetwork(self.state_dim, self.action_dim,
                                        self.hidden_dim, self.action_scales,
                                        device).to(device)

        # Copy initial parameters from value net to target value net
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param.data)

        self.value_criterion = nn.MSELoss()
        self.soft_q_criterion1 = nn.MSELoss()
        self.soft_q_criterion2 = nn.MSELoss()

        # One learning rate for every network.
        value_lr = self.lr  #3e-4
        soft_q_lr = self.lr  #3e-4
        policy_lr = self.lr  #3e-4

        self.value_optimizer = optim.Adam(self.value_net.parameters(),
                                          lr=value_lr)
        self.soft_q_optimizer1 = optim.Adam(self.soft_q_net1.parameters(),
                                            lr=soft_q_lr)
        self.soft_q_optimizer2 = optim.Adam(self.soft_q_net2.parameters(),
                                            lr=soft_q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_lr)

    def get_action(self, state):
        """Return the policy's action for ``state`` as a NumPy array."""
        action = self.policy_net.get_action(state)
        # ``.cpu()`` is a no-op for CPU tensors and is required before
        # ``.numpy()`` when the policy lives on the GPU.
        return action.detach().cpu().numpy()

    def update(self, batch_size, t):
        """Run one gradient step on the Q nets, the value net and the policy.

        Args:
            batch_size: number of transitions requested from the replay memory.
            t: current time step, used only to anneal the entropy temperature.
        """
        print("STARTING NETWORK UPDATES")
        # The memory yields no ``done`` flag, so targets always bootstrap.
        state, action, reward, next_state = self.memory.sample(batch_size)

        state = torch.FloatTensor(state).to(self.device)
        next_state = torch.FloatTensor(next_state).to(self.device)
        action = torch.FloatTensor(action).to(self.device)
        reward = torch.FloatTensor(reward).to(self.device)

        predicted_q_value1 = self.soft_q_net1(state, action)
        predicted_q_value2 = self.soft_q_net2(state, action)
        predicted_value = self.value_net(state)
        new_action, log_prob, epsilon, mean, log_std = self.policy_net.evaluate(
            state)
        # Sum of the per-dimension log-probabilities, kept as a column.
        log_prob_sum = torch.sum(log_prob, dim=1)
        joint_entropy = log_prob_sum.unsqueeze(1)

        # Training Q Function
        # NOTE(review): assumes ``reward`` broadcasts against the value net's
        # output shape -- confirm against Memory.sample.
        target_value = self.target_value_net(next_state)
        target_q_value = reward + self.gamma * target_value
        q_value_loss1 = self.soft_q_criterion1(predicted_q_value1,
                                               target_q_value.detach())
        q_value_loss2 = self.soft_q_criterion2(predicted_q_value2,
                                               target_q_value.detach())
        print("Q1 LOSS = " + str(q_value_loss1) + "   Q2 LOSS = " +
              str(q_value_loss2))

        self.soft_q_optimizer1.zero_grad()
        q_value_loss1.backward()
        self.soft_q_optimizer1.step()
        self.soft_q_optimizer2.zero_grad()
        q_value_loss2.backward()
        self.soft_q_optimizer2.step()

        # Training Value Function
        # Pessimistic estimate: element-wise minimum of the two Q nets.
        predicted_new_q_value = torch.min(self.soft_q_net1(state, new_action),
                                          self.soft_q_net2(state, new_action))
        # Linear temperature annealing, clamped from below at minTemp.
        alpha = max(self.minTemp,
                    (self.maxTemp - self.maxTemp * (t / self.tempTimeScale)))
        print("ALPHA = " + str(alpha) + " min = " + str(self.minTemp) +
              " max = " + str(self.maxTemp))
        target_value_func = predicted_new_q_value - alpha * joint_entropy
        value_loss = self.value_criterion(predicted_value,
                                          target_value_func.detach())
        print("VALUE LOSS = " + str(value_loss))
        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        # Training Policy Function
        policy_loss = (alpha * joint_entropy - predicted_new_q_value).mean()
        print("POLICY LOSS = " + str(policy_loss))
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # Soft update of the target value net towards the value net.
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) +
                                    param.data * self.tau)
        print("DONE WITH NETWORK UPDATES")
예제 #11
0
def main():
    """Train DDPG on a gym task, augmented with model-generated experience.

    Real transitions go into a replay memory and into a 500-entry buffer
    ``B``. Whenever ``B`` is full, a state model and a reward model
    (``gp_model``) are re-fitted on its contents. Once the models have been
    trained, every ``--batch-size`` steps they are rolled out ``ell`` steps
    from sampled real next-states with uniformly random actions, and the
    resulting transitions are stored in a separate "fictional" memory. Each
    environment step performs ``I`` DDPG updates on real data, each followed
    by ``ell`` updates on fictional data.

    ``--mode`` selects the seeding policy: ``test`` re-seeds with 1 on every
    reset, ``transfer`` switches from seed 1 to seed 0 (and clears the replay
    memory) one third of the way through the run.
    """
    import copy  # local import: only needed for the buffer snapshot below

    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default='Pendulum-v0')
    parser.add_argument("--action-dim", type=int, default=1)
    parser.add_argument("--state-dim", type=int, default=1)
    #parser.add_argument("--epochs", type=int, default=30000)
    parser.add_argument("--time-steps", type=int, default=30000)
    parser.add_argument('--tau', type=float, help='soft target update parameter', default=0.01)
    parser.add_argument("--action-bound", type=float, default=1.)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--learning-rate", type=float, default=.9)

    parser.add_argument("--mode", type=str, default='none')
    args = parser.parse_args()
    assert args.mode in ['none', 'test', 'transfer']

    # Initialize environment; it overrides the dimension flags.
    env = gym.make(args.environment)
    args.state_dim = env.observation_space.shape[0]
    args.action_dim = env.action_space.shape[0]
    args.action_bound_high = env.action_space.high
    args.action_bound_low = env.action_space.low

    # Only symmetric action bounds are accepted.
    assert len(args.action_bound_high) == len(args.action_bound_low)
    for i in range(len(args.action_bound_high)):
        assert args.action_bound_high[i] == -args.action_bound_low[i]
    print(args)

    # Networks
    ddpg = actorcritic(state_shape=[None, args.state_dim],
                       action_shape=[None, args.action_dim],
                       output_bound_low=args.action_bound_low,
                       output_bound_high=args.action_bound_high,
                       learning_rate=args.learning_rate,
                       tau=args.tau)

    # Learned dynamics (smodel) and reward (rmodel) models.
    model_been_trained = False
    smodel = gp_model([None, args.state_dim], [None, args.action_dim], [None, args.state_dim], epochs=100)
    rmodel = gp_model([None, args.state_dim], [None, args.action_dim], [None, 1], epochs=100)
    Bold = Memory(500)
    B = Memory(500)
    ell = 1  # Unroll depth
    I = 5  # Number of updates per timestep
    memory_fictional = Memory(args.replay_mem_size)

    # Replay memory
    memory = Memory(args.replay_mem_size)

    # Actor noise
    exploration_strategy = OUStrategy(ddpg, env)

    # Step at which 'transfer' mode switches seeds. Floor division keeps the
    # Python 2 integer semantics on Python 3, where ``/`` would yield a float
    # that a step counter only matches when time_steps is a multiple of 3.
    switch_step = args.time_steps // 3

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        ddpg.copy_target(sess)

        if args.mode in ['test', 'transfer']:
            env.seed(1)
        state = env.reset()
        total_rewards = 0.0
        epoch = 1
        for time_steps in range(args.time_steps):
            # Exploration scale decays from 1 to 0 with the 4th power of the
            # remaining fraction of the run.
            exploration = (float(args.time_steps - time_steps) / float(args.time_steps)) ** 4
            action = exploration_strategy.action(sess, state[np.newaxis, ...], exploration)
            # Execute action
            state1, reward, done, _ = env.step(action)
            total_rewards += float(reward)
            # Store tuple in replay memory and in the model-fitting buffer
            memory.add([state[np.newaxis, ...], action[np.newaxis, ...], reward, state1[np.newaxis, ...], done])
            B.add([state[np.newaxis, ...], action[np.newaxis, ...], reward, state1[np.newaxis, ...], done])

            # ---- generate fictional experience with the learned models ----
            if time_steps % args.batch_size == 0 and time_steps != 0 and model_been_trained and ell > 0:
                batch = np.array(memory.sample(args.batch_size))
                assert len(batch) > 0
                next_states = np.concatenate([ele[3] for ele in batch], axis=0)

                for _ in range(ell):
                    states = np.copy(next_states)
                    actions = np.random.uniform(low=args.action_bound_low,
                                                high=args.action_bound_high,
                                                size=[states.shape[0], args.action_dim])
                    rewards = rmodel.predict(sess, states, actions)
                    next_states = smodel.predict(sess, states, actions)

                    # Distinct loop names: the rollout must not clobber the
                    # live ``state`` / ``action`` / ``reward`` of this step.
                    for f_state, f_action, f_reward, f_next_state in zip(states, actions, rewards, next_states):
                        memory_fictional.add([f_state[np.newaxis, ...], f_action[np.newaxis, ...], f_reward, f_next_state[np.newaxis, ...], False])

            # ---- DDPG updates ----
            for _ in range(I):
                # Training step on real experience
                batch = np.array(memory.sample(args.batch_size))
                assert len(batch) > 0
                states = np.concatenate(batch[:, 0], axis=0)
                actions = np.concatenate(batch[:, 1], axis=0)
                rewards = batch[:, 2]
                states1 = np.concatenate(batch[:, 3], axis=0)
                dones = batch[:, 4]
                ddpg.train(sess, states, actions, rewards, states1, dones)
                ddpg.update_target(sess)

                for _ in range(ell):
                    # Training step for fictional experience
                    batch = np.array(memory_fictional.sample(args.batch_size))
                    if len(batch) > 0:
                        states = np.concatenate(batch[:, 0], axis=0)
                        actions = np.concatenate(batch[:, 1], axis=0)
                        rewards = batch[:, 2]
                        states1 = np.concatenate(batch[:, 3], axis=0)
                        dones = batch[:, 4]
                        ddpg.train(sess, states, actions, rewards, states1, dones)
                        ddpg.update_target(sess)

            # ---- re-fit the models whenever the buffer has filled up ----
            if len(B.mem) == B.max_size and ell > 0:
                Bold = copy.deepcopy(B)
                B.mem = []
                states = np.concatenate([ele[0] for ele in Bold.mem], axis=0)
                actions = np.concatenate([ele[1] for ele in Bold.mem], axis=0)
                rewards = np.array([ele[2] for ele in Bold.mem])
                next_states = np.concatenate([ele[3] for ele in Bold.mem], axis=0)

                rmodel.train(sess, states, actions, rewards[..., np.newaxis])
                smodel.train(sess, states, actions, next_states)
                model_been_trained = True

            # s <- s'
            state = np.copy(state1)
            if done:
                # Single-argument print: same output on Python 2 and 3.
                print('time steps {} epoch {} total rewards {}'.format(
                    time_steps, epoch, total_rewards))
                epoch += 1
                total_rewards = 0.
                if args.mode == 'transfer':
                    if time_steps >= switch_step:
                        env.seed(0)
                    else:
                        env.seed(1)
                elif args.mode == 'test':
                    env.seed(1)
                state = env.reset()

            if args.mode == 'transfer':
                if time_steps == switch_step:
                    # Drop experience gathered before the switch.
                    memory = Memory(args.replay_mem_size)
예제 #12
0
def main():
    """Train the joint Q / state / reward model on a discrete gym task.

    Each environment step stores the transition, samples two batches from
    the replay memory and runs the model's ``updateQ``, ``updateS`` and
    ``updateR`` steps; the ops returned by ``init_model`` are run every
    ``--target-update-freq`` steps. Exploration is epsilon-greedy with an
    exponentially decaying epsilon. After training, the results directory is
    uploaded with ``gym.upload`` if an API key is available in the
    ``OPENAI_GYM_API_KEY`` environment variable.
    """
    import os  # local import: only needed for the API-key lookup below

    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default='CartPole-v0')
    parser.add_argument("--action-size", type=int, default=2)
    parser.add_argument("--input-shape", type=list, default=[None, 4])
    parser.add_argument("--target-update-freq", type=int, default=200)
    parser.add_argument("--epsilon-max", type=float, default=1.)
    parser.add_argument("--epsilon-min", type=float, default=.01)
    parser.add_argument("--epsilon-decay", type=float, default=.001)

    parser.add_argument("--learning-rate", type=float, default=.99)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--epochs", type=int, default=30000)

    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    parser.add_argument("--latent-size",
                        type=int,
                        default=4,
                        help='Size of vector for Z')

    parser.add_argument("--model", type=str, default='gan')

    args = parser.parse_args()

    assert args.model in ['gan', 'gated', 'gated_reg']
    env = gym.make(args.environment)
    # The environment is authoritative for the network dimensions.
    args.action_size = env.action_space.n
    args.input_shape = [None, env.observation_space.shape[0]]
    print(args)

    # Exploration rate, decayed after every step below.
    epsilon = args.epsilon_max

    # Replay memory
    memory = Memory(args.replay_mem_size)

    # Global step counter (drives epsilon decay and target updates).
    time_step = 0.

    # Initialize the model
    jqnet, update_ops = init_model(args.input_shape, args.action_size,
                                   args.latent_size, args.learning_rate,
                                   args.model)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(args.epochs):
            total_reward = 0
            observation = env.reset()
            for t in range(1000000):
                # Greedy action, replaced by a random one with prob. epsilon.
                action = jqnet.get_action(sess, observation)
                if np.random.rand() < epsilon:
                    action = env.action_space.sample()
                observation1, reward, done, info = env.step(action)
                total_reward += reward

                # Add to memory
                memory.add([observation, action, reward, observation1, done])

                # Reduce epsilon
                time_step += 1.
                epsilon = args.epsilon_min + (
                    args.epsilon_max - args.epsilon_min) * np.exp(
                        -args.epsilon_decay * time_step)

                # Training step
                batch = np.array(memory.sample(args.batch_size))
                assert len(batch) > 0
                states = np.vstack(batch[:, 0])
                actions = np.array(batch[:, 1])
                rewards = batch[:, 2]
                states1 = np.vstack(batch[:, 3])
                dones = batch[:, 4].astype(np.float32)

                # Second, independently sampled batch of (state, action).
                batch2 = np.array(memory.sample(args.batch_size))
                assert len(batch2) > 0
                states2 = np.vstack(batch2[:, 0])
                actions2 = np.array(batch2[:, 1])

                # Update Q
                jqnet.updateQ(sess, states, actions, rewards, states1, dones,
                              states2, actions2, len(batch), args.latent_size)

                # Update state model
                jqnet.updateS(sess, states, actions, states1, states2,
                              actions2, len(batch), args.latent_size)

                # Update reward model
                jqnet.updateR(sess, states, actions, rewards, states2,
                              actions2, len(batch), args.latent_size)

                # Set observation
                observation = observation1

                # Periodically run the update ops returned by init_model.
                if int(time_step) % args.target_update_freq == 0:
                    sess.run(update_ops)

                if done:
                    # Single-argument print: same output on Python 2 and 3.
                    print("Episode finished after {} timesteps epoch {} "
                          "total_reward {}".format(t + 1, epoch, total_reward))
                    break

    env.close()

    # The API key used to be hard-coded in this call. Credentials do not
    # belong in source control, so it is now read from the environment and
    # the upload is skipped when it is absent.
    # NOTE(review): ``rand_no`` is not defined in this function; it must be a
    # module-level name -- confirm.
    api_key = os.environ.get('OPENAI_GYM_API_KEY')
    if api_key:
        gym.upload('/tmp/cartpole-experiment-' + str(rand_no),
                   api_key=api_key)
    else:
        print('OPENAI_GYM_API_KEY is not set; skipping gym.upload')
예제 #13
0
                    else:
                        # Stack the frame of the next_state
                        next_state, stacked_frames = stack_frames(
                            stacked_frames, next_state, False)

                        # Add experience to memory
                        experience = state, action, reward, next_state, done
                        memory.store(experience)

                        # st+1 is now our current state
                        state = next_state

                    ### LEARNING PART
                    # Obtain random mini-batch from memory
                    tree_idx, batch, ISWeights_mb = memory.sample(batch_size)
                    # batch = memory.sample(batch_size)

                    states_mb = np.array([each[0][0] for each in batch],
                                         ndmin=3)
                    # print_var("states_mb", states_mb.shape)
                    actions_mb = np.array([each[0][1] for each in batch])
                    # print_var("actions_mb", actions_mb.shape)
                    # print_var("actions_mb", actions_mb)

                    rewards_mb = np.array([each[0][2] for each in batch])
                    # print_var("rewards_mb", rewards_mb.shape)
                    # print_var("rewards_mb", rewards_mb)
                    next_states_mb = np.array([each[0][3] for each in batch],
                                              ndmin=3)
                    # print_var("next_states_mb", next_states_mb.shape)
class Agent:
    """DQN agent for a ``gym_super_mario_bros`` level.

    Hyper-parameters (``state_size``, ``action_size``, ``learning_rate``,
    ``memory_size``, ``stack_size``, ``pretrain_length``, ``batch_size``,
    ``gamma``, the exploration schedule, ...) are read from module-level
    names.
    """

    def __init__(self, level_name):
        """Create the environment and network, then pre-fill the memory.

        Args:
            level_name: environment id passed to ``gym_super_mario_bros.make``;
                also used to name the checkpoint file.
        """
        self.level_name = level_name
        # setup environment
        self.env = gym_super_mario_bros.make(level_name)
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)
        # one-hot encoded version of our actions (row i encodes action i)
        self.possible_actions = np.identity(self.env.action_space.n, dtype=int)

        # reset graph
        tf.reset_default_graph()

        # instantiate the DQNetwork
        self.DQNetwork = DQNetwork(state_size, action_size, learning_rate)

        # instantiate memory
        self.memory = Memory(max_size=memory_size)

        # initialize deque with zero images
        # (``np.int`` was removed in NumPy 1.24; the builtin ``int`` is the
        # same dtype.)
        # NOTE(review): maxlen is 4 regardless of ``stack_size`` -- confirm.
        self.stacked_frames = deque(
            [np.zeros((100, 128), dtype=int) for i in range(stack_size)],
            maxlen=4)

        # Pre-fill the replay memory with transitions from random actions.
        for i in range(pretrain_length):
            # If it's the first step
            if i == 0:
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(self.stacked_frames, state, True)

            # Get next state, the rewards, done by taking a random action
            choice = random.randrange(len(self.possible_actions))
            action = self.possible_actions[choice]
            next_state, reward, done, _ = self.env.step(choice)

            # stack the frames
            next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False)

            if done:
                # the episode is finished: the stored next state is all zeros
                next_state = np.zeros(state.shape)
                self.memory.add((state, action, reward, next_state, done))

                # start a new episode
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(self.stacked_frames, state, True)
            else:
                self.memory.add((state, action, reward, next_state, done))
                # our new state is now the next_state
                state = next_state

        # saver will help us save our model
        self.saver = tf.train.Saver()

        # setup tensorboard writer
        self.writer = tf.summary.FileWriter("logs/")

        # losses
        tf.summary.scalar("Loss", self.DQNetwork.loss)

        self.write_op = tf.summary.merge_all()

    def predict_action(self, sess, explore_start, explore_stop, decay_rate, decay_step, state, actions):
        """Pick an action epsilon-greedily.

        The exploration probability decays exponentially from
        ``explore_start`` towards ``explore_stop`` with ``decay_step``.

        Args:
            sess: session used to evaluate the Q-network.
            explore_start: exploration probability at step 0.
            explore_stop: asymptotic exploration probability.
            decay_rate: exponential decay rate.
            decay_step: number of steps taken so far.
            state: current stacked state (without batch axis).
            actions: unused; kept for interface compatibility.

        Returns:
            Tuple ``(action, choice, explore_probability)`` where ``choice``
            is the action index and ``action`` its one-hot row.
        """
        # first we randomize a number
        exp_exp_tradeoff = np.random.rand()

        explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)

        if explore_probability > exp_exp_tradeoff:
            # make a random action
            choice = random.randrange(len(self.possible_actions))
        else:
            # estimate the Q values of the state (a batch axis is prepended)
            Qs = sess.run(self.DQNetwork.output, feed_dict={self.DQNetwork.inputs_: state.reshape((1, *state.shape))})
            # take the biggest Q value (= best action)
            choice = np.argmax(Qs)

        action = self.possible_actions[choice]
        return action, choice, explore_probability

    @staticmethod
    def _bellman_targets(rewards_mb, dones_mb, qs_next_state, discount):
        """Return one Q-learning target per sample of a mini-batch.

        The target is ``r`` for terminal transitions and
        ``r + discount * max_a Q(s', a)`` otherwise.
        """
        rewards = np.asarray(rewards_mb, dtype=np.float64)
        terminal = np.asarray(dones_mb, dtype=bool)
        qs = np.asarray(qs_next_state)
        # Max over every non-batch axis of the next-state Q values.
        best_next = qs.max(axis=tuple(range(1, qs.ndim)))
        return np.where(terminal, rewards, rewards + discount * best_next)

    def _run_greedy_episodes(self, sess, render_step, episodes=1):
        """Play ``episodes`` greedy episodes and return their total rewards.

        ``render_step`` is called with no arguments after every environment
        step so that callers decide how each frame is rendered or captured.
        """
        total_test_rewards = []
        for episode in range(episodes):
            total_rewards = 0

            state = self.env.reset()
            state, self.stacked_frames = stack_frames(self.stacked_frames, state, True)

            print("****************************************************")
            print("EPISODE ", episode)

            while True:
                # Reshape the state
                state = state.reshape((1, *state_size))
                # Estimate the Q values of the state
                Qs = sess.run(self.DQNetwork.output, feed_dict={self.DQNetwork.inputs_: state})

                # Take the biggest Q value (= the best action)
                choice = np.argmax(Qs)

                # Perform the action and get the next_state, reward, and done information
                next_state, reward, done, _ = self.env.step(choice)
                render_step()

                total_rewards += reward

                if done:
                    print("Score", total_rewards)
                    total_test_rewards.append(total_rewards)
                    break

                next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False)
                state = next_state
        return total_test_rewards

    def play_notebook(self):
        """Play one greedy episode from the saved model and show it as a gif."""
        import matplotlib.pyplot as plt
        # imports to render env to gif
        from JSAnimation.IPython_display import display_animation
        from matplotlib import animation
        from IPython.display import display

        # http://mckinziebrandon.me/TensorflowNotebooks/2016/12/21/openai.html
        def display_frames_as_gif(frames):
            """
            Displays a list of frames as a gif, with controls
            """
            patch = plt.imshow(frames[0])
            plt.axis('off')

            def animate(i):
                patch.set_data(frames[i])

            anim = animation.FuncAnimation(plt.gcf(), animate, frames=len(frames), interval=50)
            display(display_animation(anim, default_mode='loop'))

        frames = []
        with tf.Session() as sess:
            # Load the model
            self.saver.restore(sess, "models/{0}.cpkt".format(self.level_name))

            # Capture every frame as an RGB array instead of showing a window.
            self._run_greedy_episodes(
                sess,
                lambda: frames.append(self.env.render(mode='rgb_array')))

            self.env.close()

        display_frames_as_gif(frames)

    def play(self):
        """Play one greedy episode from the saved model, rendering each step."""
        with tf.Session() as sess:
            # Load the model
            self.saver.restore(sess, "models/{0}.cpkt".format(self.level_name))

            self._run_greedy_episodes(sess, self.env.render)
            self.env.close()

    def train(self):
        """Train the Q-network and checkpoint it every 5 episodes."""
        with tf.Session() as sess:
            # initialize the variables
            sess.run(tf.global_variables_initializer())

            # number of steps taken so far (drives the epsilon decay)
            decay_step = 0

            # Defined up front so the end-of-episode report cannot hit an
            # unbound name if an episode ends before the first update.
            loss = None

            for episode in range(total_episodes):
                # set step to 0
                step = 0

                # initialize rewards of episode
                episode_rewards = []

                # make a new episode and observe the first state
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(self.stacked_frames, state, True)

                print("Episode:", episode)

                while step < max_steps:
                    step += 1
                    decay_step += 1

                    # predict an action
                    action, choice, explore_probability = self.predict_action(
                        sess, explore_start, explore_stop, decay_rate,
                        decay_step, state, self.possible_actions)

                    # perform the action and get the next_state, reward, and done information
                    next_state, reward, done, _ = self.env.step(choice)

                    if episode_render:
                        self.env.render()

                    # add the reward to total reward
                    episode_rewards.append(reward)

                    if done:
                        print("done")
                        # the episode ends so no next state
                        # NOTE(review): this blank frame is (110, 84) while
                        # __init__ uses (100, 128) -- confirm against
                        # stack_frames.
                        next_state = np.zeros((110, 84), dtype=int)
                        next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False)

                        # set step = max_steps to end episode
                        step = max_steps

                        # get total reward of the episode
                        total_reward = np.sum(episode_rewards)

                        print("Episode:", episode,
                              "Total reward:", total_reward,
                              "Explore P:", explore_probability,
                              "Training Loss:", loss)

                        # store transition <s_i, a, r_{i+1}, s_{i+1}> in memory
                        self.memory.add((state, action, reward, next_state, done))
                    else:
                        # stack frame of the next state
                        next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False)

                        # store transition <s_i, a, r_{i+1}, s_{i+1}> in memory
                        self.memory.add((state, action, reward, next_state, done))

                        # s_{i} := s_{i+1}
                        state = next_state

                    ### Learning part
                    # obtain random mini-batch from memory
                    batch = self.memory.sample(batch_size)
                    states_mb = np.array([each[0] for each in batch], ndmin=3)
                    actions_mb = np.array([each[1] for each in batch])
                    rewards_mb = np.array([each[2] for each in batch])
                    next_states_mb = np.array([each[3] for each in batch], ndmin=3)
                    dones_mb = np.array([each[4] for each in batch])

                    # get Q values for next_state
                    Qs_next_state = sess.run(self.DQNetwork.output, feed_dict={self.DQNetwork.inputs_: next_states_mb})

                    # One target per sample. The terminal check used to sit
                    # outside the per-sample loop, so only the last sample
                    # of the batch ever produced a target.
                    targets_mb = self._bellman_targets(
                        rewards_mb, dones_mb, Qs_next_state, gamma)

                    feed = {self.DQNetwork.inputs_: states_mb,
                            self.DQNetwork.target_Q: targets_mb,
                            self.DQNetwork.actions_: actions_mb}

                    loss, _ = sess.run([self.DQNetwork.loss, self.DQNetwork.optimizer],
                                       feed_dict=feed)

                    # write tf summaries (evaluated after the optimizer step)
                    summary = sess.run(self.write_op, feed_dict=feed)
                    self.writer.add_summary(summary, episode)
                    self.writer.flush()

                # save model every 5 episodes
                if episode % 5 == 0:
                    self.saver.save(sess, "models/{0}.cpkt".format(self.level_name))
                    print("Model Saved")
예제 #15
0
def main2():
    """Fit a gated-convolution model on random-policy Breakout transitions.

    Plays ``BreakoutDeterministic-v4`` with uniformly random actions and keeps
    sliding windows of the last ``length`` processed frames and actions.
    After every step the triple (previous frame stack, action window, current
    frame stack) is stored in a replay memory.  Once the memory holds at
    least ``batch_size`` entries, a sampled mini-batch is passed to
    ``gc.run2`` and the returned losses are printed.

    The outer loop never terminates on its own; stop the process to end
    training.
    """
    import gym
    from utils import Memory
    from utils import process_frame2

    env = gym.make('BreakoutDeterministic-v4')
    gc = gated_convolution2(shape=[None, 84, 84, 4],
                            nummap=128,
                            numfactors=128,
                            learning_rate=.001,
                            w=8,
                            s=1,
                            a_size=env.action_space.n)
    mem = Memory(50000)
    batch_size = 4
    steps = 1
    length = 4

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        while True:
            s = process_frame2(env.reset())

            # Both frame windows start as `length` copies of the first frame;
            # the action window is padded with -1 until real actions arrive.
            state = [s[..., np.newaxis]] * length
            state_ = [s[..., np.newaxis]] * length
            action = [-1] * length

            done = False
            while not done:
                #env.render()

                a = np.random.randint(env.action_space.n)
                s_, _, done, _ = env.step(a)
                s_ = process_frame2(s_)

                # Slide the "current" windows forward by one step.
                state_.pop(0)
                action.pop(0)
                state_.append(s_[..., np.newaxis])
                action.append(a)

                mem.add([
                    np.concatenate(state, axis=-1)[np.newaxis, ...],
                    np.array(action)[np.newaxis, ...],
                    np.concatenate(state_, axis=-1)[np.newaxis, ...]
                ])

                if len(mem.mem) >= batch_size:
                    batch = mem.sample(batch_size)
                    states = np.concatenate([b[0] for b in batch],
                                            axis=0).astype(np.float64) / 255.
                    actions = np.concatenate([b[1] for b in batch], axis=0)
                    states_ = np.concatenate([b[2] for b in batch],
                                             axis=0).astype(np.float64) / 255.

                    _, recon_loss, _, _, recon_action_loss = gc.run2(
                        sess, states, actions, states_)
                    # Single-argument print: same output on Python 2 and 3.
                    print('steps: %s recon_loss: %s recon_action_loss %s main2'
                          % (steps, recon_loss, recon_action_loss))

                # s <- s'.  Without this the stored "previous" stack would
                # stay at the reset frames for the whole episode.  A copy is
                # required because state_ is mutated in place above.
                state = list(state_)
                steps += 1
예제 #16
0
def main():
    """Train a Q-network with a target network on a discrete-action gym task.

    Hyper-parameters come from command-line flags.  For each of ``--epochs``
    episodes the agent acts epsilon-greedily (uniform random action with
    probability ``epsilon``, otherwise ``qnet.act``), stores the transition
    in a replay memory and calls ``qnet.train`` on a sampled mini-batch after
    every environment step.  ``epsilon`` decays exponentially with the global
    step count from ``--epsilon-max`` towards ``--epsilon-min``.  The ops
    returned by ``update_target_graph('qnet', 'tnet')`` are run every
    ``--target-update-freq`` steps.  The episode return is printed when the
    environment reports ``done``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default='CartPole-v0')
    # NOTE: --action-size and --input-shape are overwritten from the
    # environment right after parsing, so command-line values are ignored.
    parser.add_argument("--action-size", type=int, default=2)
    parser.add_argument("--input-shape", type=list, default=[None, 4])
    parser.add_argument("--target-update-freq", type=int, default=200)
    parser.add_argument("--epsilon-max", type=float, default=1.)
    parser.add_argument("--epsilon-min", type=float, default=.01)
    parser.add_argument("--epsilon-decay", type=float, default=.001)

    # NOTE(review): this value is forwarded as the third argument of
    # qnet.train; a default of 0.99 looks more like a discount factor than a
    # step size -- confirm against qnetwork.train.
    parser.add_argument("--learning-rate", type=float, default=.99)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--epochs", type=int, default=300)

    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    args = parser.parse_args()

    env = gym.make(args.environment)
    args.action_size = env.action_space.n
    args.input_shape = [None] + list(env.observation_space.shape)

    print(args)

    # Exploration rate, starts at its maximum.
    epsilon = args.epsilon_max

    # Replay memory
    memory = Memory(args.replay_mem_size)

    # Global count of environment steps across all episodes.
    time_step = 0

    # Online network, target network and the ops that sync them.
    qnet = qnetwork(input_shape=args.input_shape,
                    action_size=args.action_size,
                    scope='qnet')
    tnet = qnetwork(input_shape=args.input_shape,
                    action_size=args.action_size,
                    scope='tnet')
    update_ops = update_target_graph('qnet', 'tnet')

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(args.epochs):
            total_reward = 0
            state = env.reset()
            while True:
                #env.render()
                # Epsilon-greedy action selection.
                if np.random.rand() < epsilon:
                    action = np.random.randint(args.action_size)
                else:
                    action = qnet.act(sess, state)
                next_state, reward, done, _ = env.step(action)
                total_reward += reward

                # Add to memory
                memory.add([state, action, reward, next_state, done])

                # Reduce epsilon
                time_step += 1
                epsilon = args.epsilon_min + (
                    args.epsilon_max - args.epsilon_min) * np.exp(
                        -args.epsilon_decay * time_step)

                # Training step
                batch = np.array(memory.sample(args.batch_size))
                qnet.train(sess, batch, args.learning_rate, tnet)

                # s <- s'
                state = np.copy(next_state)

                # Update target network
                if time_step % args.target_update_freq == 0:
                    sess.run(update_ops)

                if done:
                    # Single-argument print: same output on Python 2 and 3.
                    print('epoch: %s total_rewards: %s' %
                          (epoch, total_reward))
                    break
예제 #17
0
def main():
    """Train an actor-critic agent alongside conditional-GAN environment models.

    Flags are parsed from the command line; ``--state-dim``,
    ``--input-shape``, ``--action-dim`` and ``--action-bound`` are overwritten
    from the gym environment after parsing.  Every environment step performs,
    in order:

    1. one actor/critic update on a mini-batch of stored transitions,
    2. ``--K`` updates of two conditional GANs (``cgan_state`` receives next
       states as ``X``, ``cgan_reward`` receives rewards as ``X``),
    3. ``--L`` actor/critic updates on transitions produced by the GAN
       generators from stored next-states and uniformly random actions.

    The episode return is printed when the environment reports ``done``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default='Pendulum-v0')
    parser.add_argument("--action-dim", type=int, default=1)
    parser.add_argument("--state-dim", type=int, default=1)
    parser.add_argument("--input-shape", type=list, default=[None, 1])
    parser.add_argument("--epochs", type=int, default=30000)
    # type=float so a value given on the command line is parsed as a number
    # instead of being left as a string.
    parser.add_argument('--tau',
                        type=float,
                        help='soft target update parameter',
                        default=0.001)
    parser.add_argument("--action-bound", type=float, default=1.)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--gamma", type=float, default=.99)

    parser.add_argument("--K",
                        type=int,
                        default=1,
                        help='The number of steps to train the environment')
    parser.add_argument(
        "--L",
        type=int,
        default=1,
        help='The number of Q-learning steps for hypothetical rollouts')
    parser.add_argument("--latent-size",
                        type=int,
                        default=4,
                        help='Size of vector for Z')

    args = parser.parse_args()

    # Initialize environment; dimensions and bounds come from the env, not
    # from the flags parsed above.
    env = gym.make(args.environment)
    args.state_dim = env.observation_space.shape[0]
    args.input_shape = [None, args.state_dim]
    args.action_dim = env.action_space.shape[0]
    #assert args.action_dim == 1
    args.action_bound = env.action_space.high
    print(args)

    state_shape = [None, args.state_dim]
    action_shape = [None, args.action_dim]

    # Networks: a source and a target copy of both actor and critic.
    actor_source = actor(state_shape=state_shape,
                         action_shape=action_shape,
                         output_bound=args.action_bound[0],
                         scope='actor_source')
    critic_source = critic(state_shape=state_shape,
                           action_shape=action_shape,
                           scope='critic_source')
    actor_target = actor(state_shape=state_shape,
                         action_shape=action_shape,
                         output_bound=args.action_bound[0],
                         scope='actor_target')
    critic_target = critic(state_shape=state_shape,
                           action_shape=action_shape,
                           scope='critic_target')

    # Initialize the GANs: one generates next states, one generates rewards.
    cgan_state = CGAN(input_shape=args.input_shape,
                      action_size=args.action_dim,
                      latent_size=args.latent_size,
                      gen_input_shape=args.input_shape,
                      continuous_action=True)
    cgan_reward = CGAN(input_shape=args.input_shape,
                       action_size=args.action_dim,
                       latent_size=args.latent_size,
                       gen_input_shape=[None, 1],
                       continuous_action=True)

    # Update ops use rate tau; copy ops use rate 1.
    update_target_actor = update_target_graph2('actor_source', 'actor_target',
                                               args.tau)
    update_target_critic = update_target_graph2('critic_source',
                                                'critic_target', args.tau)

    copy_target_actor = update_target_graph2('actor_source', 'actor_target',
                                             1.)
    copy_target_critic = update_target_graph2('critic_source', 'critic_target',
                                              1.)

    # Replay memory
    memory = Memory(args.replay_mem_size)

    # Actor noise
    actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(args.action_dim))

    def update_actor_critic(sess, states, actions, rewards, next_states,
                            dones):
        """Run one critic step, one actor step and one target-network update."""
        # Bootstrapped target: reward + gamma * Q_target(s', actor_target(s')),
        # with the bootstrap term zeroed wherever `dones` is true.
        next_actions = sess.run(actor_target.action,
                                feed_dict={actor_target.states: next_states})
        next_q = np.squeeze(sess.run(critic_target.Q,
                                     feed_dict={
                                         critic_target.states: next_states,
                                         critic_target.actions: next_actions
                                     }),
                            axis=-1)
        target_q = rewards + (
            1. - dones.astype(np.float32)) * args.gamma * next_q
        target_q = target_q[..., np.newaxis]
        sess.run([critic_source.critic_solver, critic_source.loss],
                 feed_dict={
                     critic_source.states: states,
                     critic_source.actions: actions,
                     critic_source.targetQ: target_q
                 })

        # Grab gradients from the critic and feed them to the actor's
        # optimizer op as dQ_by_da.
        critic_grads = sess.run(critic_source.grads,
                                feed_dict={
                                    critic_source.states: states,
                                    critic_source.actions: actions
                                })[0]
        sess.run(actor_source.opt,
                 feed_dict={
                     actor_source.states: states,
                     actor_source.dQ_by_da: critic_grads
                 })

        # Update target networks
        sess.run(update_target_critic)
        sess.run(update_target_actor)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Start with target networks identical to the source networks.
        sess.run(copy_target_critic)
        sess.run(copy_target_actor)

        for epoch in range(args.epochs):
            state = env.reset()
            total_rewards = 0.0
            while True:
                #env.render()
                # Choose an action: actor output plus exploration noise.
                action = sess.run(
                    actor_source.action,
                    feed_dict={actor_source.states: state[np.newaxis, ...]
                               })[0] + actor_noise()
                # Execute action
                state1, reward, done, _ = env.step(action)
                total_rewards += float(reward)
                # Store tuple in replay memory (arrays get a leading batch
                # axis so they can be concatenated when sampled).
                memory.add([
                    state[np.newaxis, ...],
                    action[np.newaxis, ...],
                    reward,
                    state1[np.newaxis, ...],
                    done
                ])

                # Training step: update actor critic using real experience
                batch = np.array(memory.sample(args.batch_size))
                assert len(batch) > 0
                update_actor_critic(
                    sess,
                    states=np.concatenate(batch[:, 0], axis=0),
                    actions=np.concatenate(batch[:, 1], axis=0),
                    rewards=batch[:, 2],
                    next_states=np.concatenate(batch[:, 3], axis=0),
                    dones=batch[:, 4])

                # Training step: update the environment model using real
                # experience (i.e., update the conditional GANs)
                for k in range(args.K):
                    batch = np.array(memory.sample(args.batch_size))

                    states = np.concatenate(batch[:, 0], axis=0)
                    actions = np.concatenate(batch[:, 1], axis=0)
                    rewards = batch[:, 2]
                    states1 = np.concatenate(batch[:, 3], axis=0)

                    # State model: discriminator step, then generator step.
                    sess.run([cgan_state.D_solver, cgan_state.D_loss],
                             feed_dict={
                                 cgan_state.states: states,
                                 cgan_state.actions: actions,
                                 cgan_state.Z: sample_z(len(batch),
                                                        args.latent_size),
                                 cgan_state.X: states1
                             })
                    sess.run([cgan_state.G_solver, cgan_state.G_loss],
                             feed_dict={
                                 cgan_state.states: states,
                                 cgan_state.actions: actions,
                                 cgan_state.Z: sample_z(len(batch),
                                                        args.latent_size)
                             })

                    # Reward model: discriminator step, then generator step.
                    sess.run([cgan_reward.D_solver, cgan_reward.D_loss],
                             feed_dict={
                                 cgan_reward.states: states,
                                 cgan_reward.actions: actions,
                                 cgan_reward.Z: sample_z(len(batch),
                                                         args.latent_size),
                                 cgan_reward.X: rewards[..., np.newaxis]
                             })
                    sess.run([cgan_reward.G_solver, cgan_reward.G_loss],
                             feed_dict={
                                 cgan_reward.states: states,
                                 cgan_reward.actions: actions,
                                 cgan_reward.Z: sample_z(len(batch),
                                                         args.latent_size)
                             })

                # Training step: update actor critic using imagination
                # rollouts
                for l in range(args.L):
                    batch = np.array(memory.sample(args.batch_size))
                    # Imagined transitions start from stored next-states and
                    # use uniformly random actions within the action bounds.
                    states_ = np.concatenate(batch[:, 3], axis=0)
                    actions = np.random.uniform(
                        env.action_space.low[0],
                        env.action_space.high[0],
                        size=(len(batch), env.action_space.shape[0]))
                    # Generated transitions are never treated as terminal.
                    dones = np.array([False] * len(batch))

                    G_sample_state = sess.run(
                        cgan_state.G_sample,
                        feed_dict={
                            cgan_state.states: states_,
                            cgan_state.actions: actions,
                            cgan_state.Z: sample_z(len(batch),
                                                   args.latent_size)
                        })
                    G_sample_reward = sess.run(
                        cgan_reward.G_sample,
                        feed_dict={
                            cgan_reward.states: states_,
                            cgan_reward.actions: actions,
                            cgan_reward.Z: sample_z(len(batch),
                                                    args.latent_size)
                        })
                    G_sample_reward = np.squeeze(G_sample_reward, axis=-1)

                    update_actor_critic(sess,
                                        states=states_,
                                        actions=actions,
                                        rewards=G_sample_reward,
                                        next_states=G_sample_state,
                                        dones=dones)

                state = np.copy(state1)
                if done:
                    # Single-argument print: same output on Python 2 and 3.
                    print('epoch %s total rewards %s' %
                          (epoch, total_rewards))
                    break
예제 #18
0
def main():
    """Fit a gated environment model on random-policy gym transitions.

    Creates ``Pendulum-v0``, detects whether its action space is discrete
    (has ``n``) or continuous (has ``shape``), and builds a
    ``gated_env_modeler`` accordingly.  Transitions produced by
    ``random_action`` are stored in a replay memory; whenever a full
    mini-batch of ``batch_size`` can be sampled, ``emg.update_model`` is run
    on it and the fetched losses are printed.

    The outer loop never terminates on its own; stop the process to end
    training.

    Raises:
        ValueError: if the action space exposes neither ``n`` nor a usable
            ``shape``.
    """
    import gym
    import sys
    import copy
    sys.path.append('../..')
    from utils import Memory

    # Other environments tried here: 'LunarLander-v2', 'CartPole-v0'.
    env = gym.make('Pendulum-v0')
    mem = Memory(1000000)
    batch_size = 32

    # Action spaces with `n` are treated as discrete, ones with `shape` as
    # continuous.  Only the lookup failures are caught, so unrelated errors
    # (e.g. KeyboardInterrupt) are no longer swallowed.
    try:
        a_size = env.action_space.n
        a_type = 'discrete'
    except AttributeError:
        try:
            a_size = env.action_space.shape[0]
            a_type = 'continuous'
        except (AttributeError, IndexError, TypeError):
            raise ValueError('Cannot find action size.')

    obs_shape = [None, env.observation_space.shape[0]]
    emg = gated_env_modeler(s_shape=obs_shape,
                            a_size=a_size,
                            out_shape=obs_shape,
                            a_type=a_type,
                            numfactors=256)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        while True:
            s = env.reset()

            done = False
            while not done:
                #env.render()
                a = random_action(a_size, a_type)
                s_, r, done, _ = env.step(a)

                mem.add([s, a, r, s_, done])
                batch = mem.sample(batch_size)
                # Train only when a full mini-batch came back.
                if len(batch) == batch_size:
                    states = np.stack([b[0] for b in batch], axis=0)
                    actions = np.stack([b[1] for b in batch], axis=0)
                    states_ = np.stack([b[3] for b in batch], axis=0)

                    _, loss_s, loss_a, loss_s_, loss = sess.run(
                        [
                            emg.update_model, emg.loss_s, emg.loss_a,
                            emg.loss_s_, emg.loss
                        ],
                        feed_dict={
                            emg.states: states,
                            emg.states_: states_,
                            emg.actions_placeholder: actions
                        })
                    # Single-argument print: same output on Python 2 and 3.
                    print('loss_s %s loss_a %s loss_s_ %s loss %s' %
                          (loss_s, loss_a, loss_s_, loss))

                # s <- s'
                s = copy.deepcopy(s_)
예제 #19
0
def main():
    """Run Bayesian policy search on ``Pendulum-v0`` for 30000 steps.

    Each step asks ``psb.act`` for an action, stores the transition in a
    replay memory, and calls ``psb.train_policy`` on the states of a sampled
    mini-batch.  When the environment reports ``done`` the step count, epoch
    and episode return are printed, the epoch counter is incremented and the
    environment is reset.

    NOTE: ``--pretrain-epochs`` is parsed but only referenced by a disabled
    ``psb.pretrain`` call.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--no-samples", type=int, default=50)
    parser.add_argument("--unroll-steps", type=int, default=20)
    parser.add_argument("--replay-mem-size", type=int, default=200)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--pretrain-epochs", type=int, default=100)
    args = parser.parse_args()

    print(args)

    env = gym.make('Pendulum-v0')

    # Initialize the agent
    psb = policy_search_bayesian(
        state_dim=env.observation_space.shape[0],
        action_dim=env.action_space.shape[0],
        observation_space_low=env.observation_space.low,
        observation_space_high=env.observation_space.high,
        no_basis=(6**4) + 1,
        action_bound_low=env.action_space.low,
        action_bound_high=env.action_space.high,
        unroll_steps=args.unroll_steps,
        no_samples=args.no_samples,
        discount_factor=.9)

    # Initialize the memory
    memory = Memory(args.replay_mem_size)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        #psb.pretrain(sess, args.pretrain_epochs)
        state = env.reset()
        total_rewards = 0.0
        epoch = 1
        for time_steps in range(30000):
            #env.render()
            # Get action and step in environment
            action = psb.act(sess, state, epoch)
            next_state, reward, done, _ = env.step(action)
            total_rewards += float(reward)

            # Store the transition; np.atleast_2d gives the arrays a leading
            # axis so sampled entries can be concatenated along axis 0.
            memory.add([
                np.atleast_2d(state),
                np.atleast_2d(action), reward,
                np.atleast_2d(next_state), done
            ])

            # Training step
            batch = memory.sample(args.batch_size)
            states = np.concatenate([b[0] for b in batch], axis=0)
            psb.train_policy(sess, states, epoch)

            # s <- s'
            state = np.copy(next_state)

            if done:
                # Single-argument print: same output on Python 2 and 3.
                print('time steps %s epoch %s total rewards %s' %
                      (time_steps, epoch, total_rewards))
                epoch += 1
                total_rewards = 0.
                # A disabled per-episode variant used to sit here: it
                # collected the episode's transitions in a plain list, then
                # called psb.train_dynamics and the psb.visualize_trajectories
                # helpers before clearing the list.
                state = env.reset()