Example #1
def nature_cnn(sess, env, stochastic=False, virtual_bn=False):
    """
    Create a CNN policy for a game environment.
    """
    if not virtual_bn:
        return CNN(sess, gym_space_distribution(env.action_space),
                   gym_space_vectorizer(env.observation_space), stochastic)
    return NormalizedCNN(sess, gym_space_distribution(env.action_space),
                         gym_space_vectorizer(env.observation_space),
                         stochastic, env)
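
A hypothetical usage of nature_cnn, assuming an Atari-style Gym image environment and an open TensorFlow session (the environment ID is an assumption):

import gym
import tensorflow as tf

# Hypothetical usage sketch: build the CNN policy for a Gym image environment.
env = gym.make('PongNoFrameskip-v4')
with tf.Session() as sess:
    policy = nature_cnn(sess, env, stochastic=True)
    sess.run(tf.global_variables_initializer())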
Example #2
 def make_net(name):
     return MLPQNetwork(sess,
                        env.action_space.n,
                        gym_space_vectorizer(
                            env.observation_space),
                        name,
                        layer_sizes=[32])
Example #3
def main():
    """Run DQN until the environment throws an exception."""
    base_path = "results/rainbow/6/"
    env = make_env(stack=False, scale_rew=False, render=None, monitor=base_path + "train_monitor",
                   episodic_life=True)
    # I think the env itself allows Backtracking
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.8

    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n, gym_space_vectorizer(env.observation_space),
                                  min_val=-200, max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        saver = tf.train.Saver(name="rainbow")
        sess.run(tf.global_variables_initializer())
        saver.save(sess, base_path + "training", global_step=0)
        try:
            dqn.train(num_steps=2_000_000,  # Make sure an exception arrives before we stop.
                      player=player,
                      replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
                      optimize_op=optimize,
                      train_interval=1,
                      target_interval=8192,
                      batch_size=64,
                      min_buffer_size=20000,
                      handle_ep=handle_ep)
        except KeyboardInterrupt:
            print("keyboard interrupt")
        print("finishing")
        saver.save(sess, base_path + "final", global_step=2_000_000)
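
The handle_ep callback passed to dqn.train above is assumed to be defined elsewhere in the script. A minimal sketch of such a callback, following the two-argument (steps, reward) signature used in Examples #12 and #24, might look like this:

# Hypothetical episode callback: anyrl invokes it once per completed episode.
episode_rewards = []

def handle_ep(steps, rew):
    """Record and print the total reward of each finished episode."""
    episode_rewards.append(rew)
    print('episode %d: steps=%d reward=%f' % (len(episode_rewards), steps, rew))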
Example #4
def learn_cartpole():
    """Train an agent."""
    env = gym.make('CartPole-v0')
    try:
        agent = ActorCritic(gym_space_distribution(env.action_space),
                            gym_space_vectorizer(env.observation_space))
        with tf.Session() as sess:
            a2c = A2C(sess, agent, target_kl=0.03)
            roller = BasicRoller(env, agent, min_episodes=8, min_steps=1024)
            while True:
                with agent.frozen():
                    rollouts = roller.rollouts()
                print('mean=%f' % (mean_total_reward(rollouts), ))
                agent.actor.extend(
                    a2c.policy_update(rollouts,
                                      STEP_SIZE,
                                      NUM_STEPS,
                                      min_leaf=30))
                agent.critic.extend(
                    a2c.value_update(rollouts,
                                     VAL_STEP,
                                     NUM_STEPS,
                                     min_leaf=30))
    finally:
        env.close()
Example #5
def main():
    with tf.Session() as sess:
        print('Creating environment...')
        env = TFBatchedEnv(sess, Pong(), 1)
        env = BatchedFrameStack(env)

        print('Creating model...')
        model = CNN(sess,
                    gym_space_distribution(env.action_space),
                    gym_space_vectorizer(env.observation_space))

        print('Creating roller...')
        roller = TruncatedRoller(env, model, 1)

        print('Initializing variables...')
        sess.run(tf.global_variables_initializer())

        if os.path.exists('params.pkl'):
            print('Loading parameters...')
            with open('params.pkl', 'rb') as in_file:
                params = pickle.load(in_file)
            for var, val in zip(tf.trainable_variables(), params):
                sess.run(tf.assign(var, val))
        else:
            print('Warning: parameter file does not exist!')

        print('Running agent...')
        viewer = SimpleImageViewer()
        while True:
            for obs in roller.rollouts()[0].step_observations:
                viewer.imshow(obs[..., -3:])
Example #6
def main():
    """Run DQN until the environment throws an exception."""
    env = make(game='SonicTheHedgehog-Genesis', state='GreenHillZone.Act1')
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())
        dqn.train(
            num_steps=2000000,  # Make sure an exception arrives before we stop.
            player=player,
            replay_buffer=PrioritizedReplayBuffer(500000,
                                                  0.5,
                                                  0.4,
                                                  epsilon=0.1),
            optimize_op=optimize,
            train_interval=1,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=20000)
Example #7
def main():
    """Run DQN until the environment throws an exception."""
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())
        dqn.train(
            num_steps=2000000,  # Make sure an exception arrives before we stop.
            player=player,
            replay_buffer=StochasticMaxStochasticDeltaDeletionPRB(500000,
                                                                  0.5,
                                                                  0.4,
                                                                  epsilon=0.1),
            optimize_op=optimize,
            train_interval=1,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=20000)
Example #8
def main():
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-421,
                                  max_val=421))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())
        dqn.train(num_steps=2000000,
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000,
                                                        0.5,
                                                        0.4,
                                                        epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=64,
                  batch_size=32,
                  min_buffer_size=25000)
Example #9
def main():
    """Run DQN until the environment throws an exception."""
    env = make(game='SonicTheHedgehog-Genesis', state='GreenHillZone.Act1')
    env = AllowBacktracking(make_local_env(env, stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())
        dqn.train(num_steps=2000000,  # Make sure an exception arrives before we stop.
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=8192,
                  batch_size=32,
                  min_buffer_size=20000)

        print(tf.trainable_variables())
        save_path = '/home/noob/retro-noob/rainbow/params/params'
        utils.save_state(save_path + '_tf_saver')

        with tf.variable_scope('model'):
            params = tf.trainable_variables()

        ps = sess.run(params)
        joblib.dump(ps, save_path + '_joblib')
Example #10
def run_ppo():
    """
    Run a training worker.
    """
    env = gym.make('CartPole-v0')
    action_dist = gym_space_distribution(env.action_space)
    obs_vectorizer = gym_space_vectorizer(env.observation_space)

    with tf.Session() as sess:
        model = MLP(sess, action_dist, obs_vectorizer, layer_sizes=[32])

        # Deal with CartPole-v0 reward scale.
        model.scale_outputs(20)

        roller = BasicRoller(env, model, min_episodes=30)
        ppo = PPO(model)
        optimizer = MPIOptimizer(tf.train.AdamOptimizer(learning_rate=1e-3),
                                 -ppo.objective)

        sess.run(tf.global_variables_initializer())
        optimizer.sync_from_root(sess)
        for i in range(50):
            rollouts = roller.rollouts()
            # pylint: disable=E1101
            print('batch %d: rank=%d mean=%f' %
                  (i, MPI.COMM_WORLD.Get_rank(), mean_total_reward(rollouts)))
            mpi_ppo(ppo, optimizer, rollouts, log_fn=print)
Example #11
def simple_mlp(sess, env, stochastic=False):
    """
    Create a simple MLP policy for the environment.
    """
    return MLP(sess, gym_space_distribution(env.action_space),
               gym_space_vectorizer(env.observation_space), stochastic,
               (32, 32))
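
A hypothetical usage of simple_mlp on a small control task (the environment ID is an assumption):

import gym
import tensorflow as tf

# Hypothetical usage sketch: build a stochastic MLP policy for CartPole.
env = gym.make('CartPole-v0')
with tf.Session() as sess:
    policy = simple_mlp(sess, env, stochastic=True)
    sess.run(tf.global_variables_initializer())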
Example #12
def main():
    """
    Entry-point for the program.
    """
    env = gym.make('CartPole-v0')

    with tf.Session() as sess:
        make_net = lambda name: MLPQNetwork(sess,
                                            env.action_space.n,
                                            gym_space_vectorizer(
                                                env.observation_space),
                                            name,
                                            layer_sizes=[32])
        dqn = DQN(make_net('online'), make_net('target'))
        player = BasicPlayer(env,
                             EpsGreedyQNetwork(dqn.online_net, EPSILON),
                             batch_size=STEPS_PER_UPDATE)
        optimize = dqn.optimize(learning_rate=LEARNING_RATE)

        sess.run(tf.global_variables_initializer())

        dqn.train(num_steps=30000,
                  player=player,
                  replay_buffer=UniformReplayBuffer(BUFFER_SIZE),
                  optimize_op=optimize,
                  target_interval=200,
                  batch_size=64,
                  min_buffer_size=200,
                  handle_ep=lambda _, rew: print('got reward: ' + str(rew)))

    env.close()
Example #13
def create_model(args, sess):
    act_space = gym.spaces.MultiBinary(args.act_size)
    obs_space = gym.spaces.Box(low=0,
                               high=0xff,
                               shape=[args.obs_size] * 2 + [3],
                               dtype='uint8')
    return CNN(sess, gym_space_distribution(act_space),
               gym_space_vectorizer(obs_space))
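
A hypothetical call to create_model, assuming args carries act_size and obs_size fields (for example from argparse; the values below are placeholders):

import argparse
import tensorflow as tf

# Hypothetical argument values; the real script presumably parses these from the CLI.
args = argparse.Namespace(act_size=12, obs_size=84)
with tf.Session() as sess:
    model = create_model(args, sess)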
Example #14
def learn_setup(env_id=None,
                timesteps=int(5e6),
                env_name=None,
                param_scale=1,
                name="test",
                expnum=0,
                env=None,
                n_episodes=None,
                n_steps_per_episode=None,
                reward_threshold=0,
                CMA_mu=None,
                CMA_cmean=None,
                CMA_rankmu=None,
                CMA_rankone=None,
                log_file=None):

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    if env_id is None:
        env_id = env_name
    if env is None:
        env = make_vec_env(env_id,
                           "mujoco",
                           1,
                           None,
                           reward_scale=1.0,
                           flatten_dict_observations=True)

    if log_file is None:
        log_file = os.path.join(
            'results', "recent" + name + "_" + str(expnum) + ".monitor.csv")
        log_npy = os.path.join('results',
                               "recent" + name + '_' + str(expnum) + '.npy')
    #env = LoggedEnv(env, log_file, log_npy)

    model = ContinuousMLP(sess, env.action_space,
                          gym_space_vectorizer(env.observation_space))
    roller = BasicRoller(env,
                         model,
                         min_episodes=1,
                         min_steps=n_steps_per_episode)
    sess.run(tf.global_variables_initializer())
    trainer = CMATrainer(sess,
                         scale=param_scale,
                         CMA_mu=CMA_mu,
                         CMA_cmean=CMA_cmean,
                         CMA_rankmu=CMA_rankmu,
                         CMA_rankone=CMA_rankone)  #, popsize=n_episodes)
    rewards = []
    local_variables = {
        'roller': roller,
        'trainer': trainer,
        'env_id': env_name,
        'reward_threshold': reward_threshold,
        'rewards': rewards
    }
    return local_variables
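
A hypothetical driver loop built from the dictionary that learn_setup returns, reusing the trainer.train(roller) call pattern shown later in Example #30 (the environment name and iteration count are assumptions):

# Hypothetical driver: run a few CMA training iterations from the setup dict.
setup = learn_setup(env_name='Hopper-v2', n_steps_per_episode=1000)
roller, trainer = setup['roller'], setup['trainer']
for _ in range(100):
    _, sub_rewards = trainer.train(roller)
    setup['rewards'].extend(sub_rewards)
    print('mean reward so far: %f' % (sum(setup['rewards']) / len(setup['rewards'])))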
Example #15
def main():
    """Run DQN until the environment throws an exception."""
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        """
        Create a TF Op that optimizes the objective.
        Args:
          learning_rate: the Adam learning rate.
          epsilon: the Adam epsilon.
        """
        optimize = dqn.optimize(learning_rate=6.25e-5, epsilon=1.5e-4)

        sess.run(tf.global_variables_initializer())
        """
        Run an automated training loop.
        This is meant to provide a convenient way to run a
        standard training loop without any modifications.
        You may get more flexibility by writing your own
        training loop.
        Args:
          num_steps: the number of timesteps to run.
          player: the Player for gathering experience.
          replay_buffer: the ReplayBuffer for experience.
          optimize_op: a TF Op to optimize the model.
          train_interval: timesteps per training step.
          target_interval: number of timesteps between
            target network updates.
          batch_size: the size of experience mini-batches.
          min_buffer_size: minimum replay buffer size
            before training is performed.
          tf_schedules: a sequence of TFSchedules that are
            updated with the number of steps taken.
          handle_ep: called with information about every
            completed episode.
          timeout: if set, this is a number of seconds
            after which the training loop should exit.
        """
        dqn.train(
            num_steps=1000000,  # Make sure an exception arrives before we stop.
            player=player,
            replay_buffer=PrioritizedReplayBuffer(500000,
                                                  0.5,
                                                  0.4,
                                                  epsilon=0.1),
            optimize_op=optimize,
            train_interval=1,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=20000)
Example #16
 def build_network(self, sess, name):
     layer_sizes = [self.args['layer_1_size'], self.args['layer_2_size']]
     if self.args['has_third_layer']:
         layer_sizes.append(
             geom_mean(self.args['layer_2_size'], self.env.action_space.n))
     return MLPQNetwork(sess,
                        self.env.action_space.n,
                        gym_space_vectorizer(self.env.observation_space),
                        name,
                        layer_sizes=layer_sizes)
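
geom_mean is not shown in this snippet; it presumably returns the geometric mean of the two layer sizes so that the optional third layer tapers toward the action count. A minimal sketch under that assumption:

import math

def geom_mean(a, b):
    """Geometric mean of two layer sizes, rounded to an integer (assumed helper)."""
    return int(round(math.sqrt(a * b)))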
Example #17
def main():
    """Run DQN until the environment throws an exception."""
    # "results/rainbow/2/videos/6"
    save_dir = "results/rainbow/7/val_monitor/2"
    env = make_env(stack=False,
                   scale_rew=False,
                   render=60,
                   monitor=save_dir,
                   timelimit=False,
                   episodic_life=False,
                   single_life=True,
                   video=lambda id: True)
    # env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()

    with tf.Session(config=config) as sess:
        saver = tf.train.import_meta_graph(
            "results/rainbow/7/final-4000000.meta", clear_devices=True)
        # saver.restore(sess, tf.train.latest_checkpoint('results/rainbow/2'))
        saver.restore(sess, 'results/rainbow/7/final-4000000')
        model = LoadedNetwork(sess,
                              gym_space_vectorizer(env.observation_space))
        # rebuild the online_net from the saved model
        # type <anyrl.models.dqn_dist.NatureDistQNetwork object at ???>
        player = NStepPlayer(BatchedPlayer(env, model), 3)

        with tf.device("/cpu"):
            # sess.run(tf.global_variables_initializer())
            try:
                for episode_index in tqdm(range(40), unit="episode"):
                    axes = make_axes()
                    plotter = RewardPlotter(axes,
                                            save_period=40,
                                            render_period=600,
                                            max_entries=600)
                    for i in count():
                        trajectories = player.play()
                        end_of_episode = False
                        current_total_reward = None
                        for trajectory in trajectories:
                            current_total_reward = trajectory["total_reward"]
                            if trajectory["is_last"]:
                                end_of_episode = True
                        plotter.update(current_total_reward, step=i)
                        if end_of_episode:
                            # plt.show()
                            plotter.render()
                            plotter.save_file("{}/e{}.pdf".format(
                                save_dir, episode_index))
                            plotter.close()
                            break
            except KeyboardInterrupt:
                env.close()
                plt.close()
Example #18
 def __init__(self, *args, **kwargs):
     super(ACTest, self).__init__(*args, **kwargs)
     self.session = tf.Session()
     env = TupleCartPole()
     try:
         action_space = env.action_space
         observation_space = env.observation_space
     finally:
         env.close()
     self.action_dist = gym_space_distribution(action_space)
     self.obs_vectorizer = gym_space_vectorizer(observation_space)
Example #19
def main():
    """Run DQN until the environment throws an exception."""
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)

        # Epsilon decay schedule (needed below for tf_schedules); alternative
        # exploration players are kept commented out for reference.
        eps_decay_sched = LinearTFSchedule(50000, 1.0, 0.01)
        #player = NStepPlayer(BatchedPlayer(env, EpsGreedyQNetwork(dqn.online_net, 0.1)), 3)
        #player = NStepPlayer(BatchedPlayer(env, EpsGreedyQNetwork(dqn.online_net, TFScheduleValue(sess, eps_decay_sched))), 3)
        #player = NStepPlayer(BatchedPlayer(env, SonicEpsGreedyQNetwork(dqn.online_net, TFScheduleValue(sess, eps_decay_sched))), 3)

        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())

        reward_hist = []
        total_steps = 0

        def _handle_ep(steps, rew, env_rewards):
            nonlocal total_steps
            total_steps += steps
            reward_hist.append(rew)
            if total_steps % 10 == 0:
                print('%d episodes, %d steps: mean of last 100 episodes=%f' %
                      (len(reward_hist), total_steps,
                       sum(reward_hist[-100:]) / len(reward_hist[-100:])))

        dqn.train(
            num_steps=2000000,  # Make sure an exception arrives before we stop.
            player=player,
            replay_buffer=PrioritizedReplayBuffer(500000,
                                                  0.5,
                                                  0.4,
                                                  epsilon=0.1),
            optimize_op=optimize,
            train_interval=1,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=20000,
            tf_schedules=[eps_decay_sched],
            handle_ep=_handle_ep,
            restore_path='./pretrained_model',
            save_interval=None,
        )
Example #20
 def build_network(self, sess, name):
     return MLPDistQNetwork(sess,
                            self.env.action_space.n,
                            gym_space_vectorizer(
                                self.env.observation_space),
                            name,
                            51,
                            -10,
                            10,
                            layer_sizes=layer_sizes,
                            dueling=True,
                            dense=partial(noisy_net_dense,
                                          sigma0=self.args['sigma0']))
Example #21
def main():
    """Run DQN until the environment throws an exception."""
    env_name = 'MineRLNavigateDense-v0'
    base_env = [SimpleNavigateEnvWrapper(get_env(env_name)) for _ in range(1)]
    env = BatchedFrameStack(BatchedGymEnv([base_env]),
                            num_images=4,
                            concat=True)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        online, target = mine_rainbow_online_target(mine_cnn,
                                                    sess,
                                                    env.action_space.n,
                                                    gym_space_vectorizer(
                                                        env.observation_space),
                                                    min_val=-200,
                                                    max_val=200)
        dqn = DQN(online, target)
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())

        buffer_capacity = 5000

        replay_buffer = PrioritizedReplayBuffer(buffer_capacity,
                                                0.5,
                                                0.4,
                                                epsilon=0.1)

        expert_data = non_bugged_data_arr(env_name, num_trajs=100)
        expert_player = NStepPlayer(ImitationPlayer(expert_data, 200), 3)

        for traj in expert_player.play():
            replay_buffer.add_sample(traj, init_weight=1)

        print('starting training')
        dqn.train(num_steps=200,
                  player=player,
                  replay_buffer=replay_buffer,
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=8192,
                  batch_size=32,
                  min_buffer_size=20000)

        print('starting eval')
        player._cur_states = None
        score = evaluate(player)
        print(score)
Example #22
def run_ac_test(maker):
    """
    Run a test given a model constructor.
    """
    env = TupleCartPole()
    try:
        action_space = env.action_space
        observation_space = env.observation_space
    finally:
        env.close()
    action_dist = gym_space_distribution(action_space)
    obs_vectorizer = gym_space_vectorizer(observation_space)
    ModelTester(
        lambda sess: maker(sess, action_dist, obs_vectorizer)).test_all()
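
A hypothetical call to run_ac_test, passing a constructor for the MLP model used in the other examples:

# Hypothetical: exercise ModelTester against an MLP actor-critic model.
run_ac_test(lambda sess, dist, vec: MLP(sess, dist, vec, layer_sizes=[32]))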
Example #23
def main():
    """Run DQN until the environment throws an exception."""
    env_fns, env_names = create_envs()
    env = BatchedFrameStack(batched_gym_env(env_fns),
                            num_images=4,
                            concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)  # Use ADAM
        sess.run(tf.global_variables_initializer())

        reward_hist = []
        total_steps = 0

        def _handle_ep(steps, rew, env_rewards):
            nonlocal total_steps
            total_steps += steps
            reward_hist.append(rew)
            if total_steps % 1 == 0:
                print('%d episodes, %d steps: mean of last 100 episodes=%f' %
                      (len(reward_hist), total_steps,
                       sum(reward_hist[-100:]) / len(reward_hist[-100:])))

        dqn.train(
            num_steps=2000000000,  # Make sure an exception arrives before we stop.
            player=player,
            replay_buffer=PrioritizedReplayBuffer(500000,
                                                  0.5,
                                                  0.4,
                                                  epsilon=0.1),
            optimize_op=optimize,
            train_interval=1,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=20000,
            handle_ep=_handle_ep,
            num_envs=len(env_fns),
            save_interval=10,
        )
Example #24
def main():
    """
    Entry-point for the program.
    """
    args = _parse_args()
    env = batched_gym_env([partial(make_single_env, args.game)] * args.workers)

    # Using BatchedFrameStack with concat=False is more
    # memory efficient than other stacking options.
    env = BatchedFrameStack(env, num_images=4, concat=False)

    with tf.Session() as sess:
        make_net = lambda name: NatureQNetwork(
            sess, env.action_space.n, gym_space_vectorizer(env.observation_space), name,
            dueling=True)
        dqn = DQN(make_net('online'), make_net('target'))
        player = BatchedPlayer(env, EpsGreedyQNetwork(dqn.online_net, args.epsilon))
        optimize = dqn.optimize(learning_rate=args.lr)

        sess.run(tf.global_variables_initializer())

        reward_hist = []
        total_steps = 0
        def _handle_ep(steps, rew):
            nonlocal total_steps
            total_steps += steps
            reward_hist.append(rew)
            if len(reward_hist) == REWARD_HISTORY:
                print('%d steps: mean=%f' % (total_steps, sum(reward_hist) / len(reward_hist)))
                reward_hist.clear()

        dqn.train(num_steps=int(1e7),
                  player=player,
                  replay_buffer=UniformReplayBuffer(args.buffer_size),
                  optimize_op=optimize,
                  target_interval=args.target_interval,
                  batch_size=args.batch_size,
                  min_buffer_size=args.min_buffer_size,
                  handle_ep=_handle_ep)

    env.close()
Example #25
def main():
    """Run DQN until the environment throws an exception."""
    # "results/rainbow/2/videos/6"
    env = make_env(stack=False,
                   scale_rew=False,
                   render=20,
                   monitor=None,
                   timelimit=False)
    # env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    # TODO we might not want to allow backtracking, it kinda hurts in mario
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    config.gpu_options.per_process_gpu_memory_fraction = 0.6

    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        # TODO: rebuild the online_net from the saved model
        # (type <anyrl.models.dqn_dist.NatureDistQNetwork object at ???>)
        model = dqn.online_net
        player = NStepPlayer(BatchedPlayer(env, model), 3)

        with tf.device("/cpu"):
            # sess.run(tf.global_variables_initializer())

            global_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
            try:
                for _ in tqdm(range(100000)):
                    # Step the player to keep the rendered environment running;
                    # the trajectories themselves are discarded.
                    player.play()
            except KeyboardInterrupt:
                env.close()
Example #26
def learn_pong():
    """Train an agent."""
    env = batched_gym_env([make_single_env] * NUM_WORKERS)
    try:
        agent = ActorCritic(gym_space_distribution(env.action_space),
                            gym_space_vectorizer(env.observation_space))
        with tf.Session() as sess:
            a2c = A2C(sess, agent, target_kl=TARGET_KL)
            roller = TruncatedRoller(env, agent, HORIZON)
            total_steps = 0
            rewards = []
            print("Training... Don't expect progress for ~400K steps.")
            while True:
                with agent.frozen():
                    rollouts = roller.rollouts()
                for rollout in rollouts:
                    total_steps += rollout.num_steps
                    if not rollout.trunc_end:
                        rewards.append(rollout.total_reward)
                agent.actor.extend(
                    a2c.policy_update(rollouts,
                                      POLICY_STEP,
                                      NUM_STEPS,
                                      min_leaf=MIN_LEAF,
                                      feature_frac=FEATURE_FRAC))
                agent.critic.extend(
                    a2c.value_update(rollouts,
                                     VALUE_STEP,
                                     NUM_STEPS,
                                     min_leaf=MIN_LEAF,
                                     feature_frac=FEATURE_FRAC))
                if rewards:
                    print(
                        '%d steps: mean=%f' %
                        (total_steps, sum(rewards[-10:]) / len(rewards[-10:])))
                else:
                    print('%d steps: no episodes complete yet' % total_steps)
    finally:
        env.close()
Example #27
def main():
    """Run DQN until the environment throws an exception."""
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())

        saver = tf.train.Saver()
        saver.restore(sess, "/root/compo/model.ckpt")
        #print('model restored')
        replay_buffer = pickle.load(
            gzip.open('/root/compo/replay_buffer.p.gz', 'rb'))
        replay_buffer.alpha = 0.2
        replay_buffer.beta = 0.4
        replay_buffer.capacity = 100000

        restore_ppo2_weights(sess)

        dqn.train(
            num_steps=2000000,  # Make sure an exception arrives before we stop.
            player=player,
            replay_buffer=replay_buffer,  # instead of PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1)
            optimize_op=optimize,
            train_interval=4,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=20000)
Example #28
def main():
    with tf.Session() as sess:
        print('Creating environment...')
        env = TFBatchedEnv(sess, Pong(), 8)
        env = BatchedFrameStack(env)

        print('Creating model...')
        model = CNN(sess, gym_space_distribution(env.action_space),
                    gym_space_vectorizer(env.observation_space))

        print('Creating roller...')
        roller = TruncatedRoller(env, model, 128)

        print('Creating PPO graph...')
        ppo = PPO(model)
        optimize = ppo.optimize(learning_rate=3e-4)

        print('Initializing variables...')
        sess.run(tf.global_variables_initializer())

        print('Training agent...')
        for i in count():
            rollouts = roller.rollouts()
            for rollout in rollouts:
                if not rollout.trunc_end:
                    print('reward=%f steps=%d' %
                          (rollout.total_reward, rollout.total_steps))
            total_steps = sum(r.num_steps for r in rollouts)
            ppo.run_optimize(optimize,
                             rollouts,
                             batch_size=total_steps // 4,
                             num_iter=12,
                             log_fn=print)
            if i % 5 == 0:
                print('Saving...')
                parameters = sess.run(tf.trainable_variables())
                with open('params.pkl', 'wb+') as out_file:
                    pickle.dump(parameters, out_file)
Example #29
def run_algorithm(algo_name):
    """
    Run the specified training algorithm.
    """
    env = gym.make('CartPole-v0')
    action_dist = gym_space_distribution(env.action_space)
    obs_vectorizer = gym_space_vectorizer(env.observation_space)

    with tf.Session() as sess:
        model = MLP(sess, action_dist, obs_vectorizer, layer_sizes=[32])

        # Deal with CartPole-v0 reward scale.
        model.scale_outputs(20)

        roller = BasicRoller(env, model, min_episodes=30)
        inner_loop = algorithm_inner_loop(algo_name, model)

        sess.run(tf.global_variables_initializer())
        print('running algorithm:', algo_name)
        for i in range(50):
            rollouts = roller.rollouts()
            print('batch %d: mean=%f' % (i, mean_total_reward(rollouts)))
            inner_loop(rollouts)
Example #30
def training_loop(env_id=None,
                  timesteps=int(5e6),
                  param_scale=1,
                  log_file=None):
    """
    Run CMA on the environment.
    """
    if log_file is None:
        log_file = os.path.join('results', env_id + '.monitor.csv')
    env = LoggedEnv(gym.make(env_id), log_file)
    with tf.Session() as sess:
        model = ContinuousMLP(sess, env.action_space,
                              gym_space_vectorizer(env.observation_space))
        roller = BasicRoller(env, model, min_episodes=4, min_steps=500)
        sess.run(tf.global_variables_initializer())
        trainer = CMATrainer(sess, scale=param_scale)
        steps = 0
        rewards = []
        while steps < timesteps:
            sub_steps, sub_rewards = trainer.train(roller)
            steps += sub_steps
            rewards.extend(sub_rewards)
            print('%s: steps=%d mean=%f batch_mean=%f' %
                  (env_id, steps, np.mean(rewards), np.mean(sub_rewards)))
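
A hypothetical entry point for training_loop on a continuous-control task (the environment ID is an assumption):

if __name__ == '__main__':
    # Hypothetical invocation: run CMA for one million timesteps on a MuJoCo task.
    training_loop(env_id='Hopper-v2', timesteps=int(1e6))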