Example #1
def setup_logger(logdir, locals_):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)
    # Log experimental parameters
    args = inspect.getfullargspec(train_PG)[0]
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)
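
setup_logger is meant to be called at the top of train_PG with that function's locals(), so the saved parameters line up with train_PG's own argument names. A minimal usage sketch follows; the train_PG signature shown here is only illustrative, not the actual one:

def train_PG(exp_name, env_name, n_iter, gamma, seed, logdir):
    # Snapshot the hyperparameters before any training state is created.
    setup_logger(logdir, locals())
    # ... build the agent and run the training loop ...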
Example #2
def AC_train(exp_name, env_name, n_iter, gamma, min_timesteps_per_batch,
             max_path_length, learning_rate, num_target_updates,
             num_grad_steps_per_target_update, animate, logdir,
             normalize_advantages, seed, n_layers, size):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)
    # Log experimental parameters
    # args = inspect.getargspec(PG_train)[0]
    # params = {k: locals()[k] if k in locals() else None for k in args}
    params = locals()
    print(params)
    logz.save_params(params)

    # Make the gym environment
    env = gym.make(env_name)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # Is the action space of this env continuous or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # Initialize the Actor-Critic agent
    network_args = {
        'n_layers': n_layers,
        'size': size,
        'learning_rate': learning_rate,
        'num_target_updates': num_target_updates,
        'num_grad_steps_per_target_update': num_grad_steps_per_target_update
    }
    env_args = {
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'discrete': discrete,
    }
    sample_traj_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
    }
    estimate_return_args = {
        'gamma': gamma,
        'normalize_advantages': normalize_advantages,
    }

    # Agent
    agent = ACAgent(network_args, env_args, sample_traj_args,
                    estimate_return_args)

    agent.build_computation_graph()
    agent.init_tf_sess()

    # start training
    total_timesteps = 0
    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)
        paths, timesteps_this_batch = agent.sample_trajs(itr, env)
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        re_n = np.concatenate([path["reward"] for path in paths])
        next_ob_no = np.concatenate(
            [path["next_observation"] for path in paths])
        terminal_n = np.concatenate([path["terminal"] for path in paths])

        agent.update_critic(ob_no, next_ob_no, re_n, terminal_n)
        adv_n = agent.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)
        agent.update_actor(ob_no, ac_na, adv_n)

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [len(path["reward"]) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
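
ACAgent's internals are not part of this example, but the calls above (update_critic, estimate_advantage, update_actor) follow the standard bootstrapped actor-critic pattern. As a rough sketch of what estimate_advantage typically computes in this setup (an assumption, not the actual implementation), the one-step advantage from critic predictions v_s and v_s_next would be:

import numpy as np

def estimate_advantage_sketch(v_s, v_s_next, re_n, terminal_n, gamma,
                              normalize_advantages=True):
    # One-step TD advantage: A(s, a) = r + gamma * V(s') * (1 - done) - V(s),
    # where v_s / v_s_next are the critic's predictions for ob_no / next_ob_no.
    adv_n = re_n + gamma * v_s_next * (1.0 - terminal_n) - v_s
    if normalize_advantages:
        # Standardize to zero mean and unit variance, matching the
        # normalize_advantages flag passed to the agent above.
        adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)
    return adv_n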
Example #3
    parser.add_argument('--tau', type=float, default=0.005)
    parser.add_argument('--layer_norm', default=True,
                        type=lambda s: str(s).lower() in ('true', '1'))
    # (plain type=bool would treat any non-empty string, including 'False', as True)
    parser.add_argument('--batch_size', type=int, default=100)
    parser.add_argument('--buffer_size', type=int, default=1000000)
    parser.add_argument('--pop_size', type=int, default=16)
    parser.add_argument('--elite_size', type=int, default=16)
    parser.add_argument('--max_ep_len', type=int, default=1000)

    parser.add_argument('--alpha', type=float, default=0.5)
    parser.add_argument('--beta', type=float, default=2.)
    parser.add_argument('--sigma', type=float, default=0.1)
    parser.add_argument('--k', type=float, default=10)

    parser.add_argument('--epochs', type=int, default=1000)
    parser.add_argument('--save_freq', type=int, default=5)
    parser.add_argument('--start_epoch', type=int, default=1)
    parser.add_argument('--rl_train_steps', type=int, default=1000)
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--dir_path', type=str, default='results_v3/')

    args = parser.parse_args()

    output_path = args.dir_path
    for seed in range(1, 11):
        args.seed = seed
        args.dir_path = get_output_folder(output_path, args.env, args.seed)
        logz.configure_output_dir(args.dir_path)
        logz.save_params(vars(args))
        gesrl = GESRL()
        gesrl.train()
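
Note that output_path keeps the original base directory, since args.dir_path is overwritten on every seed iteration. The helper get_output_folder is not shown in this snippet; a plausible minimal version (the directory layout here is an assumption) simply builds a per-environment, per-seed output directory under dir_path:

import os

def get_output_folder(parent_dir, env_name, seed):
    # Assumed layout: <parent_dir>/<env_name>/seed_<seed>/, created if missing.
    path = os.path.join(parent_dir, env_name, 'seed_{}'.format(seed))
    os.makedirs(path, exist_ok=True)
    return path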
Example #4
def TD3_train(env,
              logdir='.',
              actor_critic=actor_critic,
              iterations=600000,
              replay_size=int(1e6),
              gamma=0.99,
              polyak=0.995,
              actor_lr=1e-3,
              critic_lr=1e-3,
              batch_size=100,
              start_steps=10000,
              act_noise=0.1,
              target_noise=0.2,
              noise_clip=0.5,
              policy_delay=4):

    # Configure output directory for logging
    logz.configure_output_dir(logdir)
    # Log experimental parameters
    # args = inspect.getargspec(PG_train)[0]
    # params = {k: locals()[k] if k in locals() else None for k in args}
    params = locals()
    print(params)
    logz.save_params(params)

    td3 = TD3Agent(env, actor_critic, gamma, polyak, actor_lr, critic_lr,
                   act_noise)

    td3.build_computation_graph()
    td3.init_tf_sess()
    td3.graph_initialization()

    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0]

    replay_buffer = ReplayBuffer(ob_dim, ac_dim, replay_size)

    start_time = time.time()
    ob = env.reset()
    ac, rew, done = 0, 0, 0
    actor_loss = []
    critic_loss = []

    for ii in range(iterations):

        if ii < start_steps:
            ac = env.action_space.sample()
        else:
            ac = td3.sample_action(ob)

        ob_next, rew, done, _ = env.step(ac)

        replay_buffer.store(ob, ac, rew, ob_next, done)

        # Advance to the next observation; start a new episode when done
        ob = env.reset() if done else ob_next

        # Before start_steps, only collect transitions into the buffer; no updates yet
        if ii < start_steps:
            continue

        batch = replay_buffer.sample_batch(batch_size=batch_size)

        # update critic
        c_loss = td3.update_critic(batch['obs1'], batch['obs2'], batch['acts'],
                                   batch['rews'], batch['done'])
        critic_loss.append(c_loss)

        if ii % policy_delay == 0:  # Delayed actor update and target update

            # update actor and target networks
            a_loss = td3.update_actor_and_target(batch['obs1'], batch['obs2'],
                                                 batch['acts'], batch['rews'],
                                                 batch['done'])
            actor_loss.append(a_loss)

        if ii % 10000 == 0:
            logz.log_tabular("Time", time.time() - start_time)
            logz.log_tabular("Iteration", ii)
            logz.log_tabular("AverageActorLoss", np.mean(np.array(actor_loss)))
            logz.log_tabular("AverageCriticLoss",
                             np.mean(np.array(critic_loss)))
            logz.log_tabular("AverageActorStd", np.std(np.array(actor_loss)))
            logz.log_tabular("AverageCriticStd", np.std(np.array(critic_loss)))
            logz.dump_tabular()
            logz.pickle_tf_vars()
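
TD3Agent's update rules are not shown here, but the target_noise, noise_clip, and policy_delay arguments correspond to the three standard TD3 ingredients: target policy smoothing, clipped double-Q learning, and delayed policy updates. Below is a NumPy sketch of the critic's bootstrap target; the target networks pi_targ, q1_targ, q2_targ and the action bound act_limit are assumptions for illustration, not part of the snippet:

import numpy as np

def td3_critic_target_sketch(rew, done, obs2, pi_targ, q1_targ, q2_targ,
                             gamma=0.99, target_noise=0.2, noise_clip=0.5,
                             act_limit=1.0):
    # Target policy smoothing: perturb the target action with clipped noise.
    a2 = pi_targ(obs2)
    noise = np.clip(target_noise * np.random.randn(*a2.shape),
                    -noise_clip, noise_clip)
    a2 = np.clip(a2 + noise, -act_limit, act_limit)
    # Clipped double-Q: bootstrap from the smaller of the two target critics.
    q_targ = np.minimum(q1_targ(obs2, a2), q2_targ(obs2, a2))
    # Bellman backup; (1 - done) stops bootstrapping at terminal states.
    return rew + gamma * (1.0 - done) * q_targ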