Example #1
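A worker-style initializer from what appears to be an ARS/evolution-strategies codebase: it builds a Gym environment, records observation and action dimensions, constructs a policy through get_policy, and attaches a SharedNoiseTable of pre-sampled perturbation noise. The enclosing class and the imports (gym, get_policy, SharedNoiseTable) are defined elsewhere in the original module.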
    def __init__(self, env_seed,
                 env_name='',
                 shift=0,
                 policy='FC',
                 h_dim=64,
                 layers=2,
                 deltas=None,
                 rollout_length=1000,
                 delta_std=0.02,
                 num_evals=0,
                 ob_filter='NoFilter'):
        
        self.params = {}
        self.env_name = env_name
        self.params['env_name'] = env_name
        self.env = gym.make(env_name)
        self.params['ob_dim'] = self.env.observation_space.shape[0]
        self.params['ac_dim'] = self.env.action_space.shape[0]
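        # Note: the environment (and params['seed'] below) is seeded with a fixed 0 rather
        # than env_seed; env_seed is only used to offset the shared noise stream further down.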
        self.env.seed(0)

        self.params['h_dim'] = h_dim
        self.steps = rollout_length
                
        self.params['zeros'] = True
        self.params['seed'] = 0
        self.params['layers'] = layers
        self.shift = shift
        self.sigma = 1
        self.num_evals = num_evals
        self.params['ob_filter'] = ob_filter
        self.policy = get_policy(self.params)

        self.deltas = SharedNoiseTable(deltas, env_seed + 7)
        self.delta_std = delta_std
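
For readers unfamiliar with the shared-noise-table trick used above, here is a minimal, self-contained sketch of the idea (not the project's SharedNoiseTable implementation): every worker reads slices out of one large pre-sampled Gaussian buffer, so perturbations can be communicated as (index, length) pairs instead of full vectors.

import numpy as np

class TinyNoiseTable:
    """Toy stand-in for a shared noise table."""

    def __init__(self, size=1_000_000, table_seed=7, stream_seed=0):
        # One big buffer of Gaussian noise, shared (read-only) by all workers.
        self.noise = np.random.RandomState(table_seed).randn(size).astype(np.float32)
        # Each worker gets its own stream for choosing offsets.
        self.rng = np.random.RandomState(stream_seed)

    def sample_index(self, dim):
        return self.rng.randint(0, len(self.noise) - dim + 1)

    def get(self, index, dim):
        return self.noise[index:index + dim]

table = TinyNoiseTable(stream_seed=42 + 7)   # mirrors SharedNoiseTable(deltas, env_seed + 7)
idx = table.sample_index(dim=4)
delta = 0.02 * table.get(idx, 4)             # delta_std * noise slice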
Example #2
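An initializer for what looks like a population-based trainer: it builds num_agents policies with offset seeds, keeps per-agent bookkeeping (reward history, embeddings, best scores, and adam_params), and finally spawns num_workers workers through init_workers. get_policy and the worker machinery come from the surrounding module.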
    def __init__(self, params):

        params['zeros'] = False
        self.agents = {
            i: get_policy(params, params['seed'] + 1000 * i)
            for i in range(params['num_agents'])
        }

        self.timesteps = 0

        self.w_reward = 1
        self.w_size = 0
        self.dists = 0

        self.adam_params = {i: [0, 0] for i in range(params['num_agents'])}

        self.buffer = []
        self.states = []
        self.embeddings = {i: [] for i in range(params['num_agents'])}
        self.best = {i: -9999 for i in range(params['num_agents'])}
        self.reward = {i: [-9999] for i in range(params['num_agents'])}
        self.min_dist = 0

        self.num_workers = params['num_workers']
        self.init_workers(params)
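
The [0, 0] pairs in adam_params are presumably the first- and second-moment accumulators of an Adam-style update; the actual optimizer code is not shown, so the following is only an illustrative, self-contained sketch of how such a pair is typically used.

import numpy as np

def adam_step(grad, state, t, lr=1e-2, beta1=0.9, beta2=0.999, eps=1e-8):
    """One Adam update given a gradient and an [m, v] moment pair."""
    m, v = state
    m = beta1 * m + (1 - beta1) * grad
    v = beta2 * v + (1 - beta2) * grad ** 2
    m_hat = m / (1 - beta1 ** t)        # bias correction
    v_hat = v / (1 - beta2 ** t)
    update = lr * m_hat / (np.sqrt(v_hat) + eps)
    return update, [m, v]

update, state = adam_step(np.array([0.5, -0.2]), state=[0, 0], t=1)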
Example #3
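A complete training entry point: for each of n_trials trials it builds a policy, wraps it in one of several training modules (ES, PPO, or ES/PPO hybrids), optionally just renders a previously saved policy, otherwise trains until max_iteration or until the reward goal is hit for several consecutive evaluations, and finally averages and plots the reward curves. The snippet assumes the usual imports (os, time, random, pickle, logging, functools.partial, numpy as np, torch, matplotlib.pyplot as plt, gym's logger as gym_logger) plus the project-local options, policies, envs modules and the *Module training classes.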
def main():
    args = options.parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    gym_logger.setLevel(logging.CRITICAL)
    env_func = partial(get_env, args=args)
    env = get_env(args)
    reward_goal = get_goal(args)
    consecutive_goal_max = 10
    max_iteration = args.epoch
    all_rewards = []
    all_times = []
    all_totals = []
    for trial in range(args.n_trials):
        policy = policies.get_policy(args, env)
        if args.alg == 'ES':
            run_func = partial(envs.run_env_ES,
                               policy=policy,
                               env_func=env_func)
            alg = ESModule(
                policy,
                run_func,
                population_size=args.population_size,  # HYPERPARAMETER
                sigma=args.sigma,  # HYPERPARAMETER
                learning_rate=args.lr,  # HYPERPARAMETER TODO:CHANGE
                threadcount=args.population_size)

        elif args.alg == 'PPO':
            run_func = partial(envs.run_env_PPO,
                               env_func=env_func)  # TODO: update
            alg = PPOModule(
                policy,
                run_func,
                n_updates=args.n_updates,  # HYPERPARAMETER
                batch_size=args.batch_size,  # HYPERPARAMETER
                max_steps=args.max_steps,
                gamma=args.gamma,
                clip=args.clip,
                ent_coeff=args.ent_coeff,
                learning_rate=args.lr)  # TODO: CHANGE

        elif args.alg == 'ESPPO':
            run_func = partial(envs.run_env_PPO, env_func=env_func)

            alg = ESPPOModule(
                policy,
                run_func,
                population_size=args.population_size,  # HYPERPARAMETER
                sigma=args.sigma,  # HYPERPARAMETER
                n_updates=args.n_updates,  # HYPERPARAMETER
                batch_size=args.batch_size,  # HYPERPARAMETER
                max_steps=args.max_steps,
                gamma=args.gamma,
                clip=args.clip,
                ent_coeff=args.ent_coeff,
                n_seq=args.n_seq,
                ppo_learning_rate=args.ppo_lr,
                es_learning_rate=args.es_lr,
                threadcount=args.population_size)

        elif args.alg == 'MAXPPO':
            run_func = partial(envs.run_env_PPO, env_func=env_func)

            alg = MaxPPOModule(
                policy,
                run_func,
                population_size=args.population_size,  # HYPERPARAMETER
                sigma=args.sigma,  # HYPERPARAMETER
                n_updates=args.n_updates,  # HYPERPARAMETER
                batch_size=args.batch_size,  # HYPERPARAMETER
                max_steps=args.max_steps,
                gamma=args.gamma,
                clip=args.clip,
                ent_coeff=args.ent_coeff,
                n_seq=args.n_seq,
                ppo_learning_rate=args.ppo_lr,
                threadcount=args.population_size)

        elif args.alg == 'ALTPPO':
            run_func = partial(envs.run_env_PPO, env_func=env_func)

            alg = AltPPOModule(
                policy,
                run_func,
                population_size=args.population_size,  # HYPERPARAMETER
                sigma=args.sigma,  # HYPERPARAMETER
                n_updates=args.n_updates,  # HYPERPARAMETER
                batch_size=args.batch_size,  # HYPERPARAMETER
                max_steps=args.max_steps,
                gamma=args.gamma,
                clip=args.clip,
                ent_coeff=args.ent_coeff,
                n_alt=args.n_alt,
                es_learning_rate=args.es_lr,
                ppo_learning_rate=args.ppo_lr,
                threadcount=args.population_size)

        if args.render:
            with open(os.path.join(args.directory, 'weights.pkl'), 'rb') as fp:
                weights = pickle.load(fp)
                policy.load_state_dict(weights)

            if args.alg == 'ES':
                total_reward = run_func(weights, stochastic=False, render=True)
            else:
                total_reward = run_func(policy,
                                        stochastic=False,
                                        render=True,
                                        reward_only=True)
            print(f"Total rewards from episode: {total_rewards}")
            return

        exp_dir = os.path.join(args.directory, alg.model_name)
        if not os.path.exists(exp_dir):
            os.makedirs(exp_dir)

        start = time.time()
        consecutive_goal_count = 0
        iteration = 0
        rewards = []
        while True:
            if iteration >= max_iteration:
                break
            weights = alg.step()
            if (iteration + 1) % 10 == 0:
                if args.alg == 'ES':
                    test_reward = run_func(weights,
                                           stochastic=False,
                                           render=False)
                else:
                    test_reward = run_func(policy,
                                           stochastic=False,
                                           render=False,
                                           reward_only=True)
                rewards.append(test_reward)
                print('iter %d. reward: %f' % (iteration + 1, test_reward))

                if consecutive_goal_max and reward_goal:
                    consecutive_goal_count = consecutive_goal_count + 1 if test_reward >= reward_goal else 0
                    if consecutive_goal_count >= consecutive_goal_max:
                        break
            iteration += 1
        end = time.time() - start
        if args.alg == 'ES':
            total_reward = run_func(weights, stochastic=False, render=False)
        else:
            total_reward = run_func(policy,
                                    stochastic=False,
                                    render=False,
                                    reward_only=True)
        all_rewards.append(rewards)
        all_times.append(end)
        all_totals.append(total_reward)
        print(f"Reward from final weights: {total_reward}")
        print(f"Time to completion: {end}")
    max_len = max((len(rewards) for rewards in all_rewards), default=0)
    for rewards in all_rewards:
        while len(rewards) < max_len:
            rewards.append(reward_goal)
    all_rewards = np.array(all_rewards)
    rewards_mean = np.mean(all_rewards, axis=0)
    rewards_std = np.std(all_rewards, axis=0)
    total_mean = np.mean(all_totals)
    time_mean = np.mean(all_times)
    plt.errorbar(np.arange(max_len),
                 rewards_mean,
                 yerr=rewards_std,
                 label='rewards')
    plt.legend(loc=4)
    plt.grid(True)
    plt.tight_layout()
    path = os.path.join(exp_dir, "rewards_plot.png")
    plt.savefig(path)
    plt.close()
    np.savetxt(os.path.join(exp_dir, 'rewards.txt'), rewards_mean)
    with open(os.path.join(exp_dir, 'weights.pkl'), 'wb') as fp:
        pickle.dump(weights, fp)
    print(f"Average rewards from final weights: {total_mean}")
    print(f"Average time to completion: {time_mean}")
    print(f"Results saved at: {exp_dir}")
    msg = (f"Average rewards from final weights: {total_mean}\n"
           f"Average time to completion: {time_mean}\n")
    with open(os.path.join(exp_dir, 'results.txt'), 'w') as out_file:
        out_file.write(msg)
Example #4
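A fragment from the middle of a setup routine (the start of the function, including the condition guarding the Visdom import, is not part of the snippet): it connects to a Visdom server, builds a lazily evaluated environment template, spawns distributed environments, instantiates a policy and a training algorithm, allocates an Experience buffer from the environment metadata, trains, and then drops into an IPython shell for inspection.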
        from visdom import Visdom
        viz = Visdom()

    # Build Environment Template -> Lazy Evaluated Callable, for spawning environments
    env_template = build_env(args.env)

    # Build Distributed Environments
    envs = get_distributed_backend(env_template,
                                   args.num_processes,
                                   backend=args.distributed_backend)

    # Obtain Environment metadata
    metadata = envs.get_metadata()

    # Instantiate Policy
    policy = get_policy(args.policy, metadata)

    # Create agent, with the given training algorithm
    agent = get_algorithm(args.algorithm, policy, envs, args, visdom=viz)

    # Create Experience Buffer, with the environment metadata
    experience = Experience(metadata['max_episode_length'], args.num_processes,
                            metadata['obs_shape'], metadata['action_type'],
                            metadata['action_shape'])

    # Train agent
    agent.train(num_frames=args.num_frames)

    import IPython
    IPython.embed()
Example #5
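An MPI-aware PPO2 training script for MsPacman-v0 built on OpenAI Baselines-style utilities: it derives a per-rank seed, enables TensorFlow GPU memory growth, builds a (temporarily single-process, single-frame) vectorized environment, and calls ppo2.learn with hard-coded hyperparameters. The many commented-out blocks and the line-by-line debug prints record configurations the author was still experimenting with.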
def main():
    print("line 1...")
    args = setup_utils.setup_and_load()
    print("passed line 1...")
    comm = MPI.COMM_WORLD
    print("passed line 2...")
    rank = comm.Get_rank()  #the rank of the process in a communicator
    print("passed line 3...")
    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)
    print("passed line 4,5...")
    # utils.setup_mpi_gpus()
    print("passed line 6...")
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    # nenvs = Config.NUM_ENVS
    nenvs = 1  #set to 1 temporarily
    # frame_stack_size = Config.FRAME_STACK_SIZE
    frame_stack_size = 1
    total_timesteps = int(5e6)
    save_interval = args.save_interval

    env_id = "MsPacman-v0"

    #copy from https://github.com/openai/baselines/blob/52255beda5f5c8760b0ae1f676aa656bb1a61f80/baselines/run.py#L33
    _game_envs = defaultdict(set)
    for env in gym.envs.registry.all():
        # TODO: solve this with regexes
        env_type = env._entry_point.split(':')[0].split('.')[-1]
        _game_envs[env_type].add(env.id)
    # resolve the env_type of env_id (as baselines' run.py does next); otherwise
    # env_type is left at whatever the last registry entry happened to set it to
    for game_type, ids in _game_envs.items():
        if env_id in ids:
            env_type = game_type
            break

    # env = make_vec_env(env_id, env_type, nenvs, seed, gamestate=args.gamestate, reward_scale=args.reward_scale)
    """
    save_interval = args.save_interval
    
    env = utils.make_general_env(nenvs, seed=rank)

    with tf.Session(config=config):
        env = wrappers.add_final_wrappers(env)
        
        policy = policies.get_policy()
    """
    env = utils.make_general_env(env_id, env_type, nenvs, seed)
    # env = make_vec_env(env_id, env_type, nenvs, seed)
    # env = VecFrameStack(env, frame_stack_size)

    # env = utils.make_general_env(nenvs, seed=rank)

    with tf.Session(config=config):
        # env = wrappers.add_final_wrappers(env) #don't use wrappers anymore
        env = wrappers.add_final_wrappers(env)
        policy = policies.get_policy()

        # ppo2.learn(policy=policy,
        #             env=env,
        #             save_interval=save_interval,
        #             nsteps=Config.NUM_STEPS,
        #             nminibatches=Config.NUM_MINIBATCHES,
        #             lam=0.95,
        #             gamma=Config.GAMMA,
        #             noptepochs=Config.PPO_EPOCHS,
        #             log_interval=1,
        #             ent_coef=Config.ENTROPY_COEFF,
        #             lr=lambda f : f * Config.LEARNING_RATE,
        #             cliprange=lambda f : f * 0.2,
        #             total_timesteps=total_timesteps)
        ppo2.learn(policy=policy,
                   env=env,
                   save_interval=save_interval,
                   nsteps=int(1000),
                   nminibatches=100,
                   lam=0.95,
                   gamma=0.9,
                   noptepochs=16,
                   log_interval=1,
                   ent_coef=0.1,
                   lr=lambda f: f * 3e-4,
                   cliprange=lambda f: f * 0.2,
                   total_timesteps=total_timesteps)
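
Since the script talks to MPI.COMM_WORLD, it is presumably launched under MPI. Assuming the file ends with the usual entry-point guard (not shown above), an invocation could look like the following; the script filename is hypothetical.

if __name__ == '__main__':
    main()

# launched as, for example:
#   mpirun -np 2 python train_mspacman.py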