def __init__(self, env_seed, env_name='', shift=0, policy='FC', h_dim=64,
             layers=2, deltas=None, rollout_length=1000, delta_std=0.02,
             num_evals=0, ob_filter='NoFilter'):
    # Build the policy-construction parameters from the environment's spaces.
    self.params = {}
    self.env_name = env_name
    self.params['env_name'] = env_name
    self.env = gym.make(env_name)
    self.params['ob_dim'] = self.env.observation_space.shape[0]
    self.params['ac_dim'] = self.env.action_space.shape[0]
    self.env.seed(0)
    self.params['h_dim'] = h_dim
    self.steps = rollout_length
    self.params['zeros'] = True
    self.params['seed'] = 0
    self.params['layers'] = layers
    self.shift = shift
    self.sigma = 1
    self.num_evals = num_evals
    self.params['ob_filter'] = ob_filter
    self.policy = get_policy(self.params)
    # Shared noise table for sampling perturbation directions; offset the seed
    # so different workers draw different noise streams.
    self.deltas = SharedNoiseTable(deltas, env_seed + 7)
    self.delta_std = delta_std
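# Hedged standalone sketch (not from this repo): it only reproduces how the
# constructor above derives 'ob_dim' / 'ac_dim' from a Gym environment's spaces.
# It assumes a continuous-control task with Box observation and action spaces;
# the env id 'Pendulum-v1' is purely illustrative.
import gym

_env = gym.make('Pendulum-v1')
_example_params = {
    'env_name': 'Pendulum-v1',
    'ob_dim': _env.observation_space.shape[0],
    'ac_dim': _env.action_space.shape[0],
    'h_dim': 64,
    'layers': 2,
    'zeros': True,
    'seed': 0,
    'ob_filter': 'NoFilter',
}
print(_example_params)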
def __init__(self, params):
    params['zeros'] = False
    # One policy per agent, each seeded differently so the population diverges.
    self.agents = {
        i: get_policy(params, params['seed'] + 1000 * i)
        for i in range(params['num_agents'])
    }
    self.timesteps = 0
    self.w_reward = 1
    self.w_size = 0
    self.dists = 0
    # Per-agent Adam state (first and second moment accumulators).
    self.adam_params = {i: [0, 0] for i in range(params['num_agents'])}
    self.buffer = []
    self.states = []
    self.embeddings = {i: [] for i in range(params['num_agents'])}
    # Track the best reward and reward history seen by each agent.
    self.best = {i: -9999 for i in range(params['num_agents'])}
    self.reward = {i: [-9999] for i in range(params['num_agents'])}
    self.min_dist = 0
    self.num_workers = params['num_workers']
    self.init_workers(params)
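# Hedged sketch, for illustration only: the keys this constructor reads directly
# from `params` are 'seed', 'num_agents', and 'num_workers'; everything else is
# passed through to get_policy(). The values below are made up, and the class
# name `Learner` is a placeholder for whatever class owns this __init__.
example_params = {
    'seed': 0,
    'num_agents': 5,
    'num_workers': 4,
    # ...plus the policy-construction keys consumed by get_policy(),
    # e.g. 'env_name', 'ob_dim', 'ac_dim', 'h_dim', 'layers', 'ob_filter'.
}
# learner = Learner(example_params)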
def main():
    args = options.parse_args()

    # Seed all RNGs for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    gym_logger.setLevel(logging.CRITICAL)

    env_func = partial(get_env, args=args)
    env = get_env(args)
    reward_goal = get_goal(args)
    consecutive_goal_max = 10
    max_iteration = args.epoch

    all_rewards = []
    all_times = []
    all_totals = []

    for trial in range(args.n_trials):
        policy = policies.get_policy(args, env)

        if args.alg == 'ES':
            run_func = partial(envs.run_env_ES, policy=policy, env_func=env_func)
            alg = ESModule(
                policy, run_func,
                population_size=args.population_size,  # HYPERPARAMETER
                sigma=args.sigma,                       # HYPERPARAMETER
                learning_rate=args.lr,                  # HYPERPARAMETER TODO: CHANGE
                threadcount=args.population_size)
        elif args.alg == 'PPO':
            run_func = partial(envs.run_env_PPO, env_func=env_func)  # TODO: update
            alg = PPOModule(
                policy, run_func,
                n_updates=args.n_updates,    # HYPERPARAMETER
                batch_size=args.batch_size,  # HYPERPARAMETER
                max_steps=args.max_steps,
                gamma=args.gamma,
                clip=args.clip,
                ent_coeff=args.ent_coeff,
                learning_rate=args.lr)       # TODO: CHANGE
        elif args.alg == 'ESPPO':
            run_func = partial(envs.run_env_PPO, env_func=env_func)
            alg = ESPPOModule(
                policy, run_func,
                population_size=args.population_size,  # HYPERPARAMETER
                sigma=args.sigma,                       # HYPERPARAMETER
                n_updates=args.n_updates,               # HYPERPARAMETER
                batch_size=args.batch_size,             # HYPERPARAMETER
                max_steps=args.max_steps,
                gamma=args.gamma,
                clip=args.clip,
                ent_coeff=args.ent_coeff,
                n_seq=args.n_seq,
                ppo_learning_rate=args.ppo_lr,
                es_learning_rate=args.es_lr,
                threadcount=args.population_size)
        elif args.alg == 'MAXPPO':
            run_func = partial(envs.run_env_PPO, env_func=env_func)
            alg = MaxPPOModule(
                policy, run_func,
                population_size=args.population_size,  # HYPERPARAMETER
                sigma=args.sigma,                       # HYPERPARAMETER
                n_updates=args.n_updates,               # HYPERPARAMETER
                batch_size=args.batch_size,             # HYPERPARAMETER
                max_steps=args.max_steps,
                gamma=args.gamma,
                clip=args.clip,
                ent_coeff=args.ent_coeff,
                n_seq=args.n_seq,
                ppo_learning_rate=args.ppo_lr,
                threadcount=args.population_size)
        elif args.alg == 'ALTPPO':
            run_func = partial(envs.run_env_PPO, env_func=env_func)
            alg = AltPPOModule(
                policy, run_func,
                population_size=args.population_size,  # HYPERPARAMETER
                sigma=args.sigma,                       # HYPERPARAMETER
                n_updates=args.n_updates,               # HYPERPARAMETER
                batch_size=args.batch_size,             # HYPERPARAMETER
                max_steps=args.max_steps,
                gamma=args.gamma,
                clip=args.clip,
                ent_coeff=args.ent_coeff,
                n_alt=args.n_alt,
                es_learning_rate=args.es_lr,
                ppo_learning_rate=args.ppo_lr,
                threadcount=args.population_size)

        if args.render:
            # Render-only mode: load saved weights, roll out once, then exit.
            with open(os.path.join(args.directory, 'weights.pkl'), 'rb') as fp:
                weights = pickle.load(fp)
            policy.load_state_dict(weights)
            if args.alg == 'ES':
                total_reward = run_func(weights, stochastic=False, render=True)
            else:
                total_reward = run_func(policy, stochastic=False, render=True, reward_only=True)
            print(f"Total reward from episode: {total_reward}")
            return

        exp_dir = os.path.join(args.directory, alg.model_name)
        if not os.path.exists(exp_dir):
            os.makedirs(exp_dir)

        start = time.time()
        consecutive_goal_count = 0
        iteration = 0
        rewards = []
        while True:
            if iteration >= max_iteration:
                break
            weights = alg.step()
            if (iteration + 1) % 10 == 0:
                # Periodic deterministic evaluation.
                if args.alg == 'ES':
                    test_reward = run_func(weights, stochastic=False, render=False)
                else:
                    test_reward = run_func(policy, stochastic=False, render=False, reward_only=True)
                rewards.append(test_reward)
                print('iter %d. reward: %f' % (iteration + 1, test_reward))
                # Stop early once the reward goal is hit consistently.
                if consecutive_goal_max and reward_goal:
                    consecutive_goal_count = consecutive_goal_count + 1 if test_reward >= reward_goal else 0
                    if consecutive_goal_count >= consecutive_goal_max:
                        break
            iteration += 1

        end = time.time() - start
        if args.alg == 'ES':
            total_reward = run_func(weights, stochastic=False, render=False)
        else:
            total_reward = run_func(policy, stochastic=False, render=False, reward_only=True)
        all_rewards.append(rewards)
        all_times.append(end)
        all_totals.append(total_reward)
        print(f"Reward from final weights: {total_reward}")
        print(f"Time to completion: {end}")

    # Pad trials that stopped early so the reward curves have equal length.
    max_len = 0
    for rewards in all_rewards:
        if len(rewards) > max_len:
            max_len = len(rewards)
    for rewards in all_rewards:
        while len(rewards) < max_len:
            rewards.append(reward_goal)
    all_rewards = np.array(all_rewards)
    rewards_mean = np.mean(all_rewards, axis=0)
    rewards_std = np.std(all_rewards, axis=0)
    total_mean = np.mean(all_totals)
    time_mean = np.mean(all_times)

    # Plot mean reward with standard-deviation error bars across trials.
    plt.errorbar(np.arange(max_len), rewards_mean, yerr=rewards_std, label='rewards')
    plt.legend(loc=4)
    plt.grid(True)
    plt.tight_layout()
    path = os.path.join(exp_dir, "rewards_plot.png")
    plt.savefig(path)
    plt.close()

    np.savetxt(os.path.join(exp_dir, 'rewards.txt'), rewards_mean)
    pickle.dump(weights, open(os.path.join(exp_dir, 'weights.pkl'), 'wb'))

    out_file = open(os.path.join(exp_dir, "results.txt"), 'w')
    print(f"Average rewards from final weights: {total_mean}")
    msg = f"Average rewards from final weights: {total_mean}\n"
    print(f"Average time to completion: {time_mean}")
    msg += f"Average time to completion: {time_mean}\n"
    print(f"Results saved at: {exp_dir}")
    out_file.write(msg)
    out_file.flush()
    out_file.close()
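# Hedged usage note, not part of the original file: the training entry point is
# normally wired up with a standard __main__ guard like the one below. The
# example command assumes the flag names defined in options.parse_args() match
# the attribute names read above (args.alg, args.epoch, args.n_trials, ...),
# which is an assumption rather than a guarantee.
if __name__ == '__main__':
    main()

# Example invocation (hypothetical flag spellings):
#   python train.py --alg ES --epoch 1000 --n_trials 3 \
#       --population_size 64 --sigma 0.1 --lr 0.01 --directory ./results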
from visdom import Visdom

viz = Visdom()

# Build Environment Template -> Lazy Evaluated Callable, for spawning environments
env_template = build_env(args.env)

# Build Distributed Environments
envs = get_distributed_backend(env_template, args.num_processes,
                               backend=args.distributed_backend)

# Obtain Environment metadata
metadata = envs.get_metadata()

# Instantiate Policy
policy = get_policy(args.policy, metadata)

# Create agent, with the given training algorithm
agent = get_algorithm(args.algorithm, policy, envs, args, visdom=viz)

# Create Experience Buffer, with the environment metadata
experience = Experience(metadata['max_episode_length'], args.num_processes,
                        metadata['obs_shape'], metadata['action_type'],
                        metadata['action_shape'])

# Train agent
agent.train(num_frames=args.num_frames)

import IPython
IPython.embed()
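# Hedged sketch of the "environment template" idea used above: build_env()
# presumably returns a zero-argument callable that constructs a fresh
# environment inside each worker process. A minimal stand-in using
# functools.partial and gym (the env id 'CartPole-v1' is illustrative) could
# look like this; it is not the repo's actual build_env implementation.
from functools import partial
import gym

def build_env_sketch(env_id):
    """Return a callable that lazily creates the environment when invoked."""
    return partial(gym.make, env_id)

make_env = build_env_sketch('CartPole-v1')
env = make_env()      # the environment is only instantiated here
obs = env.reset()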
def main(): print("line 1...") args = setup_utils.setup_and_load() print("passed line 1...") comm = MPI.COMM_WORLD print("passed line 2...") rank = comm.Get_rank() #the rank of the process in a communicator print("passed line 3...") seed = int(time.time()) % 10000 set_global_seeds(seed * 100 + rank) print("passed line 4,5...") # utils.setup_mpi_gpus() print("passed line 6...") config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 # nenvs = Config.NUM_ENVS nenvs = 1 #set to 1 temporarily # frame_stack_size = Config.FRAME_STACK_SIZE frame_stack_size = 1 total_timesteps = int(5e6) save_interval = args.save_interval env_id = "MsPacman-v0" #copy from https://github.com/openai/baselines/blob/52255beda5f5c8760b0ae1f676aa656bb1a61f80/baselines/run.py#L33 _game_envs = defaultdict(set) for env in gym.envs.registry.all(): # TODO: solve this with regexes env_type = env._entry_point.split(':')[0].split('.')[-1] _game_envs[env_type].add(env.id) # env = make_vec_env(env_id, env_type, nenvs, seed, gamestate=args.gamestate, reward_scale=args.reward_scale) """ save_interval = args.save_interval env = utils.make_general_env(nenvs, seed=rank) with tf.Session(config=config): env = wrappers.add_final_wrappers(env) policy = policies.get_policy() """ env = utils.make_general_env(env_id, env_type, nenvs, seed) # env = make_vec_env(env_id, env_type, nenvs, seed) # env = VecFrameStack(env, frame_stack_size) # env = utils.make_general_env(nenvs, seed=rank) with tf.Session(config=config): # env = wrappers.add_final_wrappers(env) #don't use wrappers anymore env = wrappers.add_final_wrappers(env) policy = policies.get_policy() # ppo2.learn(policy=policy, # env=env, # save_interval=save_interval, # nsteps=Config.NUM_STEPS, # nminibatches=Config.NUM_MINIBATCHES, # lam=0.95, # gamma=Config.GAMMA, # noptepochs=Config.PPO_EPOCHS, # log_interval=1, # ent_coef=Config.ENTROPY_COEFF, # lr=lambda f : f * Config.LEARNING_RATE, # cliprange=lambda f : f * 0.2, # total_timesteps=total_timesteps) ppo2.learn(policy=policy, env=env, save_interval=save_interval, nsteps=int(1000), nminibatches=100, lam=0.95, gamma=0.9, noptepochs=16, log_interval=1, ent_coef=0.1, lr=lambda f: f * 3e-4, cliprange=lambda f: f * 0.2, total_timesteps=total_timesteps)