def make_vec_envs(env_name, seed, num_processes, num_frame_stack=1,
                  downsample=True, color=False, gamma=0.99, log_dir='./tmp/',
                  device=torch.device('cpu')):
    Path(log_dir).mkdir(parents=True, exist_ok=True)
    envs = [
        make_env(env_name, seed, i, log_dir, downsample, color)
        for i in range(num_processes)
    ]
    if len(envs) > 1:
        envs = SubprocVecEnv(envs, context='fork')
    else:
        envs = DummyVecEnv(envs)
    if len(envs.observation_space.shape) == 1:
        if gamma is None:
            envs = VecNormalize(envs, ret=False)
        else:
            envs = VecNormalize(envs, gamma=gamma)
    envs = VecPyTorch(envs, device)
    if num_frame_stack > 1:
        envs = VecPyTorchFrameStack(envs, num_frame_stack, device)
    return envs
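# Standalone sketch of the thunk pattern the helpers in this file build on:
# SubprocVecEnv takes a list of zero-argument constructors and runs each
# returned environment in its own worker process. Assumes OpenAI Baselines and
# gym are installed; 'CartPole-v1' and the seeds are illustrative values only.
import gym
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv


def _make_thunk(seed):
    def _thunk():
        env = gym.make('CartPole-v1')
        env.seed(seed)
        return env

    return _thunk


if __name__ == '__main__':
    venv = SubprocVecEnv([_make_thunk(s) for s in range(4)])
    obs = venv.reset()  # batched observations, one row per worker
    actions = [venv.action_space.sample() for _ in range(4)]
    obs, rewards, dones, infos = venv.step(actions)  # each result is batched across workers
    venv.close()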
def main():
    def make_env():
        obs_type = retro.Observations.IMAGE  # retro.Observations.RAM
        env = retro.make(game=game, state=state, scenario=scenario,
                         record=record, players=players, obs_type=obs_type)
        # env = retro.make(game=game, state=state, scenario=scenario)
        env = wrap_deepmind_retro(env)
        return env

    venv = SubprocVecEnv([make_env] * 8)
    ppo2.learn(
        network='cnn',
        env=venv,
        total_timesteps=int(1e6),
        nsteps=128,
        nminibatches=4,
        lam=0.95,
        gamma=0.99,
        noptepochs=4,
        log_interval=1,
        ent_coef=.01,
        lr=lambda f: f * 2.5e-4,
        cliprange=0.1,
    )
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--game', default='Airstriker-Genesis')
    parser.add_argument('--state', default=retro.State.DEFAULT)
    parser.add_argument('--scenario', default=None)
    args = parser.parse_args()

    def make_env():
        env = make_retro(game=args.game, state=args.state, scenario=args.scenario)
        env = wrap_deepmind_retro(env)
        return env

    venv = SubprocVecEnv([make_env] * 8)
    ppo2.learn(
        network='cnn',
        env=venv,
        total_timesteps=int(100e6),
        nsteps=128,
        nminibatches=4,
        lam=0.95,
        gamma=0.99,
        noptepochs=4,
        log_interval=1,
        ent_coef=.01,
        lr=lambda f: f * 2.5e-4,
        cliprange=0.1,
    )
def main():
    def make_env():
        obs_type = retro.Observations.IMAGE  # retro.Observations.RAM
        env = retro.make(game=game, state=state, scenario=scenario,
                         record=record, players=players, obs_type=obs_type)
        # env = retro.make(game=game, state=state, scenario=scenario)
        print(retro.__path__)
        env = wrap_deepmind_retro(env)
        return env

    base_dirname = os.path.join(currentdir, "results")
    demo_dir = os.path.join(currentdir, "human_data/demonstrations")
    demo_fname = os.path.join(demo_dir, "human_demonstration_10.pkl")
    if not os.path.exists(base_dirname):
        os.mkdir(base_dirname)
    dir_name = "pitfall_ppo2"
    dir_name += dir_note
    dir_name = addDateTime(dir_name)
    dir_name = os.path.join(base_dirname, dir_name)
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)

    with open(demo_fname, "rb") as f:
        demos = pickle.load(f)
    valid_demos = []
    for demo in demos:
        action, score = demo
        valid_action = np.array(action, dtype=np.float32).reshape(1, -1)
        valid_demos.append(valid_action)

    venv = SubprocVecEnv([make_env] * 1)
    performance = ppo2.learn(network='cnn',
                             env=venv,
                             total_timesteps=int(2e5),
                             nsteps=32,
                             nminibatches=4,
                             lam=0.95,
                             gamma=0.99,
                             noptepochs=16,
                             log_interval=10,
                             save_interval=500,
                             ent_coef=.02,
                             lr=lambda f: f * 3e-4,
                             cliprange=0.2,
                             base_path=dir_name,
                             use_demo=True,
                             demos=valid_demos,
                             render_env=False)
    performance_fname = os.path.join(dir_name, "performance.p")
    with open(performance_fname, "wb") as f:
        pickle.dump(performance, f)
def main():
    def make_env():
        obs_type = retro.Observations.IMAGE  # retro.Observations.RAM
        env = retro.make(game='Pitfall-Atari2600', state=retro.State.DEFAULT,
                         scenario='scenario', record='.', players=1,
                         obs_type=obs_type)
        env = wrap_deepmind_retro(env)
        return env

    base_dirname = os.path.join(currentdir, "results")
    # dir_name = "pitfall_ppo2_rl_baseline1"
    dir_name = "pitfall_ppo2testing_D191211_073544"
    dir_name = os.path.join(base_dirname, dir_name)
    load_path = os.path.join(dir_name, 'models/00781')

    venv = SubprocVecEnv([make_env] * 1)  # Vectorized
    network = 'cnn'
    policy = build_policy(venv, network)

    nenvs = venv.num_envs  # Get the number of envs
    # Get state_space and action_space
    ob_space = venv.observation_space
    ac_space = venv.action_space

    # Instantiate the model object
    model_fn = Model
    nsteps = 2048
    nbatch = nenvs * nsteps
    nminibatches = 4
    nbatch_train = nbatch // nminibatches
    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space,
                     nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=2048,
                     ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5)
    model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env=venv, model=model, nsteps=nsteps, gamma=0.99, lam=0.95)

    # Run the Runner and record video
    total_timesteps = int(1e4)
    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        print("progress: ", update, "/", nupdates)
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run()
def make_vec_env(nenvs=4, recurrent=False, grayscale=True, frame_stack=4, frame_diff=False):
    # Bind `rank` as a default argument so each worker gets its own rank
    # (a bare `lambda: make_env(rank, ...)` would capture only the final loop value).
    venv = SubprocVecEnv([
        lambda rank=rank: make_env(rank, grayscale=grayscale)
        for rank in range(nenvs)
    ])
    # Uncomment this line in place of the one above for debugging.
    # venv = DummyVecEnv([lambda: make_env(0)])
    if not recurrent:
        if frame_diff:
            venv = VecFrameDiff(venv)
        else:
            # Perform the frame stack at the vectorized environment level as opposed to at
            # the individual environment level. I think this allows you to communicate fewer
            # images across processes.
            venv = VecFrameStack(venv, frame_stack)
    return venv
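# Why the default-argument binding in make_vec_env above matters: Python
# closures capture variables, not values, so a bare `lambda: make_env(rank)`
# inside the comprehension would see only the final rank. A dependency-free
# illustration (not from the original code):
late_bound = [lambda: rank for rank in range(4)]
default_bound = [lambda rank=rank: rank for rank in range(4)]
print([f() for f in late_bound])     # [3, 3, 3, 3] -- every lambda reads the last rank
print([f() for f in default_bound])  # [0, 1, 2, 3] -- each lambda keeps its own rank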
def get_envs(factory: EnvFactory):
    num_envs = len(os.sched_getaffinity(0))
    env = factory.make_env()

    def make_env():
        def _thunk():
            env = factory.make_env()
            return env

        return _thunk

    envs = [make_env() for _ in range(num_envs)]
    envs = SubprocVecEnv(envs)
    return env, envs
def make_envs(env_id, device, seed=0, num_envs=1, frame_stack=1, **kwargs):
    envs = [
        env_generator(env_id, seed=seed + 1000 * i, **kwargs)
        for i in range(num_envs)
    ]
    if len(envs) > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)
    envs = VecPyTorch(envs, device)
    if frame_stack > 1:
        envs = VecPyTorchFrameStack(envs, frame_stack, device)
    return envs
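# Hedged usage sketch for make_envs above (values are illustrative, not from
# the original source). It assumes the module-level env_generator(env_id,
# seed=..., **kwargs) returns a zero-argument environment constructor, which
# is the contract SubprocVecEnv and DummyVecEnv expect.
#
#   device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#   envs = make_envs('PongNoFrameskip-v4', device, seed=0, num_envs=8, frame_stack=4)
#   obs = envs.reset()   # batched torch tensor on `device`, one row per environment
#   envs.close()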
def make_vec_envs(env_name, seed, num_processes, num_frame_stack=1,
                  downsample=True, color=False, gamma=0.99, log_dir='./tmp/',
                  device=torch.device('cpu'), use_extended_wrapper=False,
                  train_mode="train_encoder"):
    try:
        Path(log_dir).mkdir(parents=True, exist_ok=True)
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise
        pass

    envs = [
        make_env(env_name, seed, i, log_dir, downsample, color,
                 frame_stack=num_frame_stack,
                 use_extended_wrapper=use_extended_wrapper,
                 train_mode=train_mode) for i in range(num_processes)
    ]
    if len(envs) > 1:
        envs = SubprocVecEnv(envs, context='fork')
    else:
        envs = DummyVecEnv(envs)
    if len(envs.observation_space.shape) == 1:
        if gamma is None:
            envs = VecNormalize(envs, ret=False)
        else:
            envs = VecNormalize(envs, gamma=gamma)
    envs = VecPyTorch(envs, device)
    # if num_frame_stack > 1:
    #     envs = VecPyTorchFrameStack(envs, num_frame_stack, device)
    return envs
def main(arguments: argparse.Namespace) -> None:
    """
    Main training loop.

    :param arguments: User input
    :return:
    """
    n_steps = arguments.steps
    n_agents = arguments.envs
    print(f'Training {arguments.game} using {"cpu" if arguments.cpu else "gpu"}')
    print(f'Number of concurrent environments {arguments.envs}')
    print(f'Number of steps per batch {arguments.steps}')
    if arguments.model:
        print(f'Using existing model {arguments.model}')

    env = SubprocVecEnv(
        [make_env(env_id=arguments.game, rank=i) for i in range(n_agents)])
    agent = DeepLearningAgent(observation_space=env.observation_space,
                              action_space=int(env.action_space.n),
                              n_envs=n_agents,
                              n_steps=n_steps,
                              model_path=arguments.model,
                              use_cpu=arguments.cpu)

    # This is the current state (or observation)
    observations = reshape_observations(env.reset())
    actions = agent.get_action(observations)
    initial_training_time = time.time()
    for ep in range(EPISODES):
        # Reset the frame counter each time the batch size is complete
        for i in range(n_steps):
            new_observations, rewards, done, info = env.step(actions.cpu().numpy())
            new_observations = reshape_observations(new_observations)
            agent.train(s=observations,
                        r=rewards,
                        s_next=new_observations,
                        a=actions,
                        done=done,
                        step=i)
            actions = agent.get_action(new_observations)
            observations = new_observations
        if ep % 100 == 0:
            fps = ((ep + 1) * n_steps * n_agents) / (time.time() - initial_training_time)
            print(f'FPS {fps}')
    env.close()
def main():
    # Alter reward in scenario.json
    # (C:\Users\Fergus\Anaconda3\envs\AIGym\Lib\site-packages\retro\data\stable\SonicTheHedgehog-Genesis)
    env = SubprocVecEnv([make_env_3])
    obs = env.reset()
    # env = make_env_3()
    # env2 = make_env_4()
    print(env.observation_space)
    print(env.action_space.n)
    print(obs.shape)
    print(obs[0].shape)
    # obs = env2.reset()

    rew_mb = []
    dones_mb = []
    obs_mb = []
    step = 0
    while True:
        action = env.action_space.sample()
        obs, rew, done, info = env.step([0])
        print("Step {} Reward: {}, Done: {}".format(step, rew, done))
        rew_mb.append(rew)
        dones_mb.append(done)
        obs_mb.append(obs)
        env.render()
        step += 1
        # obs = obs[1] / 255.
        # for i in range(4):
        #     cv2.imshow('GrayScale' + str(i), np.squeeze(obs[:, :, i]))
        #     cv2.waitKey(1)
        if done[0]:
            env.close()
            break

    rew_mb = np.array(rew_mb)
    dones_mb = np.array(dones_mb)
    obs_mb = np.array(obs_mb)
    print("Rewards: ", rew_mb)
    print(rew_mb.shape)
    print(dones_mb)
    print(dones_mb.shape)
    print(obs_mb.shape)
def main():
    def make_env():
        obs_type = retro.Observations.IMAGE  # retro.Observations.RAM
        env = retro.make(game=game, state=state, scenario=scenario,
                         record=record, players=players, obs_type=obs_type)
        # env = retro.make(game=game, state=state, scenario=scenario)
        print(retro.__path__)
        env = wrap_deepmind_retro(env)
        return env

    base_dirname = os.path.join(currentdir, "results")
    if not os.path.exists(base_dirname):
        os.mkdir(base_dirname)
    dir_name = "pitfall_ppo2"
    dir_name += dir_note
    dir_name = addDateTime(dir_name)
    dir_name = os.path.join(base_dirname, dir_name)
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)

    venv = SubprocVecEnv([make_env] * 8)
    performance = ppo2.learn(
        network='cnn',
        env=venv,
        total_timesteps=int(2e5),
        nsteps=32,
        nminibatches=4,
        lam=0.95,
        gamma=0.99,
        noptepochs=16,
        log_interval=10,
        save_interval=500,
        ent_coef=.02,
        lr=lambda f: f * 3e-4,
        cliprange=0.2,
        base_path=dir_name
    )
    performance_fname = os.path.join(dir_name, "performance.p")
    with open(performance_fname, "wb") as f:
        pickle.dump(performance, f)
def make_rl_envs(env_id, seed, n_envs, device, frame_stack=4,
                 add_video=False, add_frames=False, vid_path=None, **kwargs):
    envs = [env_generator(env_id, seed=seed + 1000 * i) for i in range(n_envs)]
    if len(envs) > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)
    if add_video:
        assert vid_path is not None
        envs = VecConcatVideo(envs, vid_path, ordered=True)
    elif add_frames:
        assert vid_path is not None
        envs = VecConcatVideo(envs, vid_path, ordered=False)
    envs = VecPyTorch(envs, device)
    if frame_stack > 1:
        envs = VecPyTorchFrameStack(envs, frame_stack, device)
    return envs
def make_vec_env(nenvs=4, recurrent=False, grayscale=True, frame_stack=4, num_agents=2):
    # Bind `rank` as a default argument so each worker gets its own rank
    # (a bare lambda would capture only the final loop value).
    venv = SubprocVecEnv([
        lambda rank=rank: make_env(rank, grayscale=grayscale, num_agents=num_agents)
        for rank in range(nenvs)
    ])
    # Uncomment this line in place of the one above for debugging.
    # venv = DummyVecEnv([lambda: make_env(0)])
    if not recurrent:
        # Perform the frame stack at the vectorized environment level as opposed to at
        # the individual environment level. I think this allows you to communicate fewer
        # images across processes.
        venv = VecFrameStack(venv, frame_stack)
    venv = MultiAgentToSingleAgent(venv, num_agents=num_agents)
    venv = VecMonitor(venv, filename=monitor_filepath)
    return venv
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--game', default='Airstriker-Genesis')
    parser.add_argument('--state', default=retro.State.DEFAULT)
    parser.add_argument('--scenario', default=None)
    args = parser.parse_args()

    def make_env():
        # To record gameplay video, pass record='.' to make_retro...
        env = make_retro(game=args.game, state=args.state,
                         scenario=args.scenario, record='.')
        env = wrap_deepmind_retro(env)
        env = Monitor(env, None, True)
        # ...and run the rollout below to generate the recording.
        env.reset()
        while True:
            _obs, _rew, done, _info = env.step(env.action_space.sample())
            if done:
                break
        return env

    venv = SubprocVecEnv([make_env] * 8)
    ppo2.learn(
        network='cnn',
        env=venv,
        total_timesteps=int(100e6),
        nsteps=128,
        nminibatches=4,
        lam=0.95,
        gamma=0.99,
        noptepochs=4,
        log_interval=1,
        ent_coef=.01,
        lr=lambda f: f * 2.5e-4,
        cliprange=0.1,
        load_path='/home/dhodwo/PycharmProjects/untitled/check_pts/checkpoints/1'
    )
def main7(): retro.data.add_custom_integration("custom") def wrap_deepmind_n64(env, reward_scale=1 / 100.0, frame_stack=1, grayscale=False): env = MaxAndSkipEnv(env, skip=4) env = WarpFrame(env, width=150, height=100, grayscale=grayscale) env = FrameStack(env, frame_stack) env = ScaledFloatFrame(env) env = RewardScaler(env, scale=1 / 100.0) return env def make_env(): retro.data.add_custom_integration("custom") env = retro.n64_env.N64Env(game="SuperSmashBros-N64", use_restricted_actions=retro.Actions.MULTI_DISCRETE, inttype=retro.data.Integrations.CUSTOM, obs_type=retro.Observations.IMAGE) env = wrap_deepmind_n64(env) return env gpu_options = tf.GPUOptions(allow_growth=True) sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) nenvs = 2 # env = DummyVecEnv([make_env] * nenvs) env = SubprocVecEnv([make_env] * nenvs) network_name = "impala_cnn_lstm" policy = build_policy(env, network_name) recurrent = "lstm" in network_name ob_space = env.observation_space ac_space = env.action_space nsteps = 10 nminibatches = 2 nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, comm=None, mpi_rank_weight=1) runner = Runner(env=env, model=model, nsteps=10, gamma=.99, lam=.95) env.reset() num_steps = 20000 action = [np.array([0, 0, 0]), np.array([0, 0, 0])] for i in range(num_steps): sys.stdout.write(f"\r{i+1} / {num_steps}") action = [env.action_space.sample() for _ in range(nenvs)] obs, reward, dones, info = env.step(action) # env.reset(dones) # env.render() if i % 50 == 0: if recurrent: fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(20, 12)) else: fig, axs = plt.subplots(nrows=4, ncols=2, figsize=(20, 12)) for env_index in range(nenvs): if recurrent: axs[env_index].imshow(obs[env_index, :, :, :]) else: for j in range(4): row = env_index * 2 + j // 2 col = j % 2 print(row) print(col) axs[row, col].imshow(obs[env_index, :, :, j]) plt.show() plt.close() end = time.time() print(end - start) return env
def main6(): retro.data.add_custom_integration("custom") def wrap_deepmind_n64(env, reward_scale=1 / 100.0, frame_stack=1): env = MaxAndSkipEnv(env, skip=4) env = WarpFrame(env, width=450, height=300, grayscale=False) env = FrameStack(env, frame_stack) env = ScaledFloatFrame(env) env = RewardScaler(env, scale=reward_scale) return env def make_env(): retro.data.add_custom_integration("custom") state = "ssb64.pikachu.level9dk.dreamland.state" env = retro.n64_env.N64Env(game="SuperSmashBros-N64", use_restricted_actions=retro.Actions.MULTI_DISCRETE, inttype=retro.data.Integrations.CUSTOM, obs_type=retro.Observations.IMAGE, state=state) env = wrap_deepmind_n64(env) return env gpu_options = tf.GPUOptions(allow_growth=True) sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) # env = make_env() env = SubprocVecEnv([make_env] * 1) # env = DummyVecEnv([make_env] * 1) env.reset() num_steps = 20000 # action = [np.array([0, 0, 0])] # action = [env.action_space.sample() for _ in range(2)] for i in range(num_steps): sys.stdout.write(f"\r{i+1} / {num_steps}") # action = env.action_space.sample() action = [env.action_space.sample() for _ in range(1)] obs, reward, done, info = env.step(action) print(f"\nreward: {reward} done: {done}") # input() if (isinstance(done, bool) and done) or (isinstance(done, list) and all(done)): env.reset() # env.render() if i % 50 == 0: image = Image.fromarray((obs[0] * 255).astype(np.uint8)) image.save("/home/wulfebw/Desktop/color.png") plt.imshow(obs[0, :, :, 0]) # fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12)) # for j in range(1): # row = j // 2 # col = j % 2 # print(row) # print(col) # axs[row, col].imshow(obs[:, :]) plt.show() plt.close() end = time.time() print(end - start) return env
def main(hParams, n_run, total_timesteps):
    nsteps = hParams['N_STEPS']
    n_epochs = hParams['N_EPOCHS']
    n_train = 4
    n_minibatch = 8
    log_loss_int = 1
    save_int = 5
    test_int = 10
    test_episodes = 5
    gamma = 0.95
    lr = hParams[HP_LEARNING_RATE]
    vf_coef = hParams[HP_VF_COEF]
    ent_coef = hParams[HP_ENT_COEF]
    save_dir = 'lr' + str(lr) + 'vc' + str(vf_coef) + 'ec' + str(ent_coef)
    testenvfn = SonicEnv.make_env_3
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = 'logs/sonic_long_test/run-' + str(n_run)
    summ_writer = tf.summary.create_file_writer(log_dir)

    env = SubprocVecEnv([SonicEnv.make_env_3])
    nenv = env.num_envs
    state_size = env.observation_space.shape
    num_actions = env.action_space.n
    pgnet = PGNetwork(state_size, num_actions, lr=lr, vf_coef=vf_coef, ent_coef=ent_coef)

    # Runner used to create training data
    runner = SonicEnvRunner(env, pgnet, nsteps, gamma)

    # total_timesteps = int(n_epochs * nsteps * nenv)
    nbatch = nenv * nsteps
    print("Total updates to run: ", total_timesteps // nbatch)
    for update in range(1, total_timesteps // nbatch + 1):
        print("\nUpdate #{}:".format(update))
        states_mb, actions_mb, values_mb, rewards_mb, next_dones_mb = runner.run()

        for _ in range(n_train):
            indices = np.arange(nbatch)
            np.random.shuffle(indices)
            for start in range(0, nbatch, nbatch // n_minibatch):
                end = start + nbatch // n_minibatch
                bind = indices[start:end]
                policy_loss, entropy_loss, vf_loss, loss = pgnet.fit_gradient(
                    states_mb[bind], actions_mb[bind], rewards_mb[bind], values_mb[bind])

        WeightWriter(summ_writer, pgnet, (Conv2D, Dense), global_step=update)
        r2 = 1 - (np.var(rewards_mb - values_mb) / np.var(rewards_mb))
        with summ_writer.as_default():
            tf.summary.scalar("PolicyLoss", policy_loss, step=update)
            tf.summary.scalar("EntropyLoss", entropy_loss, step=update)
            tf.summary.scalar("ValueFunctionLoss", vf_loss, step=update)
            tf.summary.scalar("Loss", loss, step=update)
            tf.summary.scalar("R-squared", r2, step=update)

        if update % log_loss_int == 0:
            print("PolicyLoss:", policy_loss)
            print("EntropyLoss: ", entropy_loss)
            print("ValueFunctionLoss: ", vf_loss)
            print("Loss: ", loss)
        if update % save_int == 0:
            pgnet.model.save_weights('sonic_long_test/' + save_dir + '/my_checkpoint')
            print("Model Saved")
        if update % test_int == 0:
            TestRewardWriter(summ_writer, testenvfn, pgnet, test_episodes, global_step=update)

    with summ_writer.as_default():
        hp.hparams(hParams)
    env.close()
def train_multi(env: gym.Env, agent: AgentBase, settings: TrainSettings):
    # Initialize variables for logging.
    scores = ContiguousRingBuffer(capacity=128)
    max_avg_score = -np.inf

    # Ensure settings.directory exists for logging / saving.
    os.makedirs(settings.directory, exist_ok=True)

    # Optionally load from existing checkpoint.
    if settings.load:
        agent.load(settings.load)

    # Instantiate vectorized environment.
    if isinstance(env, SubprocVecEnv):
        # No further action is required.
        pass
    elif isinstance(env, gym.Env):
        # Cannot broadcast a single pre-built environment across subprocesses.
        logger.error("Unable to broadcast single environment {}".format(env))
    else:
        # Assume that env is a constructor function.
        env = SubprocVecEnv(
            [functools.partial(env, i) for i in range(settings.num_env)])

    # Initialize handlers for data collection.
    total_rewards = np.zeros(settings.num_env, dtype=np.float32)
    dones = np.zeros(settings.num_env, dtype=np.uint8)
    states = env.reset()

    # FIXME(yycho0108): EPS should be configurable.
    # eps = LinearEpsilon(0.8 * settings.num_episodes)
    eps = ExponentialEpsilon(0.99, 0.05, 0.8 * settings.num_episodes, True)

    i_episode = 0
    pbar = tqdm(total=settings.num_episodes)
    while i_episode < settings.num_episodes:
        # Reset the environments that are done, so that at each moment the
        # agent is always dealing with a live state.
        # SubprocVecEnv.reset() does not allow granular control.
        for s, d, e in zip(states, dones, env.remotes):
            if not d:
                continue
            e.send(('reset', None))
            # FIXME(yycho0108): Applying a reshape here as e.recv()
            # was seen to return a list for whatever reason.
            # May silently allow an error to pass through.
            s[:] = np.reshape(e.recv(), s.shape)
        scores.extend(total_rewards[dones == True])
        total_rewards[dones == True] = 0.0
        num_done = dones.sum()
        dones[:] = False

        # Process each state and interact with each env.
        actions = agent.select_action(states, eps(i_episode))
        next_states, rewards, dones, _ = env.step(actions)
        agent.step(states, actions, rewards, next_states, dones)
        total_rewards += rewards
        states = next_states

        # Increment episode counts accordingly.
        pbar.set_postfix(score=np.mean(scores.array))

        # Optionally enable printing episode statistics.
        # The logging happens at each crossing of the discretized log-period boundary.
        if count_boundaries(i_episode, num_done, settings.log_period) > 0:
            # Compute statistics.
            avg_score = np.mean(scores.array)
            if avg_score > max_avg_score:
                max_avg_score = avg_score

            # Print statistics.
            logger.info(
                "Episode {}/{} | Max Avg: {:.2f} | Eps : {:.2f}".format(
                    i_episode, settings.num_episodes, max_avg_score, eps(i_episode)))
            if isinstance(agent.memory, PrioritizedReplayBuffer):
                logger.info('mp : {} vs {}'.format(
                    agent.memory.max_priority,
                    agent.memory.memory.array['priority'].max()))

        # Save agent checkpoint as well.
        if count_boundaries(i_episode, num_done, settings.save_period) > 0:
            agent.save(settings.directory, i_episode + num_done)

        i_episode += num_done
        pbar.update(num_done)

    pbar.close()

    # Save results and return.
    agent.save(settings.directory)
    return scores
def main():
    os.environ['OPENAI_LOGDIR'] = LOG_PATH
    number_of_environments = 1
    venv = SubprocVecEnv([make_sf2_env] * number_of_environments)
    video_path = './recording'
    video_length = 5 * 60 * FPS
    venv = VecVideoRecorder(
        venv,
        video_path,
        record_video_trigger=lambda step: step % video_length == 0,
        video_length=video_length)
    # ppo2.learn(
    #     network='mlp',
    #     env=venv,
    #     # eval_env=venv,
    #     total_timesteps=40000000,
    #     nsteps=128,  # 5 * FPS,
    #     nminibatches=number_of_environments,
    #     lam=0.95,
    #     gamma=0.99,
    #     noptepochs=3,
    #     log_interval=1000,
    #     ent_coef=.01,
    #     lr=lambda alpha: 2.5e-4 * alpha,
    #     vf_coef=1.0,
    #     cliprange=lambda alpha: 0.1 * alpha,
    #     save_interval=1000,
    #     # load_path=MODEL_PATH,
    #     # neural network parameters
    #     activation=tf.nn.relu,
    #     num_layers=2,  # 4, 2
    #     num_hidden=48,  # 64, 64
    #     layer_norm=False
    # )
    acer.learn(
        network='mlp',  # 'impala_cnn'
        env=venv,
        total_timesteps=40000000,
        nsteps=128,  # 5 * FPS,
        q_coef=1.0,
        ent_coef=0.001,
        max_grad_norm=10,
        lr=7e-4,
        lrschedule='linear',
        rprop_epsilon=1e-5,
        rprop_alpha=0.99,
        gamma=0.99,
        log_interval=1000,
        buffer_size=50000,
        replay_ratio=4,
        replay_start=10000,
        c=10.0,
        trust_region=True,
        delta=1,
        alpha=0.99,
        # load_path=MODEL_PATH,
        save_interval=1000,
        # neural network parameters
        activation=tf.nn.relu,
        num_layers=2,  # 4, 2
        num_hidden=48,  # 64, 64
        layer_norm=False)
def make_env(env_id, rank, seed=0):
    """
    Utility function for a multiprocessed env.

    :param env_id: (str) the environment ID
    :param num_env: (int) the number of environments you wish to have in subprocesses
    :param seed: (int) the initial seed for RNG
    :param rank: (int) index of the subprocess
    """
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env

    set_global_seeds(seed)
    return _init


if __name__ == "__main__":
    envs = SubprocVecEnv([make_env(env_name, i) for i in range(num_envs)])
    env = gym.make(env_name)
    num_inputs = envs.observation_space.shape
    num_outputs = envs.action_space.shape
    model = ActorCritic(num_inputs[0], num_outputs[0]).to(device)
    if os.path.isfile(modelpath):
        model.load_state_dict(torch.load(modelpath))
    ppo = PPO(model=model, envs=envs, device=device, lr=lr, modelpath=modelpath)
    if not play_mode:
def main(hParams, n_run):
    nsteps = hParams['N_STEPS']
    nenv = hParams[HP_N_ENV]
    n_epochs = hParams['N_EPOCHS']
    total_timesteps = int(n_epochs * nsteps * nenv)
    nbatch = nenv * nsteps
    update_int = 1
    save_int = 5
    test_int = 10
    gamma = 0.99
    lr = hParams[HP_LEARNING_RATE]
    vf_coef = hParams[HP_VF_COEF]
    ent_coef = hParams[HP_ENT_COEF]
    save_dir = ('lr' + str(lr) + 'vc' + str(vf_coef) + 'ec' + str(ent_coef) +
                'env' + str(nenv))
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = 'logs/cart_hparam_tuning/run-' + str(n_run)
    summ_writer = tf.summary.create_file_writer(log_dir)

    envfn = lambda: gym.make('CartPole-v1')
    env = SubprocVecEnv([envfn] * nenv)
    state_size = env.observation_space.shape
    num_actions = env.action_space.n
    pgnet = SimplePGNet(state_size,
                        num_actions,
                        learning_rate=lr,
                        vf_coef=vf_coef,
                        ent_coef=ent_coef)
    runner = SonicEnvRunner(env, pgnet, nsteps, gamma)

    print("Total updates to run: ", total_timesteps // nbatch)
    for update in range(1, total_timesteps // nbatch + 1):
        print("\nUpdate #{}:".format(update))
        states_mb, actions_mb, values_mb, rewards_mb, next_dones_mb = runner.run()

        tf.summary.trace_on(graph=True)
        policy_loss, entropy_loss, vf_loss, loss = pgnet.fit_gradient(
            states_mb, actions_mb, rewards_mb, values_mb)
        if update == 1:
            with summ_writer.as_default():
                tf.summary.trace_export(name="grad_trace", step=0)

        WeightWriter(summ_writer, pgnet, (Conv2D, Dense), global_step=update)
        with summ_writer.as_default():
            tf.summary.scalar("PolicyLoss", policy_loss, step=update)
            tf.summary.scalar("EntropyLoss", entropy_loss, step=update)
            tf.summary.scalar("ValueFunctionLoss", vf_loss, step=update)
            tf.summary.scalar("Loss", loss, step=update)

        if update % update_int == 0:
            print("PolicyLoss:", policy_loss)
            print("EntropyLoss: ", entropy_loss)
            print("ValueFunctionLoss: ", vf_loss)
            print("Loss: ", loss)
        if update % save_int == 0:
            pgnet.model.save_weights('cart_hparams_tuning_models/' + save_dir + '/my_checkpoint')
            print("Model Saved")
        if update % test_int == 0:
            test_rewards = TestRewardWriter(summ_writer, envfn, pgnet, 20, global_step=update)
            print("Test Rewards: ", test_rewards)

    with summ_writer.as_default():
        hp.hparams(hParams)
    env.close()