def main():
    env = env_fn()
    print(env.observation_space)
    obs_size, = env.observation_space.shape
    act_size = env.action_space.n
    device = "cuda"
    policy_fn = lambda device: lambda: FCPolicy(obs_size, act_size, 512, device)
    data_store_size = 12800
    batch_size = 64
    n_envs = 8
    n_cpus = 0
    logger = make_logger("log")
    save_folder = "basic_test_save"
    run_loop(
        logger,
        lambda: DQNLearner(policy_fn("cuda"), 0.001, 0.99, logger, device),
        OccasionalUpdate(10, policy_fn("cpu")),
        lambda: StatelessActor(policy_fn("cuda")()),
        env_fn,
        Saver(save_folder),
        lambda: TransitionAdder(env.observation_space, env.action_space),
        UniformSampleScheme(data_store_size),
        data_store_size,
        batch_size,
        num_env_ids=n_envs,
        log_frequency=5,
        num_cpus=n_cpus,
        act_steps_until_learn=8000,
    )
def main():
    env = env_fn()
    cpu_count = mp.cpu_count()
    # cpu_count = 0
    num_envs = 8
    num_cpus = 0
    num_targets = 1
    model_features = 512
    data_store_size = 10000
    batch_size = 512
    max_grad_norm = 0.1
    device = "cuda"
    num_actors = 1
    max_learn_steps = 100000
    save_folder = "savedata/"

    def policy_fn_dev(device):
        policy = SACPolicy(env.observation_space, env.action_space, device)
        # load_latest(save_folder, policy)
        return policy

    priority_updater = NoUpdater()
    logger = make_logger("log")
    run_loop(
        logger,
        lambda: SACLearner(policy_fn_dev(device), gamma=0.99, T_max=max_learn_steps, logger=logger, device=device),
        OccasionalUpdate(100, lambda: policy_fn_dev("cpu")),
        lambda: StatelessActor(policy_fn_dev(device)),
        env_fn,
        Saver(save_folder),
        # MakeCPUAsyncConstructor(n_cpus),
        lambda: TransitionAdder(env.observation_space, env.action_space),
        UniformSampleScheme(data_store_size),
        data_store_size,
        batch_size,
        num_cpus=num_cpus,
        num_env_ids=num_envs,
        priority_updater=priority_updater,
        log_frequency=5,
        max_learn_steps=max_learn_steps,
        # act_steps_until_learn=10000,
        # num_actors=num_actors,
    )
def main(): n_envs = 8 env_id = "CartPole-v0" # def env_fn(): # return continuous_actions(gym.make(env_id)) env = env_fn() #print(env.observation_space) #obs_size, = env.observation_space.shape #act_size = env.action_space.n sb3_env = SpaceWrap(env) # print(sb3_env.action_space) # exit(0) n_timesteps = 1000 save_path = "log" eval_freq = 50 tensorboard_log = "" sb3_learner_fn = lambda device: TD3(env=sb3_env, tensorboard_log=tensorboard_log, policy=MlpPolicy, device=device) learner_fn = lambda: SB3LearnWrapper(sb3_learner_fn("cuda")) policy_fn = lambda: SB3Wrapper(sb3_learner_fn("cuda").policy) example_policy_fn = lambda: SB3Wrapper(sb3_learner_fn("cpu").policy) #learner = (model) learn_rate = lambda x: 0.01 #policy = SB3Wrapper(model.policy)#MlpPolicy(env.observation_space, env.action_space, learn_rate, device="cpu")) data_store_size = 12800 batch_size = 512 logger = make_logger("log") run_loop( logger, learner_fn, #A2CLearner(policy, 0.001, 0.99, logger, device), OccasionalUpdate(10, example_policy_fn()), lambda: StatelessActor(policy_fn()), env_fn, MakeCPUAsyncConstructor(4), lambda: TransitionAdder(env.observation_space, env.action_space), UniformSampleScheme(data_store_size), data_store_size, batch_size, n_envs=16, log_frequency=5)
def main():
    env = env_fn()
    cpu_count = mp.cpu_count()
    # cpu_count = 0
    num_envs = 1
    num_cpus = 0
    num_targets = 1
    data_store_size = 10000
    batch_size = 64
    device = "cpu"
    num_actors = 1
    max_learn_steps = 40000
    save_folder = "savedata/"

    def policy_fn_dev(device):
        policy = DQNPolicy(env, logger, device)
        # load_latest(save_folder, policy)
        return policy

    priority_updater = NoUpdater()
    logger = make_logger("log")
    policy = policy_fn_dev(device)
    run_loop(
        logger,
        lambda: DQNLearner(policy, logger, env.action_space.n, device=device),
        NoUpdate(),
        # OccasionalUpdate(100, lambda: policy_fn_dev("cpu")),
        lambda: StatelessActor(policy),
        env_fn,
        Saver(save_folder),
        # MakeCPUAsyncConstructor(n_cpus),
        lambda: TransitionAdder(env.observation_space, env.action_space),
        UniformSampleScheme(data_store_size),
        data_store_size,
        batch_size,
        act_steps_until_learn=1000,
        steps_per_update=1,
        num_cpus=num_cpus,
        num_env_ids=num_envs,
        priority_updater=priority_updater,
        log_frequency=5,
        max_learn_steps=max_learn_steps,
        # act_steps_until_learn=10000,
        # num_actors=num_actors,
    )
def main(): n_envs = 8 env_id = "CartPole-v0" # def env_fn(): # return continuous_actions(gym.make(env_id)) env = env_fn() #print(env.observation_space) #obs_size, = env.observation_space.shape #act_size = env.action_space.n sb3_env = SpaceWrap(env) # print(sb3_env.action_space) # exit(0) n_timesteps = 1000 save_path = "log" eval_freq = 50 tensorboard_log = "" model = TD3(env=sb3_env, tensorboard_log=tensorboard_log, policy=MlpPolicy) learner = SB3LearnWrapper(model) device = "cpu" learn_rate = lambda x: 0.01 policy = SB3Wrapper( model.policy ) #MlpPolicy(env.observation_space, env.action_space, learn_rate, device="cpu")) data_store_size = 12800 batch_size = 16 logger = make_logger("log") run_loop( logger, lambda: learner, #A2CLearner(policy, 0.001, 0.99, logger, device), NoUpdate(), #.10, policy), lambda: StatelessActor(policy), env_fn, ConcatVecEnv, lambda: TransitionAdder(env.observation_space, env.action_space), UniformSampleScheme(data_store_size), data_store_size, batch_size, n_envs=16, log_frequency=5)
def main():
    env = env_fn()
    print(env.observation_space)
    obs_size, = env.observation_space.shape
    act_size = env.action_space.n
    device = "cuda"
    policy_fn = lambda: FCPolicy(obs_size, act_size, 64, device)
    data_store_size = 128000
    batch_size = 64
    logger = make_logger("log")
    run_loop(
        logger,
        lambda: DQNLearner(policy_fn(), 0.001, 0.99, logger, device),
        OccasionalUpdate(10, FCPolicy(obs_size, act_size, 64, "cpu")),
        lambda: StatelessActor(policy_fn()),
        env_fn,
        SyncVectorEnv,
        lambda: TransitionAdder(env.observation_space, env.action_space),
        DensitySampleScheme(data_store_size),
        data_store_size,
        batch_size,
        n_envs=32,
        log_frequency=5,
    )
def main(): save_folder = "savedata/" def policy_fn_dev(device,is_learner=False): device = torch.device(device) policy = Agent(device, args, env,logger,priority_updater,is_learner=is_learner) load_latest(save_folder, policy) return policy data_store_size = 500000 batch_size = 256 args.batch_size = batch_size n_envs = 32 n_cpus = 32 priority_updater = PriorityUpdater() logger = make_logger("log") print("cpu create") print("cpu finish create") run_loop( logger, lambda: policy_fn_dev("cuda:0",is_learner=True),#DDPGLearner(policy_fn, reward_normalizer_fn, 0.001, 0.99, 0.1, logger, priority_updater, device), OccasionalUpdate(100, lambda: policy_fn_dev("cpu")), lambda: StatelessActor(policy_fn_dev("cuda:0")), env_fn, Saver(save_folder), #MakeCPUAsyncConstructor(n_cpus), lambda: TransitionAdder(env.observation_space, env.action_space), UniformSampleScheme(data_store_size),#, alpha=0.5, beta_fn=lambda x:0.), data_store_size, batch_size, act_steps_until_learn=200000, num_env_ids=n_envs, num_cpus=n_cpus, priority_updater=priority_updater, log_frequency=5., max_learn_steps=10000000, ) print("loopterm")
def main():
    env = env_fn()
    print(env.observation_space)
    obs_size, = env.observation_space.shape
    act_size = env.action_space.n
    device = "cpu"
    policy = FCPolicy(obs_size, act_size, 64, device)
    data_store_size = 12800
    batch_size = 16
    logger = make_logger("log")
    run_loop(
        logger,
        DQNLearner(policy, 0.001, 0.99, logger, device),
        OccasionalUpdate(10, policy),
        StatelessActor(policy),
        env_fn,
        ConcatVecEnv,
        lambda: TransitionAdder(env.observation_space, env.action_space),
        DensitySampleScheme(data_store_size),
        data_store_size,
        batch_size,
        n_envs=16,
        log_frequency=5,
    )
def main():
    env = env_fn()
    cpu_count = mp.cpu_count()
    # cpu_count = 0
    num_envs = 8
    num_cpus = 4
    num_targets = 1
    model_features = 512
    data_store_size = 500000
    batch_size = 512
    max_grad_norm = 0.1
    num_actions = env.action_space.n
    device = "cuda"
    num_actors = 1
    max_learn_steps = 100000
    # venv = MakeCPUAsyncConstructor(cpu_count)([env_fn]*num_envs, env.observation_space, env.action_space)
    # venv.reset()

    def model_fn():
        return FlatModel(env.observation_space.shape[0])

    save_folder = "savedata/"

    def policy_fn_dev(device):
        policy = DiversityPolicy(model_fn, model_features, num_actions, num_targets, obs_preproc, device)
        load_latest(save_folder, policy)
        return policy

    policy_fn = lambda: policy_fn_dev(device)
    priority_updater = NoUpdater()
    logger = make_logger("log")
    run_loop(
        logger,
        lambda: DiversityLearner(
            discount_factor=0.99,
            obs_preproc=obs_preproc,
            model_fn=model_fn,
            max_learn_steps=max_learn_steps,
            model_features=model_features,
            logger=logger,
            device=device,
            num_targets=num_targets,
            num_actions=num_actions,
        ),
        OccasionalUpdate(10, lambda: policy_fn_dev("cpu")),
        lambda: TargetUpdaterActor(policy_fn(), num_envs // num_actors, num_targets, target_staggering=1.314),
        env_fn,
        Saver(save_folder),
        # MakeCPUAsyncConstructor(n_cpus),
        lambda: TargetTransitionAdder(env.observation_space, env.action_space, num_targets),
        UniformSampleScheme(data_store_size),
        data_store_size,
        batch_size,
        num_cpus=num_cpus,
        num_env_ids=num_envs,
        priority_updater=priority_updater,
        log_frequency=5,
        max_learn_steps=max_learn_steps,
        act_steps_until_learn=10000,
        # num_actors=num_actors,
    )
def main():
    def env_contr():
        return gym.make("CartPole-v0")
        # env = multiwalker_v0.env()
        # env = pad_observations(env)
        # env = pad_action_space(env)
        # markov_env = aec_to_markov(env)
        # venv = MarkovVectorEnv(markov_env)
        # return venv

    n_envs = 6
    # def nest_env_const():
    #     cat = ConcatVecEnv([env_contr] * envs_per_proc)
    #     return cat
    example_env = env_contr()
    num_envs = n_envs * 1  # example_env.num_envs
    # cat = ProcConcatVec([nest_env_const]*n_procs, example_env.observation_space, example_env.action_space, num_envs)
    cat = MakeCPUAsyncConstructor(0)([env_contr] * n_envs, example_env.observation_space, example_env.action_space)  # , num_envs)
    cat = VecEnvWrapper(cat)
    env = cat
    policy = "MlpPolicy"
    logger = make_logger("log")
    stable_baselines3.common.logger.Logger.CURRENT = logger
    a2c = PPO(policy, cat, n_steps=4, batch_size=6, n_epochs=3)
    print(type(a2c.env))
    # a2c.learn(1000000)
    total_timesteps, callback = a2c._setup_learn(10000, None, None, None, n_eval_episodes=5, reset_num_timesteps=None, tb_log_name="PPO")
    # total_timesteps = 100
    iteration = 0
    log_interval = 1
    # Each iteration collects n_steps transitions per env, then runs one PPO update.
    for i in range(total_timesteps):
        continue_training = a2c.collect_rollouts(env, callback, a2c.rollout_buffer, n_rollout_steps=a2c.n_steps)
        print(a2c.ep_info_buffer)
        if continue_training is False:
            break
        iteration += 1
        a2c._update_current_progress_remaining(a2c.num_timesteps, total_timesteps)
        # Display training infos
        if log_interval is not None and iteration % log_interval == 0:
            fps = int(a2c.num_timesteps / (time.time() - a2c.start_time))
            logger.record("time/iterations", iteration, exclude="tensorboard")
            print(a2c.ep_info_buffer)
            if len(a2c.ep_info_buffer) > 0 and len(a2c.ep_info_buffer[0]) > 0:
                logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in a2c.ep_info_buffer]))
                logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in a2c.ep_info_buffer]))
            logger.record("time/fps", fps)
            logger.record("time/time_elapsed", int(time.time() - a2c.start_time), exclude="tensorboard")
            logger.record("time/total_timesteps", a2c.num_timesteps, exclude="tensorboard")
            logger.dump(step=a2c.num_timesteps)
        a2c.train()
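# The manual collect_rollouts/train loop above mirrors what `a2c.learn()` does internally
# (compare the commented-out `a2c.learn(1000000)` call); it is written out explicitly so the
# per-iteration logging can be controlled directly. Assuming each of these examples lives in
# its own script, a standard entry point would be:
if __name__ == "__main__":
    main()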