Example #1
def main():
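    # DQN with an FCPolicy on CUDA: uniform replay of 12800 transitions,
    # batch size 64, 8 env ids, actor weights refreshed from a CPU policy
    # copy via OccasionalUpdate(10, ...), learning begins after 8000 act steps.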
    env = env_fn()
    print(env.observation_space)
    obs_size, = env.observation_space.shape
    act_size = env.action_space.n
    device = "cuda"
    policy_fn = lambda device: lambda: FCPolicy(obs_size, act_size, 512, device)
    data_store_size = 12800
    batch_size = 64
    n_envs = 8
    n_cpus = 0
    logger = make_logger("log")
    save_folder = "basic_test_save"

    run_loop(
        logger,
        lambda: DQNLearner(policy_fn("cuda"), 0.001, 0.99, logger, device),
        OccasionalUpdate(10, policy_fn("cpu")),
        lambda: StatelessActor(policy_fn("cuda")()),
        env_fn,
        Saver(save_folder),
        lambda: TransitionAdder(env.observation_space, env.action_space),
        UniformSampleScheme(data_store_size),
        data_store_size,
        batch_size,
        num_env_ids=n_envs,
        log_frequency=5,
        num_cpus=n_cpus,
        act_steps_until_learn=8000)
Example #2
def main():
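    # SAC on CUDA: uniform replay of 10000 transitions, batch size 512,
    # 8 env ids, no priority updates (NoUpdater), actor weights refreshed
    # via OccasionalUpdate(100, ...), checkpoints written to savedata/.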
    env = env_fn()
    cpu_count = mp.cpu_count()
    # cpu_count = 0
    num_envs = 8
    num_cpus = 0
    num_targets = 1
    model_features = 512
    data_store_size = 10000
    batch_size = 512
    max_grad_norm = 0.1
    device = "cuda"
    num_actors = 1
    max_learn_steps = 100000

    save_folder = "savedata/"

    def policy_fn_dev(device):
        policy = SACPolicy(env.observation_space, env.action_space, device)
        # load_latest(save_folder, policy)
        return policy

    priority_updater = NoUpdater()
    logger = make_logger("log")
    run_loop(
        logger,
        lambda: SACLearner(policy_fn_dev(device),
                           gamma=0.99,
                           T_max=max_learn_steps,
                           logger=logger,
                           device=device),
        OccasionalUpdate(100, lambda: policy_fn_dev("cpu")),
        lambda: StatelessActor(policy_fn_dev(device)),
        env_fn,
        Saver(save_folder),
        # MakeCPUAsyncConstructor(n_cpus),
        lambda: TransitionAdder(env.observation_space, env.action_space),
        UniformSampleScheme(data_store_size),
        data_store_size,
        batch_size,
        num_cpus=num_cpus,
        num_env_ids=num_envs,
        priority_updater=priority_updater,
        log_frequency=5,
        max_learn_steps=max_learn_steps,
        # act_steps_until_learn=10000,
        # num_actors=num_actors,
    )
Example #3
def main():
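    # Stable-Baselines3 TD3 driven through run_loop: the SB3 model is wrapped
    # by SB3LearnWrapper / SB3Wrapper, envs come from MakeCPUAsyncConstructor(4),
    # uniform replay of 12800 transitions with batch size 512.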
    n_envs = 8
    env_id = "CartPole-v0"
    # def env_fn():
    #     return continuous_actions(gym.make(env_id))
    env = env_fn()
    #print(env.observation_space)
    #obs_size, = env.observation_space.shape
    #act_size = env.action_space.n

    sb3_env = SpaceWrap(env)

    # print(sb3_env.action_space)
    # exit(0)
    n_timesteps = 1000
    save_path = "log"
    eval_freq = 50

    tensorboard_log = ""
    sb3_learner_fn = lambda device: TD3(env=sb3_env,
                                        tensorboard_log=tensorboard_log,
                                        policy=MlpPolicy,
                                        device=device)
    learner_fn = lambda: SB3LearnWrapper(sb3_learner_fn("cuda"))

    policy_fn = lambda: SB3Wrapper(sb3_learner_fn("cuda").policy)
    example_policy_fn = lambda: SB3Wrapper(sb3_learner_fn("cpu").policy)
    #learner = (model)
    learn_rate = lambda x: 0.01
    #policy = SB3Wrapper(model.policy)#MlpPolicy(env.observation_space, env.action_space, learn_rate, device="cpu"))
    data_store_size = 12800
    batch_size = 512
    logger = make_logger("log")
    run_loop(
        logger,
        learner_fn,  #A2CLearner(policy, 0.001, 0.99, logger, device),
        OccasionalUpdate(10, example_policy_fn()),
        lambda: StatelessActor(policy_fn()),
        env_fn,
        MakeCPUAsyncConstructor(4),
        lambda: TransitionAdder(env.observation_space, env.action_space),
        UniformSampleScheme(data_store_size),
        data_store_size,
        batch_size,
        n_envs=16,
        log_frequency=5)
Example #4
def main():
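    # DQN on CPU: the learner and actor use the same DQNPolicy instance, so
    # NoUpdate() stands in for a weight-broadcast scheme; uniform replay of
    # 10000 transitions, batch size 64, learning begins after 1000 act steps
    # with steps_per_update=1.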
    env = env_fn()
    cpu_count = mp.cpu_count()
    # cpu_count = 0
    num_envs = 1
    num_cpus = 0
    num_targets = 1
    data_store_size = 10000
    batch_size = 64
    device="cpu"
    num_actors = 1
    max_learn_steps = 40000

    save_folder = "savedata/"
    def policy_fn_dev(device):
        policy = DQNPolicy(env, logger, device)
        # load_latest(save_folder, policy)
        return policy

    priority_updater = NoUpdater()
    logger = make_logger("log")
    policy = policy_fn_dev(device)
    run_loop(
        logger,
        lambda: DQNLearner(policy, logger, env.action_space.n, device=device),
        NoUpdate(),
        # OccasionalUpdate(100, lambda: policy_fn_dev("cpu")),
        lambda: StatelessActor(policy),
        env_fn,
        Saver(save_folder),
        # MakeCPUAsyncConstructor(n_cpus),
        lambda: TransitionAdder(env.observation_space, env.action_space),
        UniformSampleScheme(data_store_size),
        data_store_size,
        batch_size,
        act_steps_until_learn=1000,
        steps_per_update=1,
        num_cpus=num_cpus,
        num_env_ids=num_envs,
        priority_updater=priority_updater,
        log_frequency=5,
        max_learn_steps=max_learn_steps,
        # act_steps_until_learn=10000,
        # num_actors=num_actors,
    )
Example #5
def main():
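    # Stable-Baselines3 TD3 wrapped for run_loop: the actor wraps the same
    # model.policy the learner trains, so NoUpdate() is used; in-process
    # ConcatVecEnv, uniform replay of 12800 transitions, batch size 16.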
    n_envs = 8
    env_id = "CartPole-v0"
    # def env_fn():
    #     return continuous_actions(gym.make(env_id))
    env = env_fn()
    #print(env.observation_space)
    #obs_size, = env.observation_space.shape
    #act_size = env.action_space.n

    sb3_env = SpaceWrap(env)

    # print(sb3_env.action_space)
    # exit(0)
    n_timesteps = 1000
    save_path = "log"
    eval_freq = 50

    tensorboard_log = ""

    model = TD3(env=sb3_env, tensorboard_log=tensorboard_log, policy=MlpPolicy)
    learner = SB3LearnWrapper(model)
    device = "cpu"
    learn_rate = lambda x: 0.01
    policy = SB3Wrapper(model.policy)  # MlpPolicy(env.observation_space, env.action_space, learn_rate, device="cpu")
    data_store_size = 12800
    batch_size = 16
    logger = make_logger("log")
    run_loop(
        logger,
        lambda: learner,  #A2CLearner(policy, 0.001, 0.99, logger, device),
        NoUpdate(),  #.10, policy),
        lambda: StatelessActor(policy),
        env_fn,
        ConcatVecEnv,
        lambda: TransitionAdder(env.observation_space, env.action_space),
        UniformSampleScheme(data_store_size),
        data_store_size,
        batch_size,
        n_envs=16,
        log_frequency=5)
Example #6
def main():
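    # DQN with an FCPolicy on CUDA using SyncVectorEnv and a density-based
    # sample scheme over a 128000-transition store, batch size 64, 32 envs,
    # actor weights refreshed from a CPU policy via OccasionalUpdate(10, ...).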
    env = env_fn()
    print(env.observation_space)
    obs_size, = env.observation_space.shape
    act_size = env.action_space.n
    device = "cuda"
    policy_fn = lambda: FCPolicy(obs_size, act_size, 64, device)
    data_store_size = 128000
    batch_size = 64
    logger = make_logger("log")
    run_loop(logger,
             lambda: DQNLearner(policy_fn(), 0.001, 0.99, logger, device),
             OccasionalUpdate(10, FCPolicy(obs_size, act_size, 64, "cpu")),
             lambda: StatelessActor(policy_fn()),
             env_fn,
             SyncVectorEnv,
             lambda: TransitionAdder(env.observation_space, env.action_space),
             DensitySampleScheme(data_store_size),
             data_store_size,
             batch_size,
             n_envs=32,
             log_frequency=5)
Example #7
def main():
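    # A single Agent serves as both learner (is_learner=True, cuda:0) and
    # actor; a PriorityUpdater is wired in while sampling stays uniform over
    # a 500000-transition buffer, batch size 256, 32 env ids on 32 CPUs,
    # learning begins after 200000 act steps, checkpoints under savedata/.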

    save_folder = "savedata/"
    def policy_fn_dev(device, is_learner=False):
        device = torch.device(device)
        policy = Agent(device, args, env, logger, priority_updater,
                       is_learner=is_learner)
        load_latest(save_folder, policy)
        return policy
    data_store_size = 500000
    batch_size = 256
    args.batch_size = batch_size
    n_envs = 32
    n_cpus = 32
    priority_updater = PriorityUpdater()
    logger = make_logger("log")
    print("cpu create")

    print("cpu finish create")
    run_loop(
        logger,
        lambda: policy_fn_dev("cuda:0",is_learner=True),#DDPGLearner(policy_fn, reward_normalizer_fn, 0.001, 0.99, 0.1, logger, priority_updater, device),
        OccasionalUpdate(100, lambda: policy_fn_dev("cpu")),
        lambda: StatelessActor(policy_fn_dev("cuda:0")),
        env_fn,
        Saver(save_folder),
        #MakeCPUAsyncConstructor(n_cpus),
        lambda: TransitionAdder(env.observation_space, env.action_space),
        UniformSampleScheme(data_store_size),#, alpha=0.5, beta_fn=lambda x:0.),
        data_store_size,
        batch_size,
        act_steps_until_learn=200000,
        num_env_ids=n_envs,
        num_cpus=n_cpus,
        priority_updater=priority_updater,
        log_frequency=5.,
        max_learn_steps=10000000,
    )
    print("loopterm")
Example #8
def main():
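    # Minimal DQN on CPU: the learner, OccasionalUpdate, and actor are given
    # the same constructed FCPolicy object rather than factories, with
    # ConcatVecEnv, a density sample scheme over 12800 transitions, and
    # batch size 16.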
    env = env_fn()
    print(env.observation_space)
    obs_size, = env.observation_space.shape
    act_size = env.action_space.n
    device = "cpu"
    policy = FCPolicy(obs_size, act_size, 64, device)
    data_store_size = 12800
    batch_size = 16
    logger = make_logger("log")
    run_loop(
        logger,
        DQNLearner(policy, 0.001, 0.99, logger, device),
        OccasionalUpdate(10, policy),
        StatelessActor(policy),
        env_fn,
        ConcatVecEnv,
        lambda: TransitionAdder(env.observation_space, env.action_space),
        DensitySampleScheme(data_store_size),
        data_store_size,
        batch_size,
        n_envs=16,
        log_frequency=5
    )
Example #9
def main():
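    # Diversity setup: DiversityLearner and DiversityPolicy built around a
    # FlatModel, with TargetUpdaterActor / TargetTransitionAdder handling the
    # target dimension; uniform replay of 500000 transitions, batch size 512,
    # 8 env ids on 4 CPUs, learning begins after 10000 act steps.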
    env = env_fn()
    cpu_count = mp.cpu_count()
    # cpu_count = 0
    num_envs = 8
    num_cpus = 4
    num_targets = 1
    model_features = 512
    data_store_size = 500000
    batch_size = 512
    max_grad_norm = 0.1
    num_actions = env.action_space.n
    device = "cuda"
    num_actors = 1
    max_learn_steps = 100000

    # venv = MakeCPUAsyncConstructor(cpu_count)([env_fn]*num_envs, env.observation_space, env.action_space)
    # venv.reset()
    def model_fn():
        return FlatModel(env.observation_space.shape[0])

    save_folder = "savedata/"

    def policy_fn_dev(device):
        policy = DiversityPolicy(model_fn, model_features, num_actions,
                                 num_targets, obs_preproc, device)
        load_latest(save_folder, policy)
        return policy

    policy_fn = lambda: policy_fn_dev(device)
    priority_updater = NoUpdater()
    logger = make_logger("log")
    run_loop(
        logger,
        lambda: DiversityLearner(discount_factor=0.99,
                                 obs_preproc=obs_preproc,
                                 model_fn=model_fn,
                                 max_learn_steps=max_learn_steps,
                                 model_features=model_features,
                                 logger=logger,
                                 device=device,
                                 num_targets=num_targets,
                                 num_actions=num_actions),
        OccasionalUpdate(10, lambda: policy_fn_dev("cpu")),
        lambda: TargetUpdaterActor(policy_fn(),
                                   num_envs // num_actors,
                                   num_targets,
                                   target_staggering=1.314),
        env_fn,
        Saver(save_folder),
        # MakeCPUAsyncConstructor(n_cpus),
        lambda: TargetTransitionAdder(env.observation_space, env.action_space,
                                      num_targets),
        UniformSampleScheme(data_store_size),
        data_store_size,
        batch_size,
        num_cpus=num_cpus,
        num_env_ids=num_envs,
        priority_updater=priority_updater,
        log_frequency=5,
        max_learn_steps=max_learn_steps,
        act_steps_until_learn=10000,
        # num_actors=num_actors,
    )
Example #10
def main():
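    # Hand-rolled Stable-Baselines3 PPO loop on CartPole: the vector env is
    # built with MakeCPUAsyncConstructor(0), and collect_rollouts()/train()
    # plus the episode-stat logging normally done inside learn() are driven
    # manually.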
    def env_contr():
        return gym.make("CartPole-v0")  #
        # env = multiwalker_v0.env()
        # env = pad_observations(env)
        # env = pad_action_space(env)
        # markov_env = aec_to_markov(env)
        # venv = MarkovVectorEnv(markov_env)
        # return venv

    n_envs = 6
    # def nest_env_const():
    #     cat = ConcatVecEnv([env_contr]*envs_per_proc)
    #     return cat
    example_env = env_contr()
    num_envs = n_envs * 1  #example_env.num_envs
    #cat = ProcConcatVec([nest_env_const]*n_procs,example_env.observation_space, example_env.action_space, num_envs)
    cat = MakeCPUAsyncConstructor(0)([env_contr] * n_envs,
                                     example_env.observation_space,
                                     example_env.action_space)  #, num_envs)
    cat = VecEnvWrapper(cat)
    env = cat
    policy = "MlpPolicy"
    logger = make_logger("log")
    stable_baselines3.common.logger.Logger.CURRENT = logger
    a2c = PPO(policy, cat, n_steps=4, batch_size=6, n_epochs=3)
    print(type(a2c.env))
    #a2c.learn(1000000)

    total_timesteps, callback = a2c._setup_learn(10000,
                                                 None,
                                                 None,
                                                 None,
                                                 n_eval_episodes=5,
                                                 reset_num_timesteps=None,
                                                 tb_log_name="PPo")

    #total_timesteps = 100
    iteration = 0
    log_interval = 1
    for i in range(total_timesteps):
        continue_training = a2c.collect_rollouts(env,
                                                 callback,
                                                 a2c.rollout_buffer,
                                                 n_rollout_steps=a2c.n_steps)
        print(a2c.ep_info_buffer)
        if continue_training is False:
            break

        iteration += 1
        a2c._update_current_progress_remaining(a2c.num_timesteps,
                                               total_timesteps)

        # Display training infos
        if log_interval is not None and iteration % log_interval == 0:
            fps = int(a2c.num_timesteps / (time.time() - a2c.start_time))
            logger.record("time/iterations", iteration, exclude="tensorboard")
            print(a2c.ep_info_buffer)
            if len(a2c.ep_info_buffer) > 0 and len(a2c.ep_info_buffer[0]) > 0:
                logger.record(
                    "rollout/ep_rew_mean",
                    safe_mean([ep_info["r"]
                               for ep_info in a2c.ep_info_buffer]))
                logger.record(
                    "rollout/ep_len_mean",
                    safe_mean([ep_info["l"]
                               for ep_info in a2c.ep_info_buffer]))
            logger.record("time/fps", fps)
            logger.record("time/time_elapsed",
                          int(time.time() - a2c.start_time),
                          exclude="tensorboard")
            logger.record("time/total_timesteps",
                          a2c.num_timesteps,
                          exclude="tensorboard")
            logger.dump(step=a2c.num_timesteps)

        a2c.train()