Example #1
# RLlib imports used by this snippet (Ray 1.x API). MountainCar, ParametricActionsModel
# and DEMO_DATA_DIR come from the surrounding project and are not shown here.
from ray.rllib.agents.dqn import APEX_DEFAULT_CONFIG, ApexTrainer
from ray.rllib.models import ModelCatalog


def get_apex_trainer(strategy):
    """Build an ApexTrainer for MountainCar, adjusting the config per training strategy."""
    config = APEX_DEFAULT_CONFIG.copy()
    config["env"] = MountainCar
    config["buffer_size"] = 1000000
    config["learning_starts"] = 10000
    config["target_network_update_freq"] = 50000
    config["rollout_fragment_length"] = 200
    config["timesteps_per_iteration"] = 10000
    config["num_gpus"] = 1
    config["num_workers"] = 20
    config["evaluation_num_workers"] = 10
    config["evaluation_interval"] = 1
    if strategy not in [
        "with_dueling",
        "custom_reward_n_dueling",
        "curriculum_n_dueling",
    ]:
        # Plain (non-dueling) Q-head: no extra hidden layers, dueling disabled.
        config["hiddens"] = []
        config["dueling"] = False

    if strategy == "action_masking":
        ModelCatalog.register_custom_model("pa_model", ParametricActionsModel)
        config["env_config"] = {"use_action_masking": True}
        config["model"] = {
            "custom_model": "pa_model",
        }
    elif strategy == "custom_reward" or strategy == "custom_reward_n_dueling":
        config["env_config"] = {"reward_fun": "custom_reward"}
    elif strategy in ["curriculum", "curriculum_n_dueling"]:
        config["env_config"] = {"lesson": 0}
    elif strategy == "demonstration":
        config["input"] = DEMO_DATA_DIR
        #config["input"] = {"sampler": 0.7, DEMO_DATA_DIR: 0.3}
        config["explore"] = False
        config["input_evaluation"] = []
        config["n_step"] = 1

    trainer = ApexTrainer(config=config)
    return trainer, config["env_config"]
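
A minimal usage sketch, not part of the original example, assuming ray.init() has been called on a machine with the workers and GPU the config above asks for, and that the strategy name is one of those handled in the function:

import ray

ray.init()
trainer, env_config = get_apex_trainer("custom_reward_n_dueling")
for i in range(5):
    result = trainer.train()
    print(i, result["episode_reward_mean"])
checkpoint = trainer.save()  # optional periodic checkpointing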
Example #2
config['model']["conv_filters"] = filters_84x84


config['min_iter_time_s'] = 5
config['n_step'] = 2
config['target_network_update_freq'] = 0
config['timesteps_per_iteration'] = 50000
config['train_batch_size'] = 128
config['lr'] = 0.0050

# === Evaluation ===
config['evaluation_interval'] = 50
config['evaluation_num_episodes'] = 5


agent = ApexTrainer(config, "TetrisA-v2")

reward = -999
epoch = 0

# This has a memory leak. After around 25 iterations it consumes all of its object store
# memory, then it crashes once it can no longer put items into the object store.

while reward < 200:
    result = agent.train()
    print(f'=========== RESULT {epoch} =================')
    print(result)
    epoch += 1

    reward = result['episode_reward_mean']
    if np.isnan(reward):
        # No episodes finished this iteration yet; reset the sentinel and keep training.
        reward = -999
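
The memory-leak comment above refers to Ray's shared object store filling up. A hedged mitigation sketch, an assumption rather than a verified fix for this particular leak: bound the object store when initializing Ray and keep the replay buffer small so the replay actors' memory stays bounded as well.

import ray

# Assumption: the numbers below are placeholders to tune per machine.
ray.init(object_store_memory=4 * 1024 ** 3)  # cap the shared object store at 4 GiB
config['buffer_size'] = 250000               # smaller replay buffer than Ape-X's 2M default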
Example #3
import getopt
import os
import sys

import ray
import tensorflow as tf
from ray.rllib.agents.dqn import ApexTrainer, DQNTrainer
from ray.rllib.models import ModelCatalog
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env

# MyModelClass, CartpoleServing, ALGORITHM and CHECKPOINT_FILE are defined
# elsewhere in the original project and are not shown in this listing.


def main(argv):
    ModelCatalog.register_custom_model("my_model", MyModelClass)

    model = {
        # custom model options
        "custom_model": "my_model",
        "custom_preprocessor": None,
        # Extra options to pass to the custom classes
        "custom_options": {},

        # built-in options
        # Hidden layer sizes for the fully connected net
        "fcnet_hiddens": [256, 256, 256, 256],
    }

    num_workers = 2

    # parse command-line arguments
    try:
        opts, args = getopt.getopt(argv, "hn:", ["number-worker="])
    except getopt.GetoptError:
        print('ray_server.py -n <number-worker>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('ray_server.py -n <number-worker>')
            print('-n --number-worker  - number of workers to start')
            sys.exit()
        elif opt in ("-n", "--number-worker"):
            num_workers = int(arg)

    ray.init()
    print("[RAY] Initialized")
    register_env("srv", lambda _: CartpoleServing())

    if ALGORITHM == "APEX":
        dqn = ApexTrainer(
            env="srv",
            config={
                # model
                "model": model,
                "gamma": 0.99,
                "noisy": False,
                "num_gpus": 1,

                # evaluation
                # everything default, see dqn.py

                # exploration
                "target_network_update_freq": 500000,
                # rest: everything default, see dqn.py

                # replay buffer
                # Size of the replay buffer. Note that if async_updates is set, then
                # each worker will have a replay buffer of this size. default 50000
                "buffer_size": 2000000,
                # If True prioritized replay buffer will be used.
                "prioritized_replay": True,
                # many more parameters here, left at their defaults (see dqn.py)

                # Optimization
                # Learning rate - defaults to 5e-4
                "lr": 0.0001,
                # Size of rollout batch
                # Default sample batch size (unroll length). Batches of this size are
                # collected from workers until train_batch_size is met. When using
                # multiple envs per worker, this is multiplied by num_envs_per_worker.
                "sample_batch_size": 4,
                # Training batch size, if applicable. Should be >= sample_batch_size.
                # Sample batches will be concatenated together to this size for training.
                "train_batch_size": 64,
                # How many steps of the model to sample before learning starts
                "learning_starts": 50000,

                # parallelism
                "num_workers": num_workers,
                # distribute epsilon over workers (default for apex)
                "per_worker_exploration": True,
                # compute priorities per worker before handing the experiences to the
                # shared replay memory
                "worker_side_prioritization": True,

                # "schedule_max_timesteps": 100000, # was tut es?
                # "timesteps_per_iteration": 25000, # was tut es?
                # "min_iter_time_s": 30, # was tut es?
            })
    else:
        dqn = DQNTrainer(
            env="srv",
            config={
                # model
                # multiple threads for the workers! set to False for debugging
                # "sample_async": True,
                # "grad_clip": 0.5,
                "model": model,
                "gamma": 0.99,
                "noisy": False,
                "num_gpus": 1,

                # Whether to use dueling dqn
                "dueling": False,
                # Whether to use double dqn
                "double_q": False,

                # evaluation
                # everything default, see dqn.py

                # exploration
                "target_network_update_freq": 500000,
                # rest: everything default, see dqn.py

                # replay buffer
                # Size of the replay buffer. Note that if async_updates is set, then
                # each worker will have a replay buffer of this size. default 50000
                "buffer_size": 2000000,
                # If True prioritized replay buffer will be used.
                "prioritized_replay": False,
                # many more parameters here, left at their defaults (see dqn.py)

                # Optimization
                # Learning rate - defaults to 5e-4
                "lr": 0.0001,
                # Update the replay buffer with this many samples at once. Note that
                # this setting applies per-worker if num_workers > 1.
                #"sample_batch_size": 1024,
                # How many steps of the model to sample before learning starts
                "learning_starts": 50000,
                # Size of a batch sampled from the replay buffer for training. Note that
                # if async_updates is set, then each worker returns gradients for a
                # batch of this size. (Minibatch size) Should be >= sample_batch_size.
                # Sample batches will be concatenated together to this size for training.
                "train_batch_size": 2048,

                # parallelism
                # Number of workers for collecting samples with. This only makes sense
                # to increase if your environment is particularly slow to sample, or if
                # you're using the Async or Ape-X optimizers.
                "num_workers": num_workers,
                # distribute epsilon over workers
                "per_worker_exploration": True,
                # compute worker-side prioritization (False, because this was not implemented for plain DQN)
                "worker_side_prioritization": False,
            })

    # write policy graph to tensorboard (for debugging purposes)
    policy_graph = dqn.local_evaluator.policy_map["default_policy"].sess.graph
    writer = tf.summary.FileWriter(dqn._result_logger.logdir, policy_graph)
    writer.close()

    # Attempt to restore from checkpoint if possible.
    if os.path.exists(CHECKPOINT_FILE):
        with open(CHECKPOINT_FILE) as f:
            checkpoint_path = f.read()
        print("Restoring from checkpoint path", checkpoint_path)
        dqn.restore(checkpoint_path)

    # Serving and training loop
    while True:
        print(pretty_print(dqn.train()))
        checkpoint_path = dqn.save()
        print("Last checkpoint", checkpoint_path)
        with open(CHECKPOINT_FILE, "w") as f:
            f.write(checkpoint_path)
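
A client-side sketch for the CartpoleServing setup above; this is an assumption, not part of the original script. RLlib's PolicyClient can drive the served policy over HTTP; the import path and the address depend on the Ray version and on how CartpoleServing binds its policy server (ray.rllib.utils.policy_client in the older releases this script targets, ray.rllib.env.policy_client later).

import gym
from ray.rllib.utils.policy_client import PolicyClient

# Assumption: the server listens on localhost:9900, as in Ray's classic
# CartPole serving example.
env = gym.make("CartPole-v0")
client = PolicyClient("http://localhost:9900")

obs = env.reset()
episode_id = client.start_episode(training_enabled=True)
done = False
while not done:
    action = client.get_action(episode_id, obs)
    obs, reward, done, _ = env.step(action)
    client.log_returns(episode_id, reward)
client.end_episode(episode_id, obs)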
Example #4
File: apex.py  Project: tchordia/ray
 def execution_plan(
     workers: WorkerSet, config: dict, **kwargs
 ) -> LocalIterator[dict]:
     """Use APEX-DQN's execution plan."""
     return ApexTrainer.execution_plan(workers, config, **kwargs)
Example #5
File: apex.py  Project: tchordia/ray
 def training_iteration(self) -> ResultDict:
     """Use APEX-DQN's training iteration function."""
     return ApexTrainer.training_iteration(self)
Example #6
File: apex.py  Project: tchordia/ray
 def setup(self, config: PartialTrainerConfigDict):
     return ApexTrainer.setup(self, config)
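
Examples #4-#6 show the same pattern: a trainer defined elsewhere in apex.py delegates its setup, execution plan, and training iteration straight to ApexTrainer (Ray's own APEX-DDPG reuses APEX-DQN's execution plan in the same way). A minimal sketch of that delegation pattern; the class name ApexLikeTrainer is hypothetical, not taken from tchordia/ray.

from ray.rllib.agents.ddpg import DDPGTrainer
from ray.rllib.agents.dqn import ApexTrainer


class ApexLikeTrainer(DDPGTrainer):
    """Hypothetical trainer that borrows APEX-DQN's distributed machinery."""

    def setup(self, config):
        # Reuse APEX-DQN's setup (replay actors, rollout workers, etc.).
        return ApexTrainer.setup(self, config)

    def training_iteration(self):
        # Reuse APEX-DQN's asynchronous sampling/learning step unchanged.
        return ApexTrainer.training_iteration(self)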