Example #1
def run_experiment(config, trainable):
  """
  Run a single tune experiment in parallel as a "remote" function.

  :param config: The experiment configuration
  :type config: dict
  :param trainable: tune.Trainable class with your experiment
  :type trainable: :class:`ray.tune.Trainable`
  """
  # Stop criteria. Default to total number of iterations/epochs
  stop_criteria = {
    "training_iteration": config.get("iterations")
  }
  stop_criteria.update(config.get("stop", {}))

  tune.run(
    trainable,
    name=config["name"],
    local_dir=config["path"],
    stop=stop_criteria,
    config=config,
    num_samples=config.get("repetitions", 1),
    search_alg=config.get("search_alg", None),
    scheduler=config.get("scheduler",
                         AsyncHyperBandScheduler(
                           reward_attr='mean_accuracy',
                           time_attr="training_iteration",
                            brackets=2,
                           grace_period=max(1, int(config.get("iterations", 10)/10)),
                           reduction_factor=3,
                           max_t=config.get("iterations", 10)
                         )),
    trial_name_creator=tune.function(trial_name_string),
    trial_executor=config.get("trial_executor", None),
    checkpoint_at_end=config.get("checkpoint_at_end", False),
    checkpoint_freq=config.get("checkpoint_freq", 0),
    upload_dir=config.get("upload_dir", None),
    sync_function=config.get("sync_function", None),
    resume=config.get("resume", False),
    reuse_actors=config.get("reuse_actors", False),
    verbose=config.get("verbose", 0),
    resources_per_trial={
      "cpu": config.get("cpu_percentage", 1.0), 
      "gpu": config.get("gpu_percentage", 1.0),
    },
    # # added parameters to allow monitoring through REST API
    # with_server=True, 
    # server_port=4321,  
  )
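
A minimal usage sketch (an assumption, not part of the original source): calling run_experiment with a hypothetical tune.Trainable subclass and the config keys the helper reads ("name", "path", "iterations", "repetitions").

# Hypothetical usage of run_experiment() above; MyTrainable and the config
# values are made up for illustration.
from ray import tune

class MyTrainable(tune.Trainable):
    def _train(self):
        # Report the metric the default AsyncHyperBandScheduler tracks.
        return {"mean_accuracy": 0.5}

config = {
    "name": "my_experiment",
    "path": "~/ray_results",   # local_dir
    "iterations": 50,          # becomes the "training_iteration" stop criterion
    "repetitions": 3,          # num_samples
    "checkpoint_freq": 10,
}
run_experiment(config, MyTrainable)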
Example #2
def run_experiment(config, trainable):
  """
  Run a single tune experiment in parallel as a "remote" function.

  :param config: The experiment configuration
  :type config: dict
  :param trainable: tune.Trainable class with your experiment
  :type trainable: :class:`ray.tune.Trainable`
  """
  # Stop criteria. Default to total number of iterations/epochs
  stop_criteria = {
    "training_iteration": config.get("iterations")
  }
  stop_criteria.update(config.get("stop", {}))

  tune.run(
    trainable,
    name=config["name"],
    local_dir=config["path"],
    stop=stop_criteria,
    config=config,
    num_samples=config.get("repetitions", 1),
    search_alg=config.get("search_alg", None),
    scheduler=config.get("scheduler",
                         MedianStoppingRule(
                           time_attr="training_iteration",
                           reward_attr='noise_accuracy',
                           min_samples_required=3,
                           grace_period=20,
                           verbose=False,
                         )),
    trial_name_creator=tune.function(trial_name_string),
    trial_executor=config.get("trial_executor", None),
    checkpoint_at_end=config.get("checkpoint_at_end", False),
    checkpoint_freq=config.get("checkpoint_freq", 0),
    upload_dir=config.get("upload_dir", None),
    sync_function=config.get("sync_function", None),
    resume=config.get("resume", False),
    reuse_actors=config.get("reuse_actors", False),
    verbose=config.get("verbose", 0),
    resources_per_trial={
      # With lots of trials, optimal seems to be 0.5, or 2 trials per GPU
      # If num trials <= num GPUs, 1.0 is better
      "cpu": 1, "gpu": config.get("gpu_percentage", 0.5),
    }
  )
Example #3
    def testGetTrialsWithFunction(self):
        runner, client = self.basicSetup()
        test_trial = Trial(
            "__fake",
            trial_id="function_trial",
            stopping_criterion={"training_iteration": 3},
            config={
                "callbacks": {
                    "on_episode_start": tune.function(lambda x: None)
                }
            })
        runner.add_trial(test_trial)

        for i in range(3):
            runner.step()
        all_trials = client.get_all_trials()["trials"]
        self.assertEqual(len(all_trials), 3)
        client.get_trial("function_trial")
        runner.step()
        self.assertEqual(len(all_trials), 3)
Example #4
        path = os.path.join(checkpoint_dir, "checkpoint")
        with open(path, "w") as f:
            f.write(json.dumps({"timestep": self.timestep}))
        return path

    def _restore(self, checkpoint_path):
        with open(checkpoint_path) as f:
            self.timestep = json.loads(f.read())["timestep"]


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--smoke-test", action="store_true", help="Finish quickly for testing")
    args, _ = parser.parse_known_args()
    ray.init()
    exp = Experiment(
        name="hyperband_test",
        run=MyTrainableClass,
        num_samples=1,
        trial_name_creator=tune.function(trial_str_creator),
        loggers=[TestLogger],
        stop={"training_iteration": 1 if args.smoke_test else 99999},
        config={
            "width": tune.sample_from(
                lambda spec: 10 + int(90 * random.random())),
            "height": tune.sample_from(lambda spec: int(100 * random.random()))
        })

    trials = run_experiments(exp)
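
For context, a minimal sketch (an assumption, not the original class) of what a Trainable such as MyTrainableClass could look like under the old class-based API, keeping only a timestep counter so that the _save/_restore methods above have something to serialize.

import random
from ray import tune

class MyTrainableClass(tune.Trainable):
    def _setup(self, config):
        self.timestep = 0

    def _train(self):
        self.timestep += 1
        # Report a dummy metric so schedulers and loggers have something to track.
        return {"episode_reward_mean": random.random(), "timestep": self.timestep}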
Example #5
                "num_gpus": 0.2 if args.gpu else 0,
                "num_workers": args.num_workers,
                "sgd_minibatch_size": 100 if args.fast else 1000,
                "sample_batch_size": 200 if args.fast else 5000,
                "train_batch_size": 1000 if args.fast else 15000,
                "batch_mode": "complete_episodes",
                "observation_filter": "NoFilter",
                "num_envs_per_worker": 8,
                "model": {
                    "custom_model": "mask",
                    "fcnet_hiddens": [512, 512],
                },
                "vf_share_layers": True,
                "entropy_coeff": 0.01,
                "callbacks": {
                    "on_episode_end": tune.function(on_episode_end),
                },
                "env_config": {
                    "zero_obs": False,
                    "dump_dir": args.dump_dir,
                    "partition_mode": args.partition_mode,
                    "reward_shape": args.reward_shape,
                    "max_depth": 100 if args.fast else 500,
                    "max_actions": 1000 if args.fast else 15000,
                    "depth_weight": args.depth_weight,
                    "rules": grid_search(args.rules),
                },
            },
        },
    })
Example #6
    def _save(self, checkpoint_dir):
        path = os.path.join(checkpoint_dir, "checkpoint")
        with open(path, "w") as f:
            f.write(json.dumps({"timestep": self.timestep}))
        return path

    def _restore(self, checkpoint_path):
        with open(checkpoint_path) as f:
            self.timestep = json.loads(f.read())["timestep"]


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--smoke-test",
                        action="store_true",
                        help="Finish quickly for testing")
    args, _ = parser.parse_known_args()

    trials = run(
        MyTrainableClass,
        name="hyperband_test",
        num_samples=5,
        trial_name_creator=tune.function(trial_str_creator),
        loggers=[TestLogger],
        stop={"training_iteration": 1 if args.smoke_test else 99999},
        config={
            "width":
            tune.sample_from(lambda spec: 10 + int(90 * random.random())),
            "height": tune.sample_from(lambda spec: int(100 * random.random()))
        })
Example #7

eval_config = {
    "evaluation_interval": 1,  # I think this means every x training_iterations
    "evaluation_config": {
        "explore": False,
        "exploration_fraction": 0,
        "exploration_final_eps": 0,
        "evaluation_num_episodes": 10,
        "horizon": 100,
        "env_config": {
            "dummy_eval": True,  # hack Used to check if we are in evaluation mode or training mode inside Ray callback on_episode_end() to be able to write eval stats
            "transition_noise": 0
            if "state_space_type" in env_config["env_config"]
            and env_config["env_config"]["state_space_type"] == "discrete"
            else tune.function(lambda a: a.normal(0, 0)),
            "reward_noise": tune.function(lambda a: a.normal(0, 0)),
            "action_loss_weight": 0.0,
        },
    },
}
value_tuples = []
for config_type, config_dict in var_configs.items():
    for key in config_dict:
        assert (
            isinstance(var_configs[config_type][key], list)
        ), "var_config should be a dict of dicts with lists as the leaf values to allow each configuration option to take multiple possible values"
        value_tuples.append(var_configs[config_type][key])


cartesian_product_configs = list(itertools.product(*value_tuples))
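
To make the Cartesian-product expansion above concrete, here is a toy sketch with a made-up var_configs (the real one is defined elsewhere in the script); each leaf list contributes one axis to the product.

import itertools

# Hypothetical var_configs: a dict of dicts with lists as the leaf values.
var_configs = {
    "agent": {"lr": [1e-3, 1e-4], "gamma": [0.9, 0.99]},
    "env": {"transition_noise": [0.0, 0.1]},
}

value_tuples = []
for config_type, config_dict in var_configs.items():
    for key in config_dict:
        value_tuples.append(var_configs[config_type][key])

cartesian_product_configs = list(itertools.product(*value_tuples))
print(len(cartesian_product_configs))  # 2 * 2 * 2 = 8 combinations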
Example #8
    },
    "model": {
        "fcnet_hiddens": [layer_width for i in range(num_layers)],
        # "custom_preprocessor": "ohe",
        "custom_options": {},  # extra options to pass to your preprocessor
        "fcnet_activation": "tanh",
        "use_lstm": False,
        "max_seq_len": 20,
        "lstm_cell_size": 256,
        "lstm_use_prev_action_reward": False,
    },
    "callbacks": {
        # "on_episode_start": tune.function(on_episode_start),
        # "on_episode_step": tune.function(on_episode_step),
        "on_episode_end": tune.function(on_episode_end),
        # "on_sample_end": tune.function(on_sample_end),
        "on_train_result": tune.function(on_train_result),
        # "on_postprocess_traj": tune.function(on_postprocess_traj),
    },
    "evaluation_interval": 1,  # I think this means every x training_iterations
    "evaluation_config": {
        # 'seed': 0,  # seed
        "exploration_fraction": 0,
        "exploration_final_eps": 0,
        "batch_mode": "complete_episodes",
        'horizon': 100,
        "env_config": {
            "dummy_eval": True,  # hack
        },
    },
Example #9
def setup_exps_rllib(flow_params,
                     n_cpus,
                     n_rollouts,
                     policy_graphs=None,
                     policy_mapping_fn=None,
                     policies_to_train=None,
                     flags=None):
    from ray import tune
    from ray.tune.registry import register_env
    try:
        from ray.rllib.agents.agent import get_agent_class
    except ImportError:
        from ray.rllib.agents.registry import get_agent_class
    import torch
    horizon = flow_params['env'].horizon
    if flags.algorithm.lower() == "ppo":
        alg_run = "PPO"
        agent_cls = get_agent_class(alg_run)
        config = deepcopy(agent_cls._default_config)
        config["num_workers"] = n_cpus
        config["horizon"] = horizon

        config['sgd_minibatch_size'] = 64
        config["clip_param"] = 0.2
        #Exploration
        config['exploration_config']["type"] = "GaussianNoise"
        config['exploration_config']["initial_scale"] = 1.0
        config['exploration_config']["final_scale"] = 0.02
        config['exploration_config']["scale_timesteps"] = 1000000
        config['exploration_config']["random_timesteps"] = 1000
        config['exploration_config']["stddev"] = 0.1

    #common config
    config['framework'] = 'torch'
    config['callbacks'] = {
        "on_episode_end": None,
        "on_episode_start": None,
        "on_episode_step": None,
        "on_postprocess_traj": None,
        "on_sample_end": None,
        "on_train_result": None
    }
    # config["opt_type"]= "adam" for impala and APPO, default is SGD
    # TrainOneStep class call SGD -->execution_plan function can have policy update function
    print("cuda is available: ", torch.cuda.is_available())
    print('Beginning training.')
    print("==========================================")
    print("running algorithm: ", alg_run)  # "Framework: ", "torch"

    # save the flow params for replay
    flow_json = json.dumps(flow_params,
                           cls=FlowParamsEncoder,
                           sort_keys=True,
                           indent=4)
    config['env_config']['flow_params'] = flow_json
    config['env_config']['run'] = alg_run

    # multiagent configuration
    if policy_graphs is not None:
        print("policy_graphs", policy_graphs)
        config['multiagent'].update({'policies': policy_graphs})
    if policy_mapping_fn is not None:
        config['multiagent'].update(
            {'policy_mapping_fn': tune.function(policy_mapping_fn)})
    if policies_to_train is not None:
        config['multiagent'].update({'policies_to_train': policies_to_train})

    create_env, gym_name = make_create_env(params=flow_params)

    # Register as rllib env
    register_env(gym_name, create_env)
    return alg_run, gym_name, config
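
A hedged sketch of how the returned triple is typically consumed (the exact wiring lives outside this snippet); flow_params, flags, and the cluster sizes are assumptions.

import ray
from ray import tune

alg_run, gym_name, config = setup_exps_rllib(
    flow_params, n_cpus=2, n_rollouts=1, flags=flags)
config["env"] = gym_name  # point RLlib at the env registered above

ray.init(num_cpus=3)
tune.run(
    alg_run,               # e.g. "PPO"
    config=config,         # includes env_config["flow_params"] for replay
    checkpoint_freq=20,
    stop={"training_iteration": 200},
)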
Example #10
            },
            "gamma": random.choice([0.5, 0.8, 0.9, 0.95, 0.99]),
        }
        return (PPOPolicyGraph, obs_space, act_space, config)

    # Setup PPO with an ensemble of `num_policies` different policy graphs
    policy_graphs = {
        "policy_{}".format(i): gen_policy(i)
        for i in range(args.num_policies)
    }
    policy_ids = list(policy_graphs.keys())

    run_experiments({
        "test": {
            "run": "PPO",
            "env": "multi_cartpole",
            "stop": {
                "training_iteration": args.num_iters
            },
            "config": {
                "simple_optimizer": True,
                "multiagent": {
                    "policy_graphs":
                    policy_graphs,
                    "policy_mapping_fn":
                    tune.function(lambda agent_id: random.choice(policy_ids)),
                },
            },
        }
    })
Example #11
        if iter % 500 == 0:
            trainer.save("saved_models/multi-carla/" + args.model_arch)
        pprint(results)
else:
    config = {
        "env": "dm-" + env_name,
        "log_level": "DEBUG",
        "multiagent": {
            "policy_graphs": {
                "def_policy":
                (VTracePolicyGraph, Box(0.0, 255.0,
                                        shape=(84, 84, 3)), Discrete(9), {
                                            "gamma": 0.99
                                        })
            },
            "policy_mapping_fn": tune.function(lambda agent_id: "def_policy"),
        },
        "env_config": env_actor_configs,
        "num_workers": args.num_workers,
        "num_envs_per_worker": args.envs_per_worker,
        "sample_batch_size": args.sample_bs_per_worker,
        "train_batch_size": args.train_bs
    }

    experiment_spec = tune.Experiment(
        "multi-carla/" + args.model_arch,
        "IMPALA",
        # timesteps_total is initialized to None (not 0), which causes an issue
        # stop={"timesteps_total": args.num_steps},
        stop={"timesteps_since_restore": args.num_steps},
        config=config,
Example #12
def main():

    # for the user defined module in code_dir, need to be imported in functions
    # sys.path.insert(0, code_dir)
    # import parameters
    # import basic_src.io_function as io_function
    # import workflow.whole_procedure as whole_procedure
    # from utility.eva_report_to_tables import read_accuracy_multi_reports

    loc_dir = "./ray_results"
    # tune_name = "tune_traning_para_tesia"
    # tune_name = "tune_backbone_para_tesia"
    # tune_name = "tune_backbone_largeBatchS_tesia"
    tune_name = "tune_backbone_para_tesia_v2"
    file_folders = io_function.get_file_list_by_pattern(
        os.path.join(loc_dir, tune_name), '*')
    # if len(file_folders) > 1:
    #     b_resume = True
    # else:
    #     b_resume = False

    # When trying to resume after running through all trials (some failed), Tune
    # always complains: "Trials did not complete", incomplete_trials, so don't resume.
    file_folders = io_function.get_file_list_by_pattern(
        os.path.join(loc_dir, tune_name), '*')
    if len(file_folders) > 1:
        b_resume = True
    else:
        b_resume = False
    # max_failures = 2,
    # stop = tune.function(stop_function),

    analysis = tune.run(
        training_function,
        # set gpu to 2 (so it can divide the batch size) and cpu to 24, making it run only one trial at a time.
        resources_per_trial={
            "gpu": 2,
            "cpu": 24
        },  # use three GPUs, 12 CPUs on tesia  # "cpu": 14, don't limit cpu, eval.py will not use all
        local_dir=loc_dir,
        name=tune_name,
        # fail_fast=True,     # Stopping after the first failure
        log_to_file=("stdout.log",
                     "stderr.log"),  #Redirecting stdout and stderr to files
        trial_name_creator=tune.function(trial_name_string),
        trial_dirname_creator=tune.function(trial_dir_string),
        resume=b_resume,
        config={
            "lr": tune.grid_search([0.007, 0.014, 0.021, 0.28]),  # ,0.007, 0.014, 0.028,0.056
            "iter_num": tune.grid_search([30000]),  # , 60000,90000,
            "batch_size": tune.grid_search([8, 16, 32, 48, 96]),  # 8,16,32 16, 32, 64, 128
            "backbone": tune.grid_search(backbones),
            "buffer_size": tune.grid_search([300]),  # 600
            "training_data_per": tune.grid_search([0.9]),  # , 0.8
            "data_augmentation": tune.grid_search(['blur,crop,bright,contrast,noise']),
            'data_aug_ignore_classes': tune.grid_search(['class_0'])
        }
        # config={
        #     "lr": tune.grid_search([0.014]),   # ,0.007, 0.014, 0.028,0.056
        #     "iter_num": tune.grid_search([30000]), # , 60000,90000
        #     "batch_size": tune.grid_search([8]), # 16, 32, 64, 128
        #     "backbone": tune.grid_search(backbones),
        #     "buffer_size": tune.grid_search([300]),
        #     "training_data_per": tune.grid_search([0.9]),
        #     "data_augmentation": tune.grid_search(['scale, bright, contrast, noise']),
        #     'data_aug_ignore_classes':tune.grid_search(['class_0',''])
        # }
    )

    print("Best config: ",
          analysis.get_best_config(metric="overall_miou", mode="max"))

    # Get a dataframe for analyzing trial results.
    df = analysis.results_df
    output_file = 'training_miou_ray_tune_%s.xlsx' % (
        datetime.now().strftime("%Y%m%d_%H%M%S"))
    with pd.ExcelWriter(output_file) as writer:
        df.to_excel(writer)  # , sheet_name='accuracy table'
        # set format
        # workbook = writer.book
        # format = workbook.add_format({'num_format': '#0.000'})
        # acc_talbe_sheet = writer.sheets['accuracy table']
        # acc_talbe_sheet.set_column('G:I',None,format)
        print('write trial results to %s' % output_file)
Example #13
def setup(env,
          hparams,
          algorithm,
          train_batch_size,
          num_cpus,
          num_gpus,
          num_agents,
          use_gpus_for_workers=False,
          use_gpu_for_driver=False,
          num_workers_per_device=1):

    if env == 'harvest':

        def env_creator(_):
            return HarvestEnv(num_agents=num_agents)

        single_env = HarvestEnv()
    else:

        def env_creator(_):
            return CleanupEnv(num_agents=num_agents)

        single_env = CleanupEnv()

    env_name = env + "_env"
    register_env(env_name, env_creator)

    obs_space = single_env.observation_space
    act_space = single_env.action_space

    # Each policy can have a different configuration (including custom model)
    def gen_policy():
        return (PPOPolicyGraph, obs_space, act_space, {})

    # Setup PPO with an ensemble of `num_policies` different policy graphs
    policy_graphs = {}
    for i in range(num_agents):
        policy_graphs['agent-' + str(i)] = gen_policy()

    def policy_mapping_fn(agent_id):
        return agent_id

    # register the custom model
    model_name = "conv_to_fc_net"
    ModelCatalog.register_custom_model(model_name, ConvToFCNet)

    agent_cls = get_agent_class(algorithm)
    config = agent_cls._default_config.copy()

    # information for replay
    config['env_config']['func_create'] = tune.function(env_creator)
    config['env_config']['env_name'] = env_name
    config['env_config']['run'] = algorithm

    # Calculate device configurations
    gpus_for_driver = int(use_gpu_for_driver)
    cpus_for_driver = 1 - gpus_for_driver
    if use_gpus_for_workers:
        spare_gpus = (num_gpus - gpus_for_driver)
        num_workers = int(spare_gpus * num_workers_per_device)
        num_gpus_per_worker = spare_gpus / num_workers
        num_cpus_per_worker = 0
    else:
        spare_cpus = (num_cpus - cpus_for_driver)
        num_workers = int(spare_cpus * num_workers_per_device)
        num_gpus_per_worker = 0
        num_cpus_per_worker = spare_cpus / num_workers

    # hyperparams
    config.update({
        "train_batch_size": train_batch_size,
        "horizon": 1000,
        "lr_schedule": [[0, hparams['lr_init']],
                        [20000000, hparams['lr_final']]],
        "num_workers": num_workers,
        "num_gpus": gpus_for_driver,  # The number of GPUs for the driver
        "num_cpus_for_driver": cpus_for_driver,
        "num_gpus_per_worker": num_gpus_per_worker,  # Can be a fraction
        "num_cpus_per_worker": num_cpus_per_worker,  # Can be a fraction
        "entropy_coeff": hparams['entropy_coeff'],
        "multiagent": {
            "policy_graphs": policy_graphs,
            "policy_mapping_fn": tune.function(policy_mapping_fn),
        },
        "model": {
            "custom_model": "conv_to_fc_net",
            "use_lstm": True,
            "lstm_cell_size": 128
        }
    })
    return algorithm, env_name, config
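
A quick, self-contained check of the device-allocation arithmetic above (the inputs are made up): with num_cpus=13, use_gpu_for_workers=False and one worker per device, the driver keeps 1 CPU, leaving 12 spare CPUs, so 12 workers get 1.0 CPU each; with two workers per device the same 12 CPUs are split across 24 workers at 0.5 CPU each.

def cpu_allocation(num_cpus, num_workers_per_device=1, use_gpu_for_driver=False):
    # Mirrors the CPU branch of the allocation logic in setup().
    gpus_for_driver = int(use_gpu_for_driver)
    cpus_for_driver = 1 - gpus_for_driver
    spare_cpus = num_cpus - cpus_for_driver
    num_workers = int(spare_cpus * num_workers_per_device)
    return num_workers, spare_cpus / num_workers

print(cpu_allocation(13))     # (12, 1.0)
print(cpu_allocation(13, 2))  # (24, 0.5) -> fractional CPUs per worker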
Example #14
    obs_space = test_env.observation_space
    act_space = test_env.action_space

    def gen_policy():
        return (PPOPolicyGraph, obs_space, act_space, {})

    # Setup PG with an ensemble of `num_policies` different policy graphs
    policy_graphs = {'av': gen_policy(), 'adversary': gen_policy()}

    def policy_mapping_fn(agent_id):
        return agent_id

    config.update({
        'multiagent': {
            'policy_graphs': policy_graphs,
            'policy_mapping_fn': tune.function(policy_mapping_fn)
        }
    })

    exp_tag = {
        "run": alg_run,
        "env": env_name,
        "config": {
            **config
        },
        "checkpoint_freq": 10,
        "max_failures": 999,
        "stop": {
            "training_iteration": 500
        },
        "num_samples": 1,
Example #15
    policy_graphs = {
        a_id: gen_policy()
        for a_id in env_actor_configs["actors"].keys()
    }

    run_experiments({
        "MA-PPO-SSUI3CCARLA": {
            "run": "PPO",
            "env": env_name,
            "stop": {
                "training_iteration": args.num_iters
            },
            "config": {
                "log_level": "DEBUG",
                "num_sgd_iter": 10,
                "multiagent": {
                    "policy_graphs": policy_graphs,
                    "policy_mapping_fn":
                    tune.function(lambda agent_id: agent_id),
                },
                "num_workers": args.num_workers,
                "num_envs_per_worker": args.envs_per_worker,
                "sample_batch_size": args.sample_bs_per_worker,
                "train_batch_size": args.train_bs
            },
            "checkpoint_freq": 500,
            "checkpoint_at_end": True,
        }
    })
Example #16
def on_episode_end(info):
    # print(info)
    episode = info['episode']
    # print(info)
    # trainer = info['trainer']
    base_env = info['env']
    episode.custom_metrics['ego_starting_distance'] = \
        base_env.get_unwrapped()[0].process.ego_starting_distance


num_worker_cpus = 11
tune.run(
    train,
    name='curriculum_test_1',
    trial_name_creator=tune.function(
        lambda trial: 'adaptive_2_.05delta_50target_10deadzone'),
    config={
        'num_gpus': 1,
        'num_workers': num_worker_cpus,
        'num_cpus_per_worker': 1,
        'num_gpus_per_worker': 1.0 / num_worker_cpus,
        'sample_batch_size': 200,
        'train_batch_size': int(2 * 60.0 / .05),
        'batch_mode':
        'truncate_episodes',  # 'complete_episodes',  # 'truncate_episodes',
        'timesteps_per_iteration': int(2 * 60 / .05),
        'sgd_minibatch_size': 128,
        # 'shuffle_sequences':       True,
        'num_sgd_iter': 30,
        'gamma': 0.99999,
        'lr': 0.0001,
Example #17
    def train_model_on_task(self, task, task_viz, exp_dir, use_ray,
                            use_ray_logging, grace_period, num_hp_samplings,
                            local_mode, redis_address, lca_n,
                            **training_params):
        logger.info("Training dashboard: {}".format(get_env_url(task_viz)))
        t_id = task['id']

        trainable = self.get_trainable(use_ray_logging=use_ray_logging)
        past_tasks = training_params.pop('past_tasks')
        normalize = training_params.pop('normalize')
        augment_data = training_params.pop('augment_data')

        transformations = []
        if augment_data:
            transformations.extend([
                transforms.ToPILImage(),
                transforms.RandomHorizontalFlip(),
                transforms.RandomCrop(32, 4),
                transforms.ToTensor()
            ])
        t_trans = [[] for _ in range(len(task['split_names']))]
        t_trans[0] = transformations
        datasets = trainable._load_datasets(task, task['loss_fn'], past_tasks,
                                            t_trans, normalize)
        train_loader, eval_loaders = get_classic_dataloaders(
            datasets, training_params.pop('batch_sizes'))
        model = self.get_model(task_id=t_id,
                               x_dim=task['x_dim'],
                               n_classes=task['n_classes'],
                               descriptor=task['descriptor'],
                               dataset=eval_loaders[:2])

        if use_ray:
            if not ray.is_initialized():
                ray.init(address=redis_address)

            scheduler = None

            training_params['loss_fn'] = tune.function(
                training_params['loss_fn'])
            training_params['optim_func'] = tune.function(self.optim_func)

            init_model_path = os.path.join(exp_dir, 'model_initializations')
            model_file_name = '{}_init.pth'.format(training_params['name'])
            model_path = os.path.join(init_model_path, model_file_name)
            torch.save(model, model_path)

            training_params['model_path'] = model_path
            config = {
                **self.get_search_space(), 'training-params': training_params
            }
            if use_ray_logging:
                stop_condition = {
                    'training_iteration': training_params['n_it_max']
                }
                checkpoint_at_end = False
                keep_checkpoints_num = 1
                checkpoint_score_attr = 'min-Val nll'
            else:
                stop_condition = None
                # loggers = [JsonLogger, MyCSVLogger]
                checkpoint_at_end = False
                keep_checkpoints_num = None
                checkpoint_score_attr = None

            trainable = rename_class(trainable, training_params['name'])
            experiment = Experiment(
                name=training_params['name'],
                run=trainable,
                stop=stop_condition,
                config=config,
                resources_per_trial=self.ray_resources,
                num_samples=num_hp_samplings,
                local_dir=exp_dir,
                loggers=(JsonLogger, CSVLogger),
                checkpoint_at_end=checkpoint_at_end,
                keep_checkpoints_num=keep_checkpoints_num,
                checkpoint_score_attr=checkpoint_score_attr)

            analysis = tune.run(
                experiment,
                scheduler=scheduler,
                verbose=1,
                raise_on_failed_trial=True,
                # max_failures=-1,
                # with_server=True,
                # server_port=4321
            )
            os.remove(model_path)
            logger.info("Training dashboard: {}".format(get_env_url(task_viz)))

            all_trials = {t.logdir: t for t in analysis.trials}
            best_logdir = analysis.get_best_logdir('Val nll', 'min')
            best_trial = all_trials[best_logdir]

            # picked_metric = 'accuracy_0'
            # metric_names = {s: '{} {}'.format(s, picked_metric) for s in
            #                 ['Train', 'Val', 'Test']}

            logger.info('Best trial: {}'.format(best_trial))
            best_res = best_trial.checkpoint.result
            best_point = (best_res['training_iteration'], best_res['Val nll'])

            # y_keys = ['mean_loss' if use_ray_logging else 'Val nll', 'train_loss']
            y_keys = ['Val nll', 'Train nll']

            epoch_key = 'training_epoch'
            it_key = 'training_iteration'
            plot_res_dataframe(analysis, training_params['name'], best_point,
                               task_viz, epoch_key, it_key, y_keys)
            if 'entropy' in next(iter(analysis.trial_dataframes.values())):
                plot_res_dataframe(analysis, training_params['name'], None,
                                   task_viz, epoch_key, it_key, ['entropy'])
            best_model = self.get_model(task_id=t_id)
            best_model.load_state_dict(torch.load(best_trial.checkpoint.value))

            train_accs = analysis.trial_dataframes[best_logdir][
                'Train accuracy_0']
            best_t = best_res['training_iteration']
            t = best_trial.last_result['training_iteration']
        else:
            search_space = self.get_search_space()
            rand_config = list(generate_variants(search_space))[0][1]
            learner_params = rand_config.pop('learner-params', {})
            optim_params = rand_config.pop('optim')

            split_optims = training_params.pop('split_optims')
            if hasattr(model, 'set_h_params'):
                model.set_h_params(**learner_params)
            if hasattr(model, 'train_loader_wrapper'):
                train_loader = model.train_loader_wrapper(train_loader)

            loss_fn = task['loss_fn']
            if hasattr(model, 'loss_wrapper'):
                loss_fn = model.loss_wrapper(task['loss_fn'])

            prepare_batch = _prepare_batch
            if hasattr(model, 'prepare_batch_wrapper'):
                prepare_batch = model.prepare_batch_wrapper(
                    prepare_batch, t_id)

            optim_fact = partial(set_optim_params,
                                 optim_func=self.optim_func,
                                 optim_params=optim_params,
                                 split_optims=split_optims)
            if hasattr(model, 'train_func'):
                f = model.train_func
                t, metrics, b_state_dict = f(train_loader=train_loader,
                                             eval_loaders=eval_loaders,
                                             optim_fact=optim_fact,
                                             loss_fn=loss_fn,
                                             split_names=task['split_names'],
                                             viz=task_viz,
                                             prepare_batch=prepare_batch,
                                             **training_params)
            else:
                optim = optim_fact(model=model)
                t, metrics, b_state_dict = train(
                    model=model,
                    train_loader=train_loader,
                    eval_loaders=eval_loaders,
                    optimizer=optim,
                    loss_fn=loss_fn,
                    split_names=task['split_names'],
                    viz=task_viz,
                    prepare_batch=prepare_batch,
                    **training_params)
            train_accs = metrics['Train accuracy_0']
            best_t = b_state_dict['iter']
            if 'training_archs' in metrics:
                plot_trajectory(model.ssn.graph, metrics['training_archs'],
                                model.ssn.stochastic_node_ids, task_viz)
                weights = model.arch_sampler().squeeze()
                archs = model.ssn.get_top_archs(weights, 5)
                list_top_archs(archs, task_viz)
                list_arch_scores(self.arch_scores[t_id], task_viz)
                update_summary(self.arch_scores[t_id], task_viz, 'scores')

        if len(train_accs) > lca_n:
            lca_accs = []
            for i in range(lca_n + 1):
                if i in train_accs:
                    lca_accs.append(train_accs[i])
                else:
                    logger.warning(
                        'Missing step for {}/{} for lca computation'.format(
                            i, lca_n))
            lca = np.mean(lca_accs)
        else:
            lca = float('nan')  # np.float was removed in newer NumPy; the builtin float behaves the same here
        stats = {}
        start = time.time()
        # train_idx = task['split_names'].index('Train')
        # train_path = task['data_path'][train_idx]
        # train_dataset = _load_datasets([train_path])[0]
        train_dataset = _load_datasets(task, 'Train')[0]
        stats.update(
            self.finish_task(train_dataset, t_id, task_viz, path='drawings'))
        stats['duration'] = {
            'iterations': t,
            'finish': time.time() - start,
            'best_iterations': best_t
        }
        stats['params'] = {
            'total': self.n_params(t_id),
            'new': self.new_params(t_id)
        }
        stats['lca'] = lca
        return stats
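
A toy illustration (with made-up accuracies) of the LCA computation at the end of the method, assuming train_accs maps training steps to accuracies and lca_n is 3: the LCA is simply the mean accuracy over steps 0..lca_n.

import numpy as np

train_accs = {0: 0.10, 1: 0.25, 2: 0.40, 3: 0.55, 4: 0.60}
lca_n = 3

if len(train_accs) > lca_n:
    lca_accs = [train_accs[i] for i in range(lca_n + 1) if i in train_accs]
    lca = np.mean(lca_accs)  # mean accuracy over the first lca_n + 1 steps
else:
    lca = float("nan")

print(lca)  # 0.325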
Example #18
def setup_exps_rllib(flow_params,
                     n_cpus,
                     n_rollouts,
                     policy_graphs=None,
                     policy_mapping_fn=None,
                     policies_to_train=None,
                     flags=None):
    from ray import tune
    from ray.tune.registry import register_env
    try:
        from ray.rllib.agents.agent import get_agent_class
    except ImportError:
        from ray.rllib.agents.registry import get_agent_class
    import torch
    horizon = flow_params['env'].horizon
    if flags.algorithm.lower() == "ppo":
        alg_run = "PPO"
        agent_cls = get_agent_class(alg_run)
        config = deepcopy(agent_cls._default_config)
        config["num_workers"] = n_cpus
        #config['num_gpus'] = 0.25
        config["horizon"] = horizon
        #config["train_batch_size"] = horizon * n_rollouts
        
        if flags.exp_config == 'singleagent_ring':
            config["gamma"] = 0.99  # discount rate
            config["use_gae"] = True  # truncated
            config["lambda"] = 0.99  # truncated value
            config["kl_target"] = 0.02  # d_target
            config["num_sgd_iter"] = 15
            config["sgd_minibatch_size"] = 1024
            config['lr'] = 5e-7
            config["clip_param"] = 0.2
            config["rollout_fragment_length"] = 3000
            config['train_batch_size'] = 3000


        elif flags.exp_config == 'singleagent_figure_eight':
            config["gamma"] = 0.99  # discount rate
            config["use_gae"] = True  # truncated
            config["lambda"] = 1.0  # truncated value 0.97
            config["kl_target"] = 0.01  # d_target 0.02
            config["num_sgd_iter"] = 30
            config['sgd_minibatch_size'] = 64
            config["clip_param"] = 0.2
            config['lr'] = 1e-05
            config['train_batch_size'] = 512

            #deep network
            config['model']['fcnet_hiddens'] = [64, 64]

            #Exploration
            config['exploration_config']["type"] = "GaussianNoise"
            config['exploration_config']["initial_scale"] = 1.0
            config['exploration_config']["final_scale"] = 0.05
            config['exploration_config']["scale_timesteps"] = 1000000
            config['exploration_config']["random_timesteps"] = 1000
            config['exploration_config']["stddev"] = 0.1

        elif flags.exp_config == 'singleagent_merge':
            config["gamma"] = 0.99  # discount rate
            config["use_gae"] = True  # truncated
            config["lambda"] = 0.97  # truncated value 0.97
            config["kl_target"] = 0.02  # d_target 0.02
            config["num_sgd_iter"] = 30
            config["sgd_minibatch_size"] = 64
            config['lr'] = 1e-7
            config["clip_param"] = 0.2
            config["train_batch_size"] = 256

            # deep network
            config['model']['fcnet_hiddens'] = [64, 64]

            # Exploration
            config['exploration_config']["type"] = "GaussianNoise"
            config['exploration_config']["initial_scale"] = 1.0
            config['exploration_config']["final_scale"] = 0.05
            config['exploration_config']["scale_timesteps"] = 3000000
            config['exploration_config']["random_timesteps"] = 1000
            config['exploration_config']["stddev"] = 0.1

    elif flags.algorithm.lower() == "ddpg":
        from ray.rllib.agents.ddpg.ddpg import DEFAULT_CONFIG
        alg_run = "DDPG"
        agent_cls = get_agent_class(alg_run)
        config = deepcopy(agent_cls._default_config)
        config["num_workers"] = n_cpus
        #config["train_batch_size"] = horizon * n_rollouts
        # model
        if flags.exp_config == 'singleagent_ring':
            config['n_step'] = 1
            config['actor_hiddens'] = [64, 64]
            config['actor_lr'] = 0.0001  # in article 'ddpg'
            config['critic_lr'] = 0.0001
            config['critic_hiddens'] = [64, 64]
            config['gamma'] = 0.99
            del config['model']['fcnet_hiddens']
            del config['model']['fcnet_activation']
            config['lr'] = 0.0001
            # exploration
            config['exploration_config']['final_scale'] = 0.05
            config['exploration_config']['scale_timesteps'] = 900000
            config['exploration_config']['ou_base_scale'] = 0.1
            config['exploration_config']['ou_theta'] = 0.15
            config['exploration_config']['ou_sigma'] = 0.2
            # optimization
            config['tau'] = 0.001
            config['l2_reg'] = 1e-6
            config['train_batch_size'] = 64
            config['learning_starts'] = 3000
            # evaluation
            # config['evaluation_interval'] = 5
            config['buffer_size'] = 300000  # 3e5
            config['timesteps_per_iteration'] = 3000
            config['prioritized_replay'] = False
            #config["prioritized_replay_beta_annealing_timesteps"]=2200000
            #config['final_prioritized_replay_beta']=0.01

        elif flags.exp_config == 'singleagent_figure_eight':
            config['n_step'] = 1
            config['actor_hiddens'] = [64, 64]
            config['actor_lr'] = 0.00001  # in article 'ddpg'
            config['critic_lr'] = 0.0001
            config['critic_hiddens'] = [64, 64]
            config['gamma'] = 0.99
            # config['model']['fcnet_hiddens'] = [256, 256]
            config['lr'] = 1e-5
            #exploration
            config['exploration_config']['final_scale'] = 0.02
            config['exploration_config']['scale_timesteps'] = 1500000
            config['exploration_config']["initial_scale"] = 1.0
            config['exploration_config']["random_timesteps"] = 1000
            config['exploration_config']["stddev"] = 0.1
            del config['exploration_config']['ou_base_scale']
            del config['exploration_config']['ou_theta']
            del config['exploration_config']['ou_sigma']
            config['exploration_config']['type'] = 'GaussianNoise'
            # optimization
            config['tau'] = 0.001
            config['l2_reg'] = 1e-6
            config['train_batch_size'] = 256
            config['learning_starts'] = 3000
            config['target_network_update_freq'] = 100000
            # evaluation
            config['timesteps_per_iteration'] = 3000
            #config['evaluation_interval'] = 5
            config['buffer_size'] = 300000
            config["prioritized_replay_beta_annealing_timesteps"] = 100000
            config['prioritized_replay'] = True
        else:  # merge
            config['n_step'] = 1
            config['actor_hiddens'] = [32, 32]
            config['actor_lr'] = 0.00001  # in article 'ddpg'
            config['critic_lr'] = 0.0001
            config['critic_hiddens'] = [32, 32]
            config['gamma'] = 0.99
            config['lr'] = 1e-5
            # exploration
            config['exploration_config']['final_scale'] = 0.02
            config['exploration_config']['scale_timesteps'] = 2100000
            config['exploration_config']['ou_base_scale'] = 0.1
            config['exploration_config']['ou_theta'] = 0.15
            config['exploration_config']['ou_sigma'] = 0.2
            # optimization
            config['tau'] = 0.001
            config['l2_reg'] = 1e-6
            config['train_batch_size'] = 128
            config['learning_starts'] = 3000
            config['target_network_update_freq'] = 3000
            # evaluation
            #config['evaluation_interval'] = 5
            config['buffer_size'] = 300000 #3e5
            config['timesteps_per_iteration'] = 3000
            config['prioritized_replay'] = False

    elif flags.algorithm.lower() == "td3":
        from ray.rllib.agents.ddpg.td3 import TD3Trainer
        alg_run = "TD3"
        agent_cls = get_agent_class(alg_run)
        config = deepcopy(agent_cls._default_config)
        config["num_workers"] = n_cpus
        #config["train_batch_size"] = horizon * n_rollouts
        
        # model
        if flags.exp_config == 'singleagent_ring':

            config['n_step'] = 1
            config['actor_hiddens'] = [64, 64]
            config['actor_lr'] = 0.00001
            config['critic_lr'] = 0.0001
            config['critic_hiddens'] = [64, 64]
            config['gamma'] = 0.99
            config['lr'] = 0.00001
            # TD3
            config['twin_q'] = True
            config['policy_delay'] = 2
            config['smooth_target_policy'] = True
            config['target_noise'] = 0.1  # default 0.2
            config['target_noise_clip'] = 0.5
            # Policy Optimizer
            # config['optimizer'] = 'Adam'
            # exploration
            config['exploration_config']['final_scale'] = 0.05  # default 1
            config['exploration_config']['scale_timesteps'] = 1500000  # 900000 # default 1
            config['exploration_config']["initial_scale"] = 1.0
            config['exploration_config']["random_timesteps"] = 1000  # default 10000
            config['exploration_config']["stddev"] = 0.1
            config['exploration_config']['type'] = 'GaussianNoise'
            # optimization
            config['tau'] = 0.001  # best; fix
            config['l2_reg'] = 0
            config['train_batch_size'] = 128  # default 100; best 128
            config['learning_starts'] = 10000
            config['use_huber'] = False
            # evaluation
            # config['evaluation_interval'] = 5
            config['buffer_size'] = 300000  # default 1000000
            config['timesteps_per_iteration'] = 3000
            config['prioritized_replay'] = False
            config['worker_side_prioritization'] = False
            config['use_state_preprocessor'] = False

        elif flags.exp_config == 'singleagent_figure_eight':
            # TD3
            config['twin_q'] = True
            config['policy_delay'] = 2
            config['smooth_target_policy'] = True
            config['target_noise'] = 0.2  # default 0.2
            config['target_noise_clip'] = 0.5
            # model
            config['n_step'] = 1
            config['actor_hiddens'] = [256, 256]
            config['actor_lr'] = 0.000001
            config['critic_lr'] = 0.00001
            config['critic_hiddens'] = [256, 256]
            config['gamma'] = 0.99
            config['lr'] = 0.000001
            # config['model']['fcnet_hiddens'] = [64, 64]
            # exploration
            config['exploration_config']['type'] = 'GaussianNoise'
            config['exploration_config']['final_scale'] = 0.05  # default 1
            config['exploration_config']['scale_timesteps'] = 9000000  # 900000 # default 1
            config['exploration_config']["initial_scale"] = 1
            config['exploration_config']["random_timesteps"] = 1000  # default 10000
            config['exploration_config']["stddev"] = 0.1
            # optimization
            config['tau'] = 0.001  # best; fix
            config['l2_reg'] = 1e-6
            config['train_batch_size'] = 128  # default 100; best 128
            config['learning_starts'] = 10000
            config['use_huber'] = False
            config['target_network_update_freq'] = 50000
            # evaluation
            # config['evaluation_interval'] = 5
            config['buffer_size'] = 300000  # default 1000000
            config['timesteps_per_iteration'] = 3000
            config['prioritized_replay'] = False
            config['worker_side_prioritization'] = False
            config['use_state_preprocessor'] = False
        elif flags.exp_config == 'singleagent_merge':
            # TD3
            config['twin_q'] = True
            config['policy_delay'] = 2
            config['smooth_target_policy'] = True
            config['target_noise'] = 0.2  # default 0.2
            config['target_noise_clip'] = 0.5
            # model
            config['n_step'] = 1
            config['actor_hiddens'] = [64, 64]
            config['actor_lr'] = 0.000001
            config['critic_lr'] = 0.00001
            config['critic_hiddens'] = [64, 64]
            config['gamma'] = 0.99
            config['lr'] = 0.000001
            # config['model']['fcnet_hiddens'] = [64, 64]
            # exploration
            config['exploration_config']['type'] = 'GaussianNoise'
            config['exploration_config']['final_scale'] = 0.05  # default 1
            config['exploration_config']['scale_timesteps'] = 10000000  # 900000 # default 1
            config['exploration_config']["initial_scale"] = 1
            config['exploration_config']["random_timesteps"] = 1000  # default 10000
            config['exploration_config']["stddev"] = 0.1
            # optimization
            config['tau'] = 0.001  # best; fix
            config['l2_reg'] = 1e-6
            config['train_batch_size'] = 128  # default 100; best 128
            config['learning_starts'] = 10000
            config['use_huber'] = False
            config['target_network_update_freq'] = 1
            # evaluation
            # config['evaluation_interval'] = 5
            config['buffer_size'] = 300000  # default 1000000
            config['timesteps_per_iteration'] = 5000
            config['prioritized_replay'] = False
            config['worker_side_prioritization'] = False
            config['use_state_preprocessor'] = False



    #common config
    config['framework'] = 'torch'
    config['callbacks'] = {
        "on_episode_end": None,
        "on_episode_start": None,
        "on_episode_step": None,
        "on_postprocess_traj": None,
        "on_sample_end": None,
        "on_train_result": None
    }  
    # config["opt_type"]= "adam" for impala and APPO, default is SGD
    # TrainOneStep class call SGD -->execution_plan function can have policy update function
    print("cuda is available: ", torch.cuda.is_available())
    print('Beginning training.')
    print("==========================================")
    print("running algorithm: ", alg_run)  # "Framework: ", "torch"

    # save the flow params for replay
    flow_json = json.dumps(
        flow_params, cls=FlowParamsEncoder, sort_keys=True, indent=4)
    config['env_config']['flow_params'] = flow_json
    config['env_config']['run'] = alg_run

    # multiagent configuration
    if policy_graphs is not None:
        print("policy_graphs", policy_graphs)
        config['multiagent'].update({'policies': policy_graphs})
    if policy_mapping_fn is not None:
        config['multiagent'].update(
            {'policy_mapping_fn': tune.function(policy_mapping_fn)})
    if policies_to_train is not None:
        config['multiagent'].update({'policies_to_train': policies_to_train})

    create_env, gym_name = make_create_env(params=flow_params)

    # Register as rllib env
    register_env(gym_name, create_env)
    return alg_run, gym_name, config
Example #19
def setup_exps(flow_params):
    """Create the relevant components of a multiagent RLlib experiment.

    Parameters
    ----------
    flow_params : dict
        input flow-parameters

    Returns
    -------
    str
        name of the training algorithm
    str
        name of the gym environment to be trained
    dict
        training configuration parameters
    """
    alg_run = 'PPO'
    agent_cls = get_agent_class(alg_run)
    config = agent_cls._default_config.copy()
    config['num_workers'] = N_CPUS
    config['train_batch_size'] = HORIZON * N_ROLLOUTS
    config['sgd_minibatch_size'] = 4096
    #config['simple_optimizer'] = True
    config['gamma'] = 0.998  # discount rate
    config['model'].update({'fcnet_hiddens': [100, 50, 25]})
    #config['lr'] = tune.grid_search([5e-4, 1e-4])
    config['lr_schedule'] = [
        [0, 1e-4],
        [2000000, 5e-5],
    ]
    config['horizon'] = HORIZON
    config['clip_actions'] = False
    config['observation_filter'] = 'NoFilter'
    config["use_gae"] = True
    config["lambda"] = 0.95
    config["shuffle_sequences"] = True
    config["vf_clip_param"] = 1e8
    config["num_sgd_iter"] = 10
    #config["kl_target"] = 0.003
    config["kl_coeff"] = 0.01
    config["entropy_coeff"] = 0.001
    config["clip_param"] = 0.2
    config["grad_clip"] = None
    config["use_critic"] = True
    config["vf_share_layers"] = True
    config["vf_loss_coeff"] = 0.5

    # save the flow params for replay
    flow_json = json.dumps(flow_params,
                           cls=FlowParamsEncoder,
                           sort_keys=True,
                           indent=4)
    config['env_config']['flow_params'] = flow_json
    config['env_config']['run'] = alg_run

    create_env, env_name = make_create_env(params=flow_params, version=0)

    # register as rllib env
    register_env(env_name, create_env)

    # multiagent configuration
    temp_env = create_env()
    policy_graphs = {
        'av':
        (PPOTFPolicy, temp_env.observation_space, temp_env.action_space, {})
    }

    def policy_mapping_fn(_):
        return 'av'

    config.update({
        'multiagent': {
            'policies': policy_graphs,
            'policy_mapping_fn': tune.function(policy_mapping_fn),
            'policies_to_train': ['av']
        }
    })

    return alg_run, env_name, config
Example #20
def setup(algorithm,
          train_batch_size,
          num_cpus,
          num_gpus,
          num_agents,
          use_gpus_for_workers=False,
          use_gpu_for_driver=False,
          num_workers_per_device=1):
    def env_creator(_):
        return MatrixEnv(matrix_game)

    single_env = MatrixEnv(matrix_game)

    env_name = "{}_adv".format(matrix_game)
    register_env(env_name, env_creator)

    obs_space = single_env.observation_space
    act_space = single_env.action_space

    # Each policy can have a different configuration (including custom model)
    def gen_policy():
        return (PPOPolicyGraph, obs_space, act_space, {})

    # Setup PPO with an ensemble of `num_policies` different policy graphs
    policy_graphs = {}
    for i in range(num_agents):
        policy_graphs['agent-' + str(i)] = gen_policy()

    def policy_mapping_fn(agent_id):
        return agent_id

    # register the custom model
    model_name = "matrix_fc_net"
    ModelCatalog.register_custom_model(model_name, FCNet)

    agent_cls = get_agent_class(algorithm)
    config = agent_cls._default_config.copy()

    # information for replay
    config['env_config']['func_create'] = tune.function(env_creator)
    config['env_config']['env_name'] = env_name
    config['env_config']['run'] = algorithm

    # Calculate device configurations
    gpus_for_driver = int(use_gpu_for_driver)
    cpus_for_driver = 1 - gpus_for_driver
    if use_gpus_for_workers:
        spare_gpus = (num_gpus - gpus_for_driver)
        num_workers = int(spare_gpus * num_workers_per_device)
        num_gpus_per_worker = spare_gpus / num_workers
        num_cpus_per_worker = 0
    else:
        spare_cpus = (num_cpus - cpus_for_driver)
        num_workers = int(spare_cpus * num_workers_per_device)
        num_gpus_per_worker = 0
        num_cpus_per_worker = spare_cpus / num_workers

    # hyperparams
    config.update({
        "train_batch_size": 30000,
        "horizon": 100,
        "lr": 0.001,
        "num_workers": num_workers,
        "num_gpus": gpus_for_driver,  # The number of GPUs for the driver
        "num_cpus_for_driver": cpus_for_driver,
        "num_gpus_per_worker": num_gpus_per_worker,  # Can be a fraction
        "num_cpus_per_worker": num_cpus_per_worker,  # Can be a fraction
        #"entropy_coeff": hparams['entropy_coeff'],
        "multiagent": {
            "policy_graphs": policy_graphs,
            "policy_mapping_fn": tune.function(policy_mapping_fn),
        },
        "model": {
            "custom_model": "matrix_fc_net",
            "use_lstm": True,
            "lstm_cell_size": 128
        },
        "callbacks": {
            "on_episode_start": tune.function(on_episode_start),
            "on_episode_step": tune.function(on_episode_step),
            "on_episode_end": tune.function(on_episode_end)
        }
    })
    return algorithm, env_name, config
Example #21
def setup_exps_rllib(flow_params,
                     n_cpus,
                     n_rollouts,
                     policy_graphs=None,
                     policy_mapping_fn=None,
                     policies_to_train=None):
    """Return the relevant components of an RLlib experiment.

    Parameters
    ----------
    flow_params : dict
        flow-specific parameters (see flow/utils/registry.py)
    n_cpus : int
        number of CPUs to run the experiment over
    n_rollouts : int
        number of rollouts per training iteration
    policy_graphs : dict, optional
        TODO
    policy_mapping_fn : function, optional
        TODO
    policies_to_train : list of str, optional
        TODO

    Returns
    -------
    str
        name of the training algorithm
    str
        name of the gym environment to be trained
    dict
        training configuration parameters
    """
    from ray import tune
    from ray.tune.registry import register_env
    try:
        from ray.rllib.agents.agent import get_agent_class
    except ImportError:
        from ray.rllib.agents.registry import get_agent_class

    horizon = flow_params['env'].horizon

    alg_run = "PPO"

    agent_cls = get_agent_class(alg_run)
    config = deepcopy(agent_cls._default_config)

    config["num_workers"] = n_cpus
    config["train_batch_size"] = horizon * n_rollouts
    config["gamma"] = 0.999  # discount rate
    config["model"].update({"fcnet_hiddens": [32, 32, 32]})
    config["use_gae"] = True
    config["lambda"] = 0.97
    config["kl_target"] = 0.02
    config["num_sgd_iter"] = 10
    config["horizon"] = horizon

    # save the flow params for replay
    flow_json = json.dumps(flow_params,
                           cls=FlowParamsEncoder,
                           sort_keys=True,
                           indent=4)
    config['env_config']['flow_params'] = flow_json
    config['env_config']['run'] = alg_run

    # multiagent configuration
    if policy_graphs is not None:
        print("policy_graphs", policy_graphs)
        config['multiagent'].update({'policies': policy_graphs})
    if policy_mapping_fn is not None:
        config['multiagent'].update(
            {'policy_mapping_fn': tune.function(policy_mapping_fn)})
    if policies_to_train is not None:
        config['multiagent'].update({'policies_to_train': policies_to_train})

    create_env, gym_name = make_create_env(params=flow_params)

    # Register as rllib env
    register_env(gym_name, create_env)
    return alg_run, gym_name, config
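
A minimal sketch of how the returned values are typically consumed; the experiment name, the stopping criterion, and the flow_params/n_cpus/n_rollouts arguments are assumptions, not part of the snippet above.

import ray
from ray import tune

alg_run, gym_name, config = setup_exps_rllib(flow_params, n_cpus=2, n_rollouts=20)
ray.init(num_cpus=3)
tune.run(
    alg_run,  # "PPO"
    name="flow_ppo_example",  # hypothetical experiment name
    stop={"training_iteration": 200},  # hypothetical stopping criterion
    config=config,
    checkpoint_freq=20,
)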
Example #22
        "lr": 0.0005,
        "adam_epsilon": 0.0015,
        "schedule_max_timesteps": 10**7,
        "exploration_final_eps": 0.02,
        "exploration_fraction": 0.1,
        "buffer_size": 10**5,
        "target_network_update_freq": 50000,
        "sample_batch_size": 16,
        "train_batch_size": 64,
        "observation_filter": "MeanStdFilter",
        "num_workers": 2,
        "num_envs_per_worker": 16,
        "num_cpus_per_worker": 2,
        "num_cpus_for_driver": 1,
        "num_gpus": 0,
        "multiagent": {
            "policy_graphs": policy_graphs,
            "policy_mapping_fn": tune.function(policy_mapping_fn)
        },
        "model": {
            "fcnet_activation": "tanh",
            "fcnet_hiddens": [32, 32],
        },
        #"callbacks": {
        #"on_episode_start": tune.function(on_episode_start),
        #"on_episode_step": tune.function(on_episode_step),
        #"on_episode_end": tune.function(on_episode_end),
        #},
    },
)
Example #23
 "asynchyperband_EC_dqn_v2_1": {
     "run": 'DQN',
     "env": 'ECglass-v2',
     "stop": {
         "timesteps_total": 876000,
         # "training_iteration": 1 if args.smoke_test else 99999
     },
     "num_samples": 5,
     "resources_per_trial": {
         "cpu": 0.8,
         "gpu": 0.2
     },
     "config": {
         "hiddens": tune.grid_search([[1024, 512], [2048, 1024]]),
         "callbacks": {
             "on_episode_start": tune.function(on_episode_start),
             "on_episode_step": tune.function(on_episode_step),
             "on_episode_end": tune.function(on_episode_end),
         },
         "learning_starts": 64,
         "buffer_size": 1000000,
         "exploration_fraction": 1,
         "train_batch_size": tune.grid_search([250, 200]),
         "gamma": 0,
         "exploration_final_eps": tune.grid_search([0.03]),
         #"exploration_final_eps": 0.05,
         #"num_workers": 2,
         "lr": tune.grid_search([0.000001, 0.00005]),
         "target_network_update_freq": tune.grid_search(
             [16000, 18000]),
         "timesteps_per_iteration": 8760,
Example #24
def on_episode_end(info):
    # print(info)
    episode = info['episode']
    # print(info)
    # trainer = info['trainer']
    base_env = info['env']
    episode.custom_metrics['ego_starting_distance'] = \
        base_env.get_unwrapped()[0].process.ego_starting_distance


num_worker_cpus = 11
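
# Note (assumption): for the on_episode_end callback above to fire, it would
# need to be registered under config["callbacks"] (wrapped in tune.function on
# this Ray version); the snippet below is truncated before any such entry.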
tune.run(
    train,
    name='curriculum_test_1',
    trial_name_creator=tune.function(
        lambda trial: 'adaptive_20m_0-1delta_75target'),
    config={
        'num_gpus': 1,
        'num_workers': num_worker_cpus,
        'num_cpus_per_worker': 1,
        'num_gpus_per_worker': 1.0 / num_worker_cpus,
        'sample_batch_size': 200,
        'train_batch_size': int(2 * 60.0 / .05),
        'batch_mode': 'truncate_episodes',  # or 'complete_episodes'
        'timesteps_per_iteration': int(2 * 60 / .05),
        'sgd_minibatch_size': 128,
        # 'shuffle_sequences':       True,
        'num_sgd_iter': 30,
        'gamma': 0.99999,
        'lr': 0.0001,
Example #25
    single_env = SimpleMultiAgentEnv(env_config={"scenario_name": args.scenario_name})

    # Policy Mapping
    policies = {
        agent: (None, single_env.observation_space[agent], single_env.action_space[agent], {
            "observation_spaces": single_env.observation_space,
            "action_spaces": single_env.action_space,
            "agent_id": agent
        }) for agent in single_env.agent_ids
    }

    # Start training
    ray.init()
    tune.run(
        MADDPGTrainer,
        stop={
            "timesteps_total": 1000000,
        },
        config={
            "env": "simple_multiagent",
            "env_config": {
                "scenario_name": args.scenario_name,
                "time_limit": 100
            },
            "multiagent": {
                "policies": policies,
                "policy_mapping_fn": tune.function(lambda agent_id: agent_id),
            },
            #  "observation_filter": "NoFilter",
        })
Example #26
def setup_exps(flow_params, evaluate=False):
    """Create the relevant components of a multiagent RLlib experiment.

    Parameters
    ----------
    flow_params : dict
        input flow-parameters

    Returns
    -------
    str
        name of the training algorithm
    str
        name of the gym environment to be trained
    dict
        training configuration parameters
    """
    alg_run = 'PPO'
    agent_cls = get_agent_class(alg_run)
    config = agent_cls._default_config.copy()
    config['num_workers'] = N_CPUS
    config['train_batch_size'] = HORIZON * N_ROLLOUTS
    config['gamma'] = 0.999  # discount rate
    config['model'].update({'fcnet_hiddens': [256, 256]})
    config['lr'] = 2e-5
    config['clip_actions'] = False
    config['observation_filter'] = 'NoFilter'
    config['simple_optimizer'] = True
    config['horizon'] = HORIZON

    # save the flow params for replay
    flow_json = json.dumps(flow_params,
                           cls=FlowParamsEncoder,
                           sort_keys=True,
                           indent=4)
    config['env_config']['flow_params'] = flow_json
    config['env_config']['run'] = alg_run

    flow_params['env'].evaluate = evaluate
    create_env, env_name = make_create_env(params=flow_params, version=0)

    # register as rllib env
    register_env(env_name, create_env)

    # multiagent configuration
    temp_env = create_env()
    policy_graphs = {
        'av':
        (PPOPolicyGraph, temp_env.observation_space, temp_env.action_space, {})
    }

    def policy_mapping_fn(_):
        return 'av'

    config.update({
        'multiagent': {
            'policy_graphs': policy_graphs,
            'policy_mapping_fn': tune.function(policy_mapping_fn),
            'policies_to_train': ['av']
        }
    })

    return alg_run, env_name, config
                                            "model": {
                                                "fcnet_hiddens": [256, 256, 256, 256],
                                                "custom_preprocessor": "ohe",
                                                "custom_options": {},  # extra options to pass to your preprocessor
                                                "fcnet_activation": 'tanh',
                                                "use_lstm": True,
                                                "max_seq_len": delay + sequence_length,
                                                "lstm_cell_size": lstm_cell_size,
                                                "lstm_use_prev_action_reward": lstm_use_prev_action_reward,
                                                },
                                                      "callbacks": {
                                        #                 "on_episode_start": tune.function(on_episode_start),
                                        #                 "on_episode_step": tune.function(on_episode_step),
                                        #                 "on_episode_end": tune.function(on_episode_end),
                                        #                 "on_sample_end": tune.function(on_sample_end),
                                                        "on_train_result": tune.function(on_train_result),
                                        #                 "on_postprocess_traj": tune.function(on_postprocess_traj),
                                                    },
                                            #                                    "evaluation_config": {
                                            #'seed': 0, #seed
                                            #                                    "exploration_fraction": 0,
                                            #                                    "exploration_final_eps": 0
                                            #                                    },
                                            # "output": return_hack_writer,
                                            # "output_compress_columns": [],
                                            },
                                         #return_trials=True # add trials = tune.run( above
                                         )
                                        # ag.train()

end = time.time()
Example #28
    def _init(self, config, env_creator):
        self._validate_config()

        # Update effective batch size to include n-step
        adjusted_batch_size = max(config["sample_batch_size"],
                                  config.get("n_step", 1))
        config["sample_batch_size"] = adjusted_batch_size

        self.exploration0 = self._make_exploration_schedule(-1)
        self.explorations = [
            self._make_exploration_schedule(i)
            for i in range(config["num_workers"])
        ]

        for k in self._optimizer_shared_configs:
            if self._name != "DQN" and k in [
                    "schedule_max_timesteps", "beta_annealing_fraction",
                    "final_prioritized_replay_beta"
            ]:
                # only Rainbow needs to anneal prioritized_replay_beta
                continue
            if k not in config["optimizer"]:
                config["optimizer"][k] = config[k]

        if config.get("parameter_noise", False):
            if config["callbacks"]["on_episode_start"]:
                start_callback = config["callbacks"]["on_episode_start"]
            else:
                start_callback = None

            def on_episode_start(info):
                # callback that samples and applies parameter-space noise
                # to the network parameters
                policies = info["policy"]
                for pi in policies.values():
                    pi.add_parameter_noise()
                if start_callback:
                    start_callback(info)

            config["callbacks"]["on_episode_start"] = tune.function(
                on_episode_start)
            if config["callbacks"]["on_episode_end"]:
                end_callback = config["callbacks"]["on_episode_end"]
            else:
                end_callback = None

            def on_episode_end(info):
                # callback that records the distance between the noisy
                # policy and the original policy
                policies = info["policy"]
                episode = info["episode"]
                episode.custom_metrics["policy_distance"] = policies[
                    DEFAULT_POLICY_ID].pi_distance
                if end_callback:
                    end_callback(info)

            config["callbacks"]["on_episode_end"] = tune.function(
                on_episode_end)

        self.local_evaluator = self.make_local_evaluator(
            env_creator, self._policy_graph)

        if config["evaluation_interval"]:
            self.evaluation_ev = self.make_local_evaluator(
                env_creator,
                self._policy_graph,
                extra_config={
                    "batch_mode": "complete_episodes",
                    "batch_steps": 1,
                })
            self.evaluation_metrics = self._evaluate()

        def create_remote_evaluators():
            return self.make_remote_evaluators(env_creator, self._policy_graph,
                                               config["num_workers"])

        if config["optimizer_class"] != "AsyncReplayOptimizer":
            self.remote_evaluators = create_remote_evaluators()
        else:
            # Hack to work around https://github.com/ray-project/ray/issues/2541
            self.remote_evaluators = None

        self.optimizer = getattr(optimizers, config["optimizer_class"])(
            self.local_evaluator, self.remote_evaluators,
            **config["optimizer"])
        # Create the remote evaluators *after* the replay actors
        if self.remote_evaluators is None:
            self.remote_evaluators = create_remote_evaluators()
            self.optimizer._set_evaluators(self.remote_evaluators)

        self.last_target_update_ts = 0
        self.num_target_updates = 0
Example #29
    def _init(self):
        self._validate_config()

        # Update effective batch size to include n-step
        adjusted_batch_size = max(self.config["sample_batch_size"],
                                  self.config.get("n_step", 1))
        self.config["sample_batch_size"] = adjusted_batch_size

        self.exploration0 = self._make_exploration_schedule(-1)
        self.explorations = [
            self._make_exploration_schedule(i)
            for i in range(self.config["num_workers"])
        ]

        for k in self._optimizer_shared_configs:
            if self._agent_name != "DQN" and k in [
                    "schedule_max_timesteps", "beta_annealing_fraction",
                    "final_prioritized_replay_beta"
            ]:
                # only Rainbow needs to anneal prioritized_replay_beta
                continue
            if k not in self.config["optimizer"]:
                self.config["optimizer"][k] = self.config[k]

        if self.config.get("parameter_noise", False):
            if self.config["callbacks"]["on_episode_start"]:
                start_callback = self.config["callbacks"]["on_episode_start"]
            else:
                start_callback = None

            def on_episode_start(info):
                # callback that samples and applies parameter-space noise
                # to the network parameters
                policies = info["policy"]
                for pi in policies.values():
                    pi.add_parameter_noise()
                if start_callback:
                    start_callback(info)

            self.config["callbacks"]["on_episode_start"] = tune.function(
                on_episode_start)
            if self.config["callbacks"]["on_episode_end"]:
                end_callback = self.config["callbacks"]["on_episode_end"]
            else:
                end_callback = None

            def on_episode_end(info):
                # callback that records the distance between the noisy
                # policy and the original policy
                policies = info["policy"]
                episode = info["episode"]
                episode.custom_metrics["policy_distance"] = policies[
                    "default"].pi_distance
                if end_callback:
                    end_callback(info)

            self.config["callbacks"]["on_episode_end"] = tune.function(
                on_episode_end)

        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, self._policy_graph)

        if self.config["evaluation_interval"]:
            self.evaluation_ev = self.make_local_evaluator(
                self.env_creator,
                self._policy_graph,
                extra_config={
                    "batch_mode": "complete_episodes",
                    "batch_steps": 1,
                })
            self.evaluation_metrics = self._evaluate()

        def create_remote_evaluators():
            return self.make_remote_evaluators(self.env_creator,
                                               self._policy_graph,
                                               self.config["num_workers"])

        if self.config["optimizer_class"] != "AsyncReplayOptimizer":
            self.remote_evaluators = create_remote_evaluators()
        else:
            # Hack to work around https://github.com/ray-project/ray/issues/2541
            self.remote_evaluators = None

        self.optimizer = getattr(optimizers, self.config["optimizer_class"])(
            self.local_evaluator, self.remote_evaluators,
            self.config["optimizer"])
        # Create the remote evaluators *after* the replay actors
        if self.remote_evaluators is None:
            self.remote_evaluators = create_remote_evaluators()
            self.optimizer._set_evaluators(self.remote_evaluators)

        self.last_target_update_ts = 0
        self.num_target_updates = 0
Example #30
model_config = {}

eval_config = {
    "evaluation_interval": 1,  # I think this means every x training_iterations
    "evaluation_config": {
        "explore": False,
        "evaluation_num_episodes": 10,
        "horizon": 100,
        "env_config": {
            "dummy_eval":
            True,  # hack Used to check if we are in evaluation mode or training mode inside Ray callback on_episode_end() to be able to write eval stats
            "transition_noise":
            0 if "state_space_type" in env_config["env_config"]
            and env_config["env_config"]["state_space_type"] == "discrete" else
            tune.function(lambda a: a.normal(0, 0)),
            "reward_noise":
            tune.function(lambda a: a.normal(0, 0)),
            "action_loss_weight":
            0.0,
        },
    },
}
value_tuples = []
for config_type, config_dict in var_configs.items():
    for key in config_dict:
        assert isinstance(var_configs[config_type][key], list), (
            "var_configs should be a dict of dicts whose leaf values are "
            "lists, so that each configuration option can take multiple values")
        value_tuples.append(var_configs[config_type][key])
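
One plausible next step (an assumption, not shown in this snippet) is to expand the collected per-option value lists into concrete configurations via a Cartesian product:

import itertools

# every combination of the collected hyperparameter values, one candidate config per tuple
cartesian_product_configs = list(itertools.product(*value_tuples))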
def setup_exps_PPO(flow_params):
    """
    Experiment setup with PPO using RLlib.

    Parameters
    ----------
    flow_params : dict
        flow-specific parameters

    Returns
    -------
    str
        name of the training algorithm
    str
        name of the gym environment to be trained
    dict
        training configuration parameters
    """
    alg_run = 'PPO'
    agent_cls = get_agent_class(alg_run)
    config = agent_cls._default_config.copy()
    config["num_workers"] = min(N_CPUS, N_ROLLOUTS)
    config['train_batch_size'] = HORIZON * N_ROLLOUTS
    config['simple_optimizer'] = True
    config['gamma'] = 0.999  # discount rate
    config['model'].update({'fcnet_hiddens': [32, 32]})
    config['lr'] = tune.grid_search([1e-5, 1e-4, 1e-3])
    config['horizon'] = HORIZON
    config['clip_actions'] = False  # FIXME(ev) temporary ray bug
    config['observation_filter'] = 'NoFilter'

    # save the flow params for replay
    flow_json = json.dumps(
        flow_params, cls=FlowParamsEncoder, sort_keys=True, indent=4)
    config['env_config']['flow_params'] = flow_json
    config['env_config']['run'] = alg_run

    create_env, env_name = make_create_env(params=flow_params, version=0)

    # Register as rllib env
    register_env(env_name, create_env)

    test_env = create_env()
    obs_space = test_env.observation_space
    act_space = test_env.action_space

    def gen_policy():
        return (PPOPolicyGraph, obs_space, act_space, {})

    # Set up PPO with a single policy graph shared by all agents
    policy_graphs = {'av': gen_policy()}

    def policy_mapping_fn(_):
        return 'av'

    config.update({
        'multiagent': {
            'policy_graphs': policy_graphs,
            'policy_mapping_fn': tune.function(policy_mapping_fn),
            'policies_to_train': ['av']
        }
    })

    return alg_run, env_name, config
Example #32
            },
            "gamma": random.choice([0.95, 0.99]),
        }
        return (PPOPolicyGraph, obs_space, act_space, config)

    # Setup PPO with an ensemble of `num_policies` different policy graphs
    policy_graphs = {
        "policy_{}".format(i): gen_policy(i)
        for i in range(args.num_policies)
    }
    policy_ids = list(policy_graphs.keys())

    run_experiments({
        "test": {
            "run": "PPO",
            "env": "multi_cartpole",
            "stop": {
                "training_iteration": args.num_iters
            },
            "config": {
                "log_level": "DEBUG",
                "num_sgd_iter": 10,
                "multiagent": {
                    "policy_graphs": policy_graphs,
                    "policy_mapping_fn": tune.function(
                        lambda agent_id: random.choice(policy_ids)),
                },
            },
        }
    })
Example #33
    def __init__(self, result_dir, checkpoint_num=150, algo='PPO'):

        checkpoint_num = str(checkpoint_num)
        # # config = get_rllib_config(result_dir)
        # # pkl = get_rllib_pkl(result_dir)

        #create_env, env_name = make_create_env(params=flow_params, version=0)

        # Register as rllib env
        register_env('test', create_env)

        obs_space = Box(low=0., high=1, shape=(94, ), dtype=np.float32)

        act_space = Box(low=-1, high=1, shape=(2, ), dtype=np.float32)

        adv_action_space = Box(low=0., high=1, shape=(22, ), dtype=np.float32)

        def gen_policy_agent():
            return (None, obs_space, act_space, {})

        def gen_policy_adversary():
            return (None, obs_space, adv_action_space, {})

        # <-- old
        # Setup PG with an ensemble of `num_policies` different policy graphs
        policy_graphs = {
            'av': gen_policy_agent(),
            'action_adversary': gen_policy_adversary()
        }

        def policy_mapping_fn(agent_id):
            return agent_id

        policy_ids = list(policy_graphs.keys())

        config = ppo.DEFAULT_CONFIG.copy()
        config['model'].update({'fcnet_hiddens': [100, 50, 25]})
        config["observation_filter"] = "NoFilter"
        config['simple_optimizer'] = True

        config.update({
            'multiagent': {
                'policy_graphs': policy_graphs,
                'policy_mapping_fn': tune.function(policy_mapping_fn)
            }
        })

        # check if we have a multiagent scenario but in a
        # backwards compatible way
        # if config.get('multiagent', {}).get('policy_graphs', {}):
        #     multiagent = True
        #     config['multiagent'] = pkl['multiagent']
        # else:
        #     multiagent = False

        # Run on only one cpu for rendering purposes
        config['num_workers'] = 0

        # flow_params = get_flow_params(config)

        # # Create and register a gym+rllib env
        # create_env, env_name = make_create_env(
        #     params=flow_params, version=0, render=False)
        # register_env(env_name, create_env)

        # Determine agent and checkpoint
        agent_cls = get_agent_class(algo)

        # create the agent that will be used to compute the actions
        self.agent = agent_cls(env='test', config=config)
        # agent = agent_cls(config=config)
        checkpoint = result_dir + '/checkpoint_' + checkpoint_num
        checkpoint = checkpoint + '/checkpoint-' + checkpoint_num
        self.agent.restore(checkpoint)

        multiagent = True
        if multiagent:
            rets = {}
            # map the agent id to its policy
            self.policy_map_fn = config['multiagent']['policy_mapping_fn']
            for key in config['multiagent']['policy_graphs'].keys():
                rets[key] = []
        else:
            rets = []
def setup_exps_rllib(flow_params,
                     n_cpus,
                     n_rollouts,
                     policy_graphs=None,
                     policy_mapping_fn=None,
                     policies_to_train=None,
                     flags=None):
    from ray import tune
    from ray.tune.registry import register_env
    try:
        from ray.rllib.agents.agent import get_agent_class
    except ImportError:
        from ray.rllib.agents.registry import get_agent_class
    import torch
    horizon = flow_params['env'].horizon

    from ray.rllib.agents.ddpg.ddpg import DEFAULT_CONFIG
    alg_run = "DDPG"
    agent_cls = get_agent_class(alg_run)
    config = deepcopy(agent_cls._default_config)
    config["num_workers"] = 1
    # model
    config['n_step'] = 1
    config['actor_hiddens'] = [64, 64]
    config['actor_lr'] = 0.0001  # in article 'ddpg'
    config['critic_lr'] = 0.0001
    config['critic_hiddens'] = [64, 64]
    config['gamma'] = 0.99
    config['model']['fcnet_hiddens'] = [64, 64]
    config['lr'] = 1e-5
    # exploration
    config['exploration_config']['final_scale'] = 0.05
    config['exploration_config']['scale_timesteps'] = 1500000
    config['exploration_config']['ou_base_scale'] = 0.1
    config['exploration_config']['ou_theta'] = 0.15
    config['exploration_config']['ou_sigma'] = 0.2
    # optimization
    config['tau'] = 0.001
    config['l2_reg'] = 1e-6
    config['train_batch_size'] = 64
    config['learning_starts'] = 3000
    # evaluation
    #config['evaluation_interval'] = 5
    config['buffer_size'] = 300000  #3e5
    config['timesteps_per_iteration'] = 3000
    config['prioritized_replay'] = False

    #common config
    config['framework'] = 'torch'
    config['callbacks'] = {
        "on_episode_end": None,
        "on_episode_start": None,
        "on_episode_step": None,
        "on_postprocess_traj": None,
        "on_sample_end": None,
        "on_train_result": None
    }
    # config["opt_type"]= "adam" for impala and APPO, default is SGD
    # TrainOneStep class call SGD -->execution_plan function can have policy update function
    print("cuda is available: ", torch.cuda.is_available())
    print('Beginning training.')
    print("==========================================")
    print("running algorithm: ", alg_run)  # "Framework: ", "torch"

    # save the flow params for replay
    flow_json = json.dumps(flow_params,
                           cls=FlowParamsEncoder,
                           sort_keys=True,
                           indent=4)
    config['env_config']['flow_params'] = flow_json
    config['env_config']['run'] = alg_run

    # multiagent configuration
    if policy_graphs is not None:
        print("policy_graphs", policy_graphs)
        config['multiagent'].update({'policies': policy_graphs})
    if policy_mapping_fn is not None:
        config['multiagent'].update(
            {'policy_mapping_fn': tune.function(policy_mapping_fn)})
    if policies_to_train is not None:
        config['multiagent'].update({'policies_to_train': policies_to_train})

    create_env, gym_name = make_create_env(params=flow_params)

    # Register as rllib env
    register_env(gym_name, create_env)
    return alg_run, gym_name, config
Example #35
def setup_exps_rllib(flow_params,
                     n_cpus,
                     n_rollouts,
                     policy_graphs=None,
                     policy_mapping_fn=None,
                     policies_to_train=None,
                     flags=None):
    from ray import tune
    from ray.tune.registry import register_env
    try:
        from ray.rllib.agents.agent import get_agent_class
    except ImportError:
        from ray.rllib.agents.registry import get_agent_class
    import torch

    #bmil edit
    safety = float(flags.safety)
    if safety < 0 or safety > 2:
        raise ValueError('--safety must be between 0 and 2')
    rate = safety - 1
    if safety > 1:
        flow_params['initial'].reward_params['simple_lc_penalty'] *= (1 + rate)
        flow_params['initial'].reward_params['rl_action_penalty'] *= (
            1 + 0.2 * rate)
    elif safety < 1:
        flow_params['initial'].reward_params['rl_mean_speed'] *= (1 -
                                                                  0.05 * rate)
    flow_params['initial'].reward_params['unsafe_penalty'] *= (1 + rate)
    flow_params['initial'].reward_params['dc3_penalty'] *= (1 + rate)

    horizon = flow_params['env'].horizon
    alg_run = "PPO"
    agent_cls = get_agent_class(alg_run)
    config = deepcopy(agent_cls._default_config)
    config["num_workers"] = n_cpus
    config["horizon"] = horizon

    config["num_gpus"] = 1

    config["gamma"] = 0.99  # discount rate
    config["use_gae"] = True  # truncated
    config["lambda"] = 0.99  # truncated value
    config["kl_target"] = 0.02  # d_target
    config["num_sgd_iter"] = 15
    config["sgd_minibatch_size"] = 512
    # config['lr']=5e-7
    config['lr'] = 1e-6
    config["clip_param"] = 0.2

    config['train_batch_size'] = 3000
    config['rollout_fragment_length'] = 3000

    #common config
    config['framework'] = 'torch'
    config['callbacks'] = {
        "on_episode_end": None,
        "on_episode_start": None,
        "on_episode_step": None,
        "on_postprocess_traj": None,
        "on_sample_end": None,
        "on_train_result": None
    }
    # config["opt_type"]= "adam" for impala and APPO, default is SGD
    # TrainOneStep class call SGD -->execution_plan function can have policy update function
    print("cuda is available: ", torch.cuda.is_available())
    print('Beginning training.')
    print("==========================================")
    print("running algorithm: ", alg_run)  # "Framework: ", "torch"

    # save the flow params for replay
    flow_json = json.dumps(flow_params,
                           cls=FlowParamsEncoder,
                           sort_keys=True,
                           indent=4)
    config['env_config']['flow_params'] = flow_json
    config['env_config']['run'] = alg_run

    # multiagent configuration
    if policy_graphs is not None:
        print("policy_graphs", policy_graphs)
        config['multiagent'].update({'policies': policy_graphs})
    if policy_mapping_fn is not None:
        config['multiagent'].update(
            {'policy_mapping_fn': tune.function(policy_mapping_fn)})
    if policies_to_train is not None:
        config['multiagent'].update({'policies_to_train': policies_to_train})

    create_env, gym_name = make_create_env(params=flow_params)

    # Register as rllib env
    register_env(gym_name, create_env)
    return alg_run, gym_name, config
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--num-iters", type=int, default=2000)
    args = parser.parse_args()

    ray.init()
    trials = tune.run_experiments({
        "test": {
            "env": "CartPole-v0",
            "run": "PG",
            "stop": {
                "training_iteration": args.num_iters,
            },
            "config": {
                "callbacks": {
                    "on_episode_start": tune.function(on_episode_start),
                    "on_episode_step": tune.function(on_episode_step),
                    "on_episode_end": tune.function(on_episode_end),
                    "on_sample_end": tune.function(on_sample_end),
                    "on_train_result": tune.function(on_train_result),
                },
            },
        }
    })

    # verify custom metrics for integration tests
    custom_metrics = trials[0].last_result["custom_metrics"]
    print(custom_metrics)
    assert "pole_angle_mean" in custom_metrics
    assert "pole_angle_min" in custom_metrics
    assert "pole_angle_max" in custom_metrics
def tune_hyperparameters(model_type: str, experiment_group: str,
                         experiment_name: str):
    ray_num_cpus = 4
    num_cpus_per_process = 1
    num_gpus_per_process = 0.5

    ray.init(num_cpus=ray_num_cpus,
             ignore_reinit_error=True,
             include_webui=False)

    tuning_config_dir = root_dir('configs/%s/hp_tuning' % model_type)
    models_dir = root_dir('training/%s/hp_tuning/%s/%s' %
                          (model_type, experiment_group, experiment_name))
    ray_results_dir = root_dir('ray_results/%s' % experiment_group)

    # read the base config
    with open(os.path.join(tuning_config_dir, 'config.yaml')) as f:
        base_config = yaml.safe_load(f)

    # read mutations config
    with open(os.path.join(tuning_config_dir, 'mutations.yaml')) as f:
        mutations_grid = yaml.safe_load(f)

    # get mutated configs
    mutations = get_mutations(mutations_grid)

    # use only fraction of GPU
    session_config = None
    if num_gpus_per_process < 1:
        session_config = tf.ConfigProto()
        session_config.gpu_options.per_process_gpu_memory_fraction = num_gpus_per_process

    def tune_fn(tune_config, reporter):
        mutation = tune_config['mutation']

        # apply mutation to a base config
        config = mutate_config(base_config, mutation)

        # get model's directory
        model_dir = os.path.join(models_dir, generate_mutation_name(mutation))

        # save the config file to the model's directory
        write_model_config(model_dir, yaml.safe_dump(config))

        # train the model
        model_builder = create_builder(model_type, config)
        train(model_builder, model_dir, reporter, session_config)

    configuration = tune.Experiment(
        experiment_name,
        run=tune_fn,
        local_dir=ray_results_dir,
        config={
            'mutation': tune.grid_search(mutations),
        },
        trial_name_creator=tune.function(
            lambda trial: generate_mutation_name(trial.config['mutation'])),
        resources_per_trial={
            'cpu': num_cpus_per_process,
            'gpu': num_gpus_per_process,
        },
    )

    tune.run_experiments(configuration)
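
A hypothetical invocation of the tuner above; the model type and experiment names are placeholders.

if __name__ == "__main__":
    tune_hyperparameters(model_type="seq2seq",
                         experiment_group="lr_sweep",
                         experiment_name="baseline")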