def from_defaults(args):
    """Build the full argument set from registry defaults.

    If an agent is specified its defaults are used; otherwise the
    host/worker actor, learner, and experience defaults are merged.
    """
    if args.agent:
        agent_cls = R.lookup_agent(args.agent)
        agent_args = agent_cls.args
    else:
        h = R.lookup_actor(args.actor_host)
        w = R.lookup_actor(args.actor_worker)
        l = R.lookup_learner(args.learner)
        e = R.lookup_exp(args.exp)
        agent_args = {**h.args, **w.args, **l.args, **e.args}

    env_cls = R.lookup_env(args.env)
    rwdnorm_cls = R.lookup_reward_normalizer(args.rwd_norm)
    env_args = env_cls.args
    rwdnorm_args = rwdnorm_cls.args

    if args.custom_network:
        net_args = R.lookup_network(args.custom_network).args
    else:
        net_args = R.lookup_modular_args(args)

    args = DotDict({
        **args, **agent_args, **env_args, **rwdnorm_args, **net_args
    })
    return args
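# `DotDict` is referenced above but not defined in this file. A minimal
# sketch of what it is assumed to be: a dict subclass whose keys are also
# readable/writable as attributes, so `args.env` works like `args['env']`.
# The real implementation may differ.
class DotDict(dict):
    """Hypothetical stand-in for the imported DotDict."""

    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError:
            raise AttributeError(key)

    def __setattr__(self, key, value):
        self[key] = value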
def from_prompt(args):
    """Build the full argument set by interactively prompting for any
    values not already provided in ``args``.
    """
    if args.agent:
        agent_cls = R.lookup_agent(args.agent)
        agent_args = agent_cls.prompt(provided=args)
    else:
        h = R.lookup_actor(args.actor_host)
        w = R.lookup_actor(args.actor_worker)
        l = R.lookup_learner(args.learner)
        e = R.lookup_exp(args.exp)
        agent_args = {
            **h.prompt(provided=args),
            **w.prompt(provided=args),
            **l.prompt(provided=args),
            **e.prompt(provided=args),
        }

    env_cls = R.lookup_env(args.env)
    rwdnorm_cls = R.lookup_reward_normalizer(args.rwd_norm)
    env_args = env_cls.prompt(provided=args)
    rwdnorm_args = rwdnorm_cls.prompt(provided=args)

    if args.custom_network:
        net_args = R.lookup_network(args.custom_network).prompt()
    else:
        net_args = R.prompt_modular_args(args)

    args = DotDict({
        **args, **agent_args, **env_args, **rwdnorm_args, **net_args
    })
    return args
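# Sketch of the contract `from_prompt` assumes, not taken from this file:
# each registered class exposes default `args` and a `prompt(provided=None)`
# classmethod that asks the user only for values missing from `provided`.
# The names below (`MyLearner`, `lr`) are illustrative only.
class MyLearner:
    args = {'lr': 7e-4}

    @classmethod
    def prompt(cls, provided=None):
        provided = provided or {}
        resolved = {}
        for key, default in cls.args.items():
            if key in provided:
                resolved[key] = provided[key]
            else:
                raw = input('{} [{}]: '.format(key, default))
                # Fall back to the default when the user enters nothing.
                resolved[key] = type(default)(raw) if raw else default
        return resolved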
def __init__(self, args, log_id_dir, initial_step_count, rank):
    # offset the seed by rank so each worker's envs differ
    seed = args.seed \
        if rank == 0 \
        else args.seed + args.nb_env * rank
    print('Worker {} using seed {}'.format(rank, seed))

    # load saved registry classes
    REGISTRY.load_extern_classes(log_id_dir)

    # ENV
    engine = REGISTRY.lookup_engine(args.env)
    env_cls = REGISTRY.lookup_env(args.env)
    mgr_cls = REGISTRY.lookup_manager(args.manager)
    env_mgr = mgr_cls.from_args(args, engine, env_cls, seed=seed)

    # NETWORK
    # seed with the base seed so initial weights match across workers
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    output_space = REGISTRY.lookup_output_space(
        args.actor_worker, env_mgr.action_space)
    if args.custom_network:
        net_cls = REGISTRY.lookup_network(args.custom_network)
    else:
        net_cls = ModularNetwork
    net = net_cls.from_args(
        args,
        env_mgr.observation_space,
        output_space,
        env_mgr.gpu_preprocessor,
        REGISTRY
    )

    actor_cls = REGISTRY.lookup_actor(args.actor_worker)
    actor = actor_cls.from_args(args, env_mgr.action_space)
    builder = actor_cls.exp_spec_builder(
        env_mgr.observation_space,
        env_mgr.action_space,
        net.internal_space(),
        env_mgr.nb_env
    )
    exp = REGISTRY.lookup_exp(args.exp).from_args(args, builder)

    self.actor = actor
    self.exp = exp.to(device)
    self.nb_step = args.nb_step
    self.env_mgr = env_mgr
    self.nb_env = args.nb_env
    self.network = net.to(device)
    self.device = device
    self.initial_step_count = initial_step_count

    # TODO: this should be set to eval after some number of training steps
    self.network.train()

    # SETUP state variables for run
    self.step_count = self.initial_step_count
    self.global_step_count = self.initial_step_count
    self.ep_rewards = torch.zeros(self.nb_env)
    self.rank = rank

    self.obs = dtensor_to_dev(self.env_mgr.reset(), self.device)
    self.internals = listd_to_dlist([
        self.network.new_internals(self.device)
        for _ in range(self.nb_env)
    ])
    self.start_time = time()
    self._weights_synced = False
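# `dtensor_to_dev` and `listd_to_dlist` are imported helpers; minimal
# sketches of their assumed behavior follow (the actual implementations
# may differ):
def dtensor_to_dev(d, device):
    # Move every tensor in a dict of tensors onto the given device.
    return {k: v.to(device) for k, v in d.items()}


def listd_to_dlist(list_of_dicts):
    # Convert [{'k': a}, {'k': b}] into {'k': [a, b]}.
    return {k: [d[k] for d in list_of_dicts] for k in list_of_dicts[0]}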
def __init__(
        self,
        eval_actor,
        epoch_id,
        logger,
        log_id_dir,
        gpu_id,
        nb_episode,
        start,
        end,
        seed,
        manager
):
    self.log_dir_helper = log_dir_helper = LogDirHelper(log_id_dir)
    self.train_args = train_args = log_dir_helper.load_args()
    self.device = device = self._device_from_gpu_id(gpu_id)
    self.logger = logger

    # evaluate a single epoch if one is given, otherwise every logged
    # epoch in [start, end] (end == -1. means no upper bound)
    if epoch_id:
        epoch_ids = [epoch_id]
    else:
        epoch_ids = self.log_dir_helper.epochs()
        epoch_ids = filter(lambda eid: eid >= start, epoch_ids)
        if end != -1.:
            epoch_ids = filter(lambda eid: eid <= end, epoch_ids)
        epoch_ids = list(epoch_ids)
    self.epoch_ids = epoch_ids

    engine = REGISTRY.lookup_engine(train_args.env)
    env_cls = REGISTRY.lookup_env(train_args.env)
    mgr_cls = REGISTRY.lookup_manager(manager)
    # use the manager class looked up above; one env per episode
    self.env_mgr = env_mgr = mgr_cls.from_args(
        self.train_args, engine, env_cls, seed=seed, nb_env=nb_episode
    )

    if train_args.agent:
        agent = train_args.agent
    else:
        agent = train_args.actor_host
    output_space = REGISTRY.lookup_output_space(
        agent, env_mgr.action_space
    )
    actor_cls = REGISTRY.lookup_actor(eval_actor)
    self.actor = actor_cls.from_args(
        actor_cls.prompt(), env_mgr.action_space
    )
    self.network = self._init_network(
        train_args,
        env_mgr.observation_space,
        env_mgr.gpu_preprocessor,
        output_space,
        REGISTRY
    ).to(device)
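# `_device_from_gpu_id` is defined elsewhere on this class; a plausible
# sketch, under the assumption that a negative or missing gpu_id (or no
# CUDA) means CPU:
@staticmethod
def _device_from_gpu_id(gpu_id):
    if torch.cuda.is_available() and gpu_id is not None and gpu_id >= 0:
        return torch.device('cuda:{}'.format(gpu_id))
    return torch.device('cpu')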
def __init__(
        self,
        args,
        log_id_dir,
        initial_step_count,
        rank=0,
):
    # ARGS TO STATE VARS
    self._args = args
    self.nb_learners = args.nb_learners
    self.nb_workers = args.nb_workers
    self.rank = rank
    self.nb_step = args.nb_step
    self.nb_env = args.nb_env
    self.initial_step_count = initial_step_count
    self.epoch_len = args.epoch_len
    self.summary_freq = args.summary_freq
    self.nb_learn_batch = args.nb_learn_batch
    self.rollout_queue_size = args.rollout_queue_size
    # can be none if rank != 0
    self.log_id_dir = log_id_dir

    # load saved registry classes
    REGISTRY.load_extern_classes(log_id_dir)

    # ENV (temporary env to grab spaces and preprocessor, then close it)
    env_cls = REGISTRY.lookup_env(args.env)
    env = env_cls.from_args(args, 0)
    env_action_space, env_observation_space, env_gpu_preprocessor = \
        env.action_space, env.observation_space, env.gpu_preprocessor
    env.close()

    # NETWORK
    torch.manual_seed(args.seed)
    device = torch.device("cuda")  # ray handles gpus
    torch.backends.cudnn.benchmark = True
    output_space = REGISTRY.lookup_output_space(
        args.actor_worker, env_action_space)
    if args.custom_network:
        net_cls = REGISTRY.lookup_network(args.custom_network)
    else:
        net_cls = ModularNetwork
    net = net_cls.from_args(
        args,
        env_observation_space,
        output_space,
        env_gpu_preprocessor,
        REGISTRY
    )
    self.network = net.to(device)
    # TODO: this is a hack, remove once queuer puts rollouts on the correct device
    self.network.device = device
    self.device = device
    self.network.train()

    # OPTIMIZER
    def optim_fn(x):
        return torch.optim.RMSprop(x, lr=args.lr, eps=1e-5, alpha=0.99)

    if args.nb_learners > 1:
        self.optimizer = NCCLOptimizer(
            optim_fn, self.network, self.nb_learners)
    else:
        self.optimizer = optim_fn(self.network.parameters())

    # LEARNER / EXP
    # use the spaces saved above; env has already been closed
    rwd_norm = REGISTRY.lookup_reward_normalizer(
        args.rwd_norm).from_args(args)
    actor_cls = REGISTRY.lookup_actor(args.actor_host)
    builder = actor_cls.exp_spec_builder(
        env_observation_space,
        env_action_space,
        net.internal_space(),
        args.nb_env * args.nb_learn_batch
    )
    # worker-side exp spec builder, sized per single worker
    w_builder = REGISTRY.lookup_actor(args.actor_worker).exp_spec_builder(
        env_observation_space,
        env_action_space,
        net.internal_space(),
        args.nb_env
    )

    actor = actor_cls.from_args(args, env_action_space)
    learner = REGISTRY.lookup_learner(args.learner).from_args(args, rwd_norm)
    exp_cls = REGISTRY.lookup_exp(args.exp)

    self.actor = actor
    self.learner = learner
    self.exp = exp_cls.from_args(args, builder).to(device)

    # Rank 0 setup, load network/optimizer and create SummaryWriter/Saver
    if rank == 0:
        if args.load_network:
            self.network = self.load_network(self.network, args.load_network)
            print('Reloaded network from {}'.format(args.load_network))
        if args.load_optim:
            self.optimizer = self.load_optim(self.optimizer, args.load_optim)
            print('Reloaded optimizer from {}'.format(args.load_optim))
        print('Network parameters: ' + str(self.count_parameters(net)))

        self.summary_writer = SummaryWriter(log_id_dir)
        self.saver = SimpleModelSaver(log_id_dir)
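# `NCCLOptimizer` takes the optimizer *factory* rather than a built
# optimizer so it can construct the inner optimizer itself and all-reduce
# gradients across learners before each step. A simplified sketch of that
# assumed contract (not the real implementation), using torch.distributed
# with a process group assumed to be initialized elsewhere:
import torch.distributed as dist


class NCCLOptimizerSketch:
    def __init__(self, optim_fn, network, nb_learners):
        self.optimizer = optim_fn(network.parameters())
        self.network = network
        self.nb_learners = nb_learners

    def step(self):
        # Average gradients across all learners, then apply the update.
        for p in self.network.parameters():
            if p.grad is not None:
                dist.all_reduce(p.grad, op=dist.ReduceOp.SUM)
                p.grad.div_(self.nb_learners)
        self.optimizer.step()

    def zero_grad(self):
        self.optimizer.zero_grad()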