def launch_job(self, run_spec_path):
    """Stage the runspec under a fixed name, then hand it to the job launcher.

    The staged copy lets remote instances of rl_nexus consume the spec and
    lets users inspect it after the job has been launched.
    """
    staged_spec = 'launched_job_spec.yaml'
    copyfile(run_spec_path, staged_spec)
    copy_msg = '------- Runspec ({}) copied to {}'.format(run_spec_path, staged_spec)
    logger.write_line(copy_msg)
    self.job_launcher.launch_job(staged_spec)
    logger.write_line('------- Job launched. Use XT commands to access results.')
def __init__(self, spec_tree, run_spec_path, hp_handler):
    """Set up console logging, mirror the runspec, and prepare job launching.

    Args (assumed from usage -- confirm against callers):
        spec_tree:     spec/component container supporting [] lookup and
                       create_component().
        run_spec_path: path to the runspec file driving this run.
        hp_handler:    hyperparameter handler; writes the HP config file
                       when hyperparameter tuning is enabled.
    """
    self.spec_tree = spec_tree
    self.hp_handler = hp_handler

    # Get spec values.
    self.cuda = spec_tree['cuda']
    self.log_to_tensorboard = spec_tree['log_to_tensorboard']
    self.experiment_path = spec_tree['experiment_path']

    # Begin writing two copies of the console output:
    # one local, one under the experiment path.
    logger.add_output_file('console.txt')
    logger.add_output_file(
        os.path.join(self.experiment_path, 'console.txt'))

    # Check the experiment_path. Results outside ../results are not
    # mirrored to Azure Storage, so warn (but continue) in that case.
    if not self.experiment_path.startswith('../results'):
        logger.write_line(
            "WARNING: experiment_path \'{}\' (found in runspec) does not begin with '../results'. "
            "Job results will not be mirrored to Azure Storage.".format(
                self.experiment_path))

    # Copy the launched runspec to results folder.
    # Guarded so we never copy a file onto itself.
    dest = pjoin(self.experiment_path, os.path.basename(run_spec_path))
    if run_spec_path != dest:
        copyfile(run_spec_path, dest)

    # Is this app running as part of a launched job?
    # (XT sets XT_RUN_NAME inside launched runs.)
    in_job = os.getenv("XT_RUN_NAME")
    if in_job:
        # Yes. Don't create another job launcher.
        self.job_launcher = None
    else:
        # No. Try to instantiate a job launcher.
        self.job_launcher = spec_tree.create_component('job_launcher')
        if self.job_launcher and self.job_launcher.hp_tuning:
            # Hyperparameter tuning requested: emit the HP config file
            # that the launched runs will consume.
            self.hp_handler.write_hp_config_file()

    # Write the top portion of the repro spec tree to two files,
    # one in the rl_nexus dir, and the other in the experiment_path dir.
    local_repro_spec_path = 'repro_spec.yaml'
    exper_repro_spec_path = os.path.join(self.experiment_path,
                                         'repro_spec.yaml')
    utils.ensure_dir_exists(file=exper_repro_spec_path)
    self.repro_spec_paths = (local_repro_spec_path, exper_repro_spec_path)
    self.write_to_repro_spec(self.spec_tree, '', 'w')
    self.write_to_repro_spec('\nprocessing_stages:\n', '', 'a')
def __init__(self, spec_tree, device):
    """Configure this processing stage from its spec subtree.

    Reads stage settings, builds the environment and agent components,
    hooks up XT run logging when executing inside a launched XT job,
    loads any requested model weights, and prepares save paths.

    Args (assumed from usage -- confirm against callers):
        spec_tree: spec/component container supporting [] lookup and
                   create_component().
        device:    torch device the agent should run on.
    """
    self.spec_tree = spec_tree
    self.device = device

    # Get spec values
    self.enabled = spec_tree['enabled']
    self.save_model_to = resolve_path(spec_tree['save_model_to'])
    self.save_logs_to = resolve_path(spec_tree['save_logs_to'])
    self.max_iterations = spec_tree['max_iterations']
    self.iters_per_report = spec_tree['iters_per_report']
    self.get_action_from_env = spec_tree['get_action_from_env']
    self.train = spec_tree['train']
    self.render = spec_tree['render']

    self.model_load_paths_dict = {
        'load_model_from':     resolve_path(spec_tree['load_model_from']),
        'load_backbone_from':  resolve_path(spec_tree['load_backbone_from']),
        'load_core_from':      resolve_path(spec_tree['load_core_from']),
        'load_embedding_from': resolve_path(spec_tree['load_embedding_from']),
        'load_head_from':      resolve_path(spec_tree['load_head_from'])
    }
    self.model_save_paths_dict = {
        'save_model_to':     resolve_path(spec_tree['save_model_to']),
        'save_backbone_to':  resolve_path(spec_tree['save_backbone_to']),
        'save_core_to':      None,
        'save_embedding_to': None,
        'save_head_to':      resolve_path(spec_tree['save_head_to'])
    }

    # Environment component
    self.environment = spec_tree.create_component('environment')

    # Agent component (needs the env's spaces to size its model).
    self.agent = spec_tree.create_component('agent',
                                            self.environment.observation_space,
                                            self.environment.action_space,
                                            device)

    # XT related: XT_RUN_NAME is set only inside a launched XT run.
    self.xt_run_name = os.getenv("XT_RUN_NAME", None)
    self.xt_run = None
    if self.xt_run_name:
        from xtlib.run import Run as XTRun
        self.xt_run = XTRun()
        # TODO (in progress): log hyperparameter values to XT, e.g.
        #   self.xt_run.log_hparams(cf.get_hparam_dict())

    # BUG FIX: the original used "is not 'ray_loop'", which tests string
    # *identity* rather than equality. That is implementation-dependent
    # (it only worked if CPython happened to intern both strings) and is
    # a SyntaxWarning on Python 3.8+. Use != for string comparison.
    if self.agent.loop_type() != 'ray_loop':
        evaluation_num_episodes = spec_tree['evaluation_num_episodes']
        assert evaluation_num_episodes == 0, 'Only rllib\'s algorithm implementations support intra-stage evaluation.'

    self.agent.load_model(self.model_load_paths_dict, True)
    if self.save_model_to:
        ensure_dir_exists(file=self.save_model_to)
        logger.write_line("Saving models to {}".format(self.save_model_to))

    # Switch the agent into eval mode if requested
    if not self.train and not spec_tree['disable_eval']:
        if self.agent.model is not None:
            self.agent.model.eval()

    self.metric_data_list = []
def execute_processing_stages(self):
    """Run every enabled processing stage in order and return their results."""
    # Choose the device for the processing stages to use, failing over
    # to CPU when CUDA was requested but no GPU is available.
    cuda_usable = self.cuda and torch.cuda.is_available()
    if self.cuda and not torch.cuda.is_available():
        logger.write_line("WARNING: no GPU found! Failing over to cpu.")
    device = torch.device("cuda" if cuda_usable else "cpu")

    # Log the chosen hyperparameter values.
    self.hp_handler.log_chosen_values(logger)

    # Step through the processing stages.
    processing_stages = self.spec_tree['processing_stages']
    num_stages = len(processing_stages)
    stage_results = []
    for stage_index, list_item in enumerate(processing_stages):
        logger.stage_num = stage_index + 1
        st = '{} of {}'.format(logger.stage_num, num_stages)
        logger.write_line('Processing stage {}'.format(st))
        self.write_to_repro_spec(' # {}\n'.format(st), '', 'a')

        processing_stage = self.spec_tree.create_component(
            list_item['processing_stage'], device)
        if processing_stage:
            # Record this stage in the repro spec, then execute it.
            self.write_to_repro_spec(' - processing_stage:\n', '', 'a')
            self.write_to_repro_spec(processing_stage.spec_tree, ' ', 'a')
            self.write_to_repro_spec('\n', '', 'a')

            logger.write_line('{}: Started'.format(
                processing_stage.component_name))
            stage_results.append(processing_stage.execute())
            logger.write_line('{}: Completed\n'.format(
                processing_stage.component_name))
            # Reclaim stage resources before the next stage starts.
            gc.collect()
        else:
            st = "(not enabled)\n"
            logger.write_line('{}'.format(st))
            self.write_to_repro_spec(' # {}\n'.format(st), '', 'a')

    logger.finish_run(self.hp_handler.in_hp_search)
    logger.write_line('All processing stages completed.')
    return stage_results