Example #1
    def launch_job(self, run_spec_path):
        # Copy the runspec to rl_nexus so that remote instances of rl_nexus
        # can use it, and so that users can inspect it after job launch.
        job_spec_name = 'launched_job_spec.yaml'
        copyfile(run_spec_path, job_spec_name)
        logger.write_line('------- Runspec ({}) copied to {}'.format(
            run_spec_path, job_spec_name))
        self.job_launcher.launch_job(job_spec_name)
        logger.write_line(
            '------- Job launched. Use XT commands to access results.')
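A minimal standalone sketch of the copy step above, assuming the `copyfile` in use is `shutil.copyfile` (the file names here are illustrative):

    from shutil import copyfile

    # shutil.copyfile copies file contents (not metadata or permissions),
    # overwrites an existing destination, and raises SameFileError when
    # source and destination are the same file.
    copyfile('my_run_spec.yaml', 'launched_job_spec.yaml')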
Example #2
    def __init__(self, spec_tree, run_spec_path, hp_handler):
        self.spec_tree = spec_tree
        self.hp_handler = hp_handler

        # Get spec values.
        self.cuda = spec_tree['cuda']
        self.log_to_tensorboard = spec_tree['log_to_tensorboard']
        self.experiment_path = spec_tree['experiment_path']

        # Begin writing two copies of the console output.
        logger.add_output_file('console.txt')
        logger.add_output_file(
            os.path.join(self.experiment_path, 'console.txt'))

        # Check the experiment_path.
        if not self.experiment_path.startswith('../results'):
            logger.write_line(
                "WARNING: experiment_path '{}' (found in runspec) does not "
                "begin with '../results'. Job results will not be mirrored "
                "to Azure Storage.".format(self.experiment_path))

        # Copy the launched runspec to the results folder (skip the copy when
        # the source and destination are the same path).
        dest = pjoin(self.experiment_path, os.path.basename(run_spec_path))
        if run_spec_path != dest:
            copyfile(run_spec_path, dest)

        # Is this app running as part of a launched job?
        in_job = os.getenv("XT_RUN_NAME")
        if in_job:
            # Yes. Don't create another job launcher.
            self.job_launcher = None
        else:
            # No. Try to instantiate a job launcher.
            self.job_launcher = spec_tree.create_component('job_launcher')
            if self.job_launcher and self.job_launcher.hp_tuning:
                self.hp_handler.write_hp_config_file()

        # Write the top portion of the repro spec tree to two files,
        # one in the rl_nexus dir, and the other in the experiment_path dir.
        local_repro_spec_path = 'repro_spec.yaml'
        exper_repro_spec_path = os.path.join(self.experiment_path,
                                             'repro_spec.yaml')
        utils.ensure_dir_exists(file=exper_repro_spec_path)
        self.repro_spec_paths = (local_repro_spec_path, exper_repro_spec_path)
        self.write_to_repro_spec(self.spec_tree, '', 'w')
        self.write_to_repro_spec('\nprocessing_stages:\n', '', 'a')
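The `XT_RUN_NAME` check above is how the app decides whether it is already running inside a launched XT job; a self-contained sketch of the same pattern (the printed messages are illustrative):

    import os

    # XT sets XT_RUN_NAME in the environment of launched runs; when running
    # locally the variable is unset and os.getenv returns None.
    if os.getenv('XT_RUN_NAME'):
        print('Running inside a launched job; no new job launcher needed.')
    else:
        print('Running locally; a job launcher may be created.')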
Example #3
    def __init__(self, spec_tree, device):
        self.spec_tree = spec_tree
        self.device    = device

        # Get spec values
        self.enabled              = spec_tree['enabled']
        self.save_model_to        = resolve_path(spec_tree['save_model_to'])
        self.save_logs_to         = resolve_path(spec_tree['save_logs_to'])
        self.max_iterations       = spec_tree['max_iterations']
        self.iters_per_report     = spec_tree['iters_per_report']
        self.get_action_from_env  = spec_tree['get_action_from_env']
        self.train                = spec_tree['train']
        self.render               = spec_tree['render']
        self.model_load_paths_dict = {
            'load_model_from':      resolve_path(spec_tree['load_model_from']),
            'load_backbone_from':   resolve_path(spec_tree['load_backbone_from']),
            'load_core_from':       resolve_path(spec_tree['load_core_from']),
            'load_embedding_from':  resolve_path(spec_tree['load_embedding_from']),
            'load_head_from':       resolve_path(spec_tree['load_head_from'])
        }
        self.model_save_paths_dict = {
            'save_model_to':        self.save_model_to,
            'save_backbone_to':     resolve_path(spec_tree['save_backbone_to']),
            'save_core_to':         None,
            'save_embedding_to':    None,
            'save_head_to':         resolve_path(spec_tree['save_head_to'])
        }

        # Environment component
        self.environment = spec_tree.create_component('environment')
        # Agent component
        self.agent = spec_tree.create_component('agent',
                                                self.environment.observation_space,
                                                self.environment.action_space,
                                                device)
        # XT-related setup.
        self.xt_run_name = os.getenv("XT_RUN_NAME", None)
        self.xt_run = None
        if self.xt_run_name:
            from xtlib.run import Run as XTRun
            self.xt_run = XTRun()
            # Log hyperparameter values to XT (work in progress):
            # hd = cf.get_hparam_dict()
            # self.xt_run.log_hparams(hd)

        if self.agent.loop_type() != 'ray_loop':
            evaluation_num_episodes = spec_tree['evaluation_num_episodes']
            assert evaluation_num_episodes == 0, \
                "Only rllib's algorithm implementations support intra-stage evaluation."
            self.agent.load_model(self.model_load_paths_dict, True)

        if self.save_model_to:
            ensure_dir_exists(file=self.save_model_to)
            logger.write_line("Saving models to {}".format(self.save_model_to))

        # Switch the agent into eval mode if requested
        if not self.train and not spec_tree['disable_eval']:
            if self.agent.model is not None:
                self.agent.model.eval()

        self.metric_data_list = []
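The eval-mode switch at the end relies on standard PyTorch semantics; a self-contained sketch with a toy model:

    import torch.nn as nn

    # model.eval() flips modules such as Dropout and BatchNorm into inference
    # behavior; it does not disable gradient tracking (that is torch.no_grad()).
    model = nn.Sequential(nn.Linear(4, 4), nn.Dropout(p=0.5))
    model.eval()
    print(model.training)  # False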
Example #4
    def execute_processing_stages(self):
        # Choose the device for the processing stages to use.
        if self.cuda and not torch.cuda.is_available():
            logger.write_line("WARNING: no GPU found! Failing over to cpu.")
        device = torch.device(
            "cuda" if self.cuda and torch.cuda.is_available() else "cpu")

        # Log the chosen hyperparameter values.
        self.hp_handler.log_chosen_values(logger)

        # Step through the processing stages.
        processing_stages = self.spec_tree['processing_stages']
        stage_results = []
        for idx, list_item in enumerate(processing_stages):
            logger.stage_num = idx + 1

            st = '{} of {}'.format(logger.stage_num, len(processing_stages))
            logger.write_line('Processing stage {}'.format(st))
            self.write_to_repro_spec('  # {}\n'.format(st), '', 'a')

            processing_stage = self.spec_tree.create_component(
                list_item['processing_stage'], device)
            if not processing_stage:
                st = "(not enabled)\n"
                logger.write_line(st)
                self.write_to_repro_spec('  # {}\n'.format(st), '', 'a')
            else:
                # Write this processing stage to the repro spec.
                self.write_to_repro_spec('  - processing_stage:\n', '', 'a')
                self.write_to_repro_spec(processing_stage.spec_tree, '      ',
                                         'a')
                self.write_to_repro_spec('\n', '', 'a')

                # Execute.
                logger.write_line('{}:  Started'.format(
                    processing_stage.component_name))
                stage_result = processing_stage.execute()
                stage_results.append(stage_result)
                logger.write_line('{}:  Completed\n'.format(
                    processing_stage.component_name))
                gc.collect()
        logger.finish_run(self.hp_handler.in_hp_search)
        logger.write_line('All processing stages completed.')
        return stage_results
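The device selection at the top of this method is a common failover pattern; a self-contained sketch, with `want_cuda` standing in for the `cuda` spec value:

    import torch

    # Request CUDA but fall back to CPU, warning once, when no GPU is visible.
    want_cuda = True
    if want_cuda and not torch.cuda.is_available():
        print('WARNING: no GPU found! Failing over to cpu.')
    device = torch.device('cuda' if want_cuda and torch.cuda.is_available() else 'cpu')
    print(device)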