예제 #1
0
 def _output_records(self, step_name, records):
     """Write a step's records to output.csv and gather worker artifacts into the step folder."""
     columns = ["worker_id", "performance", "desc"]
     rows = []
     for rec in records:
         serialized = rec.serialize()
         # deepcopy protects against later mutation of nested desc structures.
         rows.append(deepcopy({col: serialized[col] for col in columns}))
     step_path = FileOps.join_path(TaskOps().local_output_path, step_name)
     FileOps.make_dir(step_path)
     csv_file = FileOps.join_path(step_path, "output.csv")
     try:
         pd.DataFrame(rows).to_csv(csv_file, index=False)
     except Exception:
         logging.error("Failed to save output file, file={}".format(csv_file))
     # Copy each worker's description/hps/model/performance artifacts.
     patterns = ("desc_*.json", "hps_*.json", "model_*", "performance_*.json")
     for row in rows:
         worker_path = TaskOps().get_local_worker_path(step_name, row["worker_id"])
         matches = []
         for pattern in patterns:
             matches.extend(glob.glob(FileOps.join_path(worker_path, pattern)))
         for match in matches:
             if os.path.isfile(match):
                 FileOps.copy_file(match, step_path)
             elif os.path.isdir(match):
                 FileOps.copy_folder(match, FileOps.join_path(step_path, os.path.basename(match)))
예제 #2
0
    def save_report(self, records):
        """Save report to `reports.json`."""
        try:
            report_file = FileOps.join_path(TaskOps().local_output_path,
                                            "reports.json")
            FileOps.make_base_dir(report_file)

            # Summaries for every configured step; unstarted steps get a stub.
            steps_info = []
            for name in self.step_names:
                if name in self.steps:
                    steps_info.append(self.steps[name])
                else:
                    steps_info.append({
                        "step_name": name,
                        "status": Status.unstarted
                    })
            data = {"_steps_": steps_info}

            # Group records under their step name.
            for record in records:
                data.setdefault(record.step_name, []).append(record.to_dict())

            with open(report_file, "w") as f:
                json.dump(data, f, indent=4, cls=JsonEncoder)
        except Exception:
            # Best-effort persistence: log and carry on.
            logging.warning(traceback.format_exc())
예제 #3
0
 def restore(cls):
     """Restore history records and report instances from the pickled `.reports` file."""
     report_file = os.path.join(TaskOps().step_path, ".reports")
     if not os.path.exists(report_file):
         return
     # NOTE: pickle is only safe here because `.reports` is written by this process.
     with open(report_file, "rb") as handle:
         saved = pickle.load(handle)
     cls._hist_records = saved[0]
     cls.__instances__ = saved[1]
예제 #4
0
 def pickle_report(self, records, report_instance):
     """Pickle report to `.reports`."""
     try:
         dump_file = os.path.join(TaskOps().step_path, ".reports")
         with open(dump_file, "wb") as handle:
             pickle.dump([records, report_instance], handle,
                         protocol=pickle.HIGHEST_PROTOCOL)
     except Exception:
         # Persistence is best-effort; never let a dump failure kill the caller.
         logging.warning(traceback.format_exc())
예제 #5
0
 def __init__(self, name=None, **kwargs):
     """Initialize pipestep."""
     self.task = TaskOps()
     # Fall back to a generic name when none is supplied.
     self.name = name or "pipestep"
     self.status = Status.unstarted
     self.start_time = datetime.now()
     self.end_time = None
     self.message = None
     self.num_epochs = None
     self.num_models = None
예제 #6
0
 def _get_current_step_records(self):
     """Build the list of report records that feed the current pipeline step."""
     step_name = self.task.step_name
     models_folder = PipeStepConfig.pipe_step.get("models_folder")
     cur_index = PipelineConfig.steps.index(step_name)
     if cur_index < 1 and not models_folder:
         # First step with no explicit models folder: start from a blank record.
         records = [ReportRecord(step_name, 0)]
     else:
         if not models_folder:
             # Default to the previous step's output folder.
             models_folder = FileOps.join_path(
                 TaskOps().local_output_path,
                 PipelineConfig.steps[cur_index - 1])
         models_folder = models_folder.replace(
             "{local_base_path}", TaskOps().local_base_path)
         records = ReportServer().load_records_from_model_folder(models_folder)
     logging.debug("Records: {}".format(records))
     # Re-tag every record with the current step name.
     for record in records:
         record.step_name = step_name
     return records
예제 #7
0
 def _show_pipeline_info(self):
     """Log the end-of-pipeline summary: task id, output folder, timings and report."""
     separator = "-" * 48
     logging.info(separator)
     logging.info("  Pipeline end.")
     logging.info("")
     logging.info("  task id: {}".format(General.task.task_id))
     logging.info("  output folder: {}".format(TaskOps().local_output_path))
     logging.info("")
     self._show_step_time()
     logging.info("")
     self._show_report()
     logging.info(separator)
예제 #8
0
 def __init__(self, update_func=None):
     """Init master attrs, setup and start dask distributed cluster and local multiprocess pool.

     :param update_func: optional callback invoked by the monitor thread
         (stored as-is; semantics defined by the caller).
     """
     # Guard against starting on top of an already-running cluster.
     self._checkout_cluster_existed()
     self.cfg = General()
     self.task_count = 0
     self.eval_count = General.worker.eval_count
     # Scratch directory for dask master bookkeeping files.
     self.__master_path__ = FileOps.join_path(TaskOps().temp_path, "master")
     FileOps.make_dir(self.__master_path__)
     self.dask_env = DaskEnv(General.env,
                             self.__master_path__,
                             General.devices_per_trainer,
                             TaskOps().temp_path)
     status = self.dask_env.start()
     # Only the master process proceeds; a failed start or a slave role exits here.
     if not status or not self.dask_env.is_master:
         sys.exit(0)
     self._start_cluster()
     self.t_queue = Queue()
     self.update_func = update_func
     self._thread_runing = True  # flag polled by the monitor thread (sic: "runing")
     self._lock = Lock()
     # Monitor thread is started last, once queue/lock/flags are in place.
     self._thread = self._run_monitor_thread()
     return
예제 #9
0
 def _get_search_space_list(self):
     """Get search space list from models folder."""
     models_folder = PipeStepConfig.pipe_step.get("models_folder")
     if not models_folder:
         # No folder configured: mark the list as unavailable.
         self.search_space_list = None
         return
     models_folder = models_folder.replace("{local_base_path}", TaskOps().local_base_path)
     self.search_space_list = []
     # Load every JSON description found in the models folder.
     for desc_file in glob.glob(FileOps.join_path(models_folder, "*.json")):
         with open(desc_file) as handle:
             self.search_space_list.append(json.load(handle))
예제 #10
0
    def __init__(self):
        """Initialize Visual callback."""
        super(VisualCallBack, self).__init__()
        self.priority = 290
        self._archive_root = TaskOps().local_visual_path
        self._fix_path = None
        self.summary = None
        self.writer = None
        self.input = None
        self.model = None
        # Metrics tracked by this callback, all starting at zero.
        self._need_keys = {"loss_avg", "lr"}
        self._info = dict.fromkeys(self._need_keys, 0.)
예제 #11
0
 def _show_report(self):
     """Log worker_id/performance pairs from the last step's output.csv.

     Best-effort: if the CSV is missing, unreadable, or lacks usable
     columns, an informational message is logged and nothing is raised.
     """
     performance_file = FileOps.join_path(TaskOps().local_output_path,
                                          self.steps[-1].name, "output.csv")
     try:
         data = pd.read_csv(performance_file)
     except Exception:
         # Fixed broken-English message (was "is not existed or empty").
         logging.info("  result file output.csv does not exist or is empty")
         return
     # Need at least the worker_id and performance columns and one row.
     if data.shape[1] < 2 or data.shape[0] == 0:
         logging.info("  result file output.csv is empty")
         return
     logging.info("  result:")
     # Round-trip through JSON to get plain dicts keyed by row index.
     data = json.loads(data.to_json())
     for key in data["worker_id"].keys():
         logging.info("  {:>3s}:  {}".format(str(data["worker_id"][key]),
                                             data["performance"][key]))
예제 #12
0
 def _start_cluster(self):
     """Set and start dask distributed cluster."""
     self.md = ClusterDaskDistributor(self.dask_env.master_address)
     self.client = self.md.get_client()
     local_host = None
     # Resolve this node's hostname from the batch scheduler environment, if present.
     if "BATCH_CURRENT_HOST" in os.environ:
         local_host = os.environ["BATCH_CURRENT_HOST"]
     elif "BATCH_CUSTOM0_HOSTS" in os.environ:
         local_host = os.environ["BATCH_CUSTOM0_HOSTS"]
     # Stash CUDA_VISIBLE_DEVICES so it can be restored after worker-plugin
     # registration (which presumably may alter it — verify against WorkerEnv).
     if "CUDA_VISIBLE_DEVICES" in os.environ:
         os.environ["ORIGIN_CUDA_VISIBLE_DEVICES"] = os.environ["CUDA_VISIBLE_DEVICES"]
     self._remove_worker_number_file()
     plugin = WorkerEnv(self.dask_env.slave_proc_num,
                        self.dask_env.slave_device_num_per_proc,
                        local_host,
                        os.getpid(),
                        TaskOps().temp_path)
     self.client.register_worker_plugin(plugin)
     # Restore the stashed device list; if CUDA_VISIBLE_DEVICES appeared
     # without a stashed original, drop it again.
     if "ORIGIN_CUDA_VISIBLE_DEVICES" in os.environ:
         os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["ORIGIN_CUDA_VISIBLE_DEVICES"]
     if "CUDA_VISIBLE_DEVICES" in os.environ and "ORIGIN_CUDA_VISIBLE_DEVICES" not in os.environ:
         del os.environ["CUDA_VISIBLE_DEVICES"]
     return
예제 #13
0
 def _remove_worker_number_file(self):
     """Delete any stale `.*worker_number` marker files in the task temp path."""
     pattern = os.path.join(TaskOps().temp_path, ".*worker_number")
     for stale_file in glob.glob(pattern):
         os.remove(stale_file)
예제 #14
0
 def backup_output_path(self):
     """Back up output to local path."""
     backup_path = TaskOps().backup_base_path
     # No backup location configured: nothing to do.
     if backup_path is not None:
         FileOps.copy_folder(TaskOps().local_output_path, backup_path)