Example #1
 def _output_records(self, step_name, records):
     """Dump records."""
     columns = ["worker_id", "performance", "desc"]
     outputs = []
     for record in records:
         record = record.serialize()
         _record = {}
         for key in columns:
             _record[key] = record[key]
         outputs.append(deepcopy(_record))
     data = pd.DataFrame(outputs)
     step_path = FileOps.join_path(TaskOps().local_output_path, step_name)
     FileOps.make_dir(step_path)
     _file = FileOps.join_path(step_path, "output.csv")
     try:
         data.to_csv(_file, index=False)
     except Exception:
         logging.error("Failed to save output file, file={}".format(_file))
     # Copy each worker's artifacts (descriptions, hyperparameters,
     # models and performance files) into the step output folder.
     for record in outputs:
         worker_id = record["worker_id"]
         worker_path = TaskOps().get_local_worker_path(step_name, worker_id)
         outputs_globs = []
         outputs_globs += glob.glob(FileOps.join_path(worker_path, "desc_*.json"))
         outputs_globs += glob.glob(FileOps.join_path(worker_path, "hps_*.json"))
         outputs_globs += glob.glob(FileOps.join_path(worker_path, "model_*"))
         outputs_globs += glob.glob(FileOps.join_path(worker_path, "performance_*.json"))
         for _file in outputs_globs:
             if os.path.isfile(_file):
                 FileOps.copy_file(_file, step_path)
             elif os.path.isdir(_file):
                 FileOps.copy_folder(_file, FileOps.join_path(step_path, os.path.basename(_file)))
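The loop at the end dispatches on whether each glob match is a file or a directory. The same pattern works with the standard library alone; a minimal standalone sketch (the name collect_artifacts and the default pattern list are illustrative, not part of Vega):

import glob
import os
import shutil

def collect_artifacts(worker_path, step_path, patterns=("desc_*.json", "model_*")):
    """Copy files or whole directories matching patterns into step_path."""
    for pattern in patterns:
        for path in glob.glob(os.path.join(worker_path, pattern)):
            if os.path.isfile(path):
                shutil.copy(path, step_path)
            elif os.path.isdir(path):
                # dirs_exist_ok needs Python 3.8+
                dst = os.path.join(step_path, os.path.basename(path))
                shutil.copytree(path, dst, dirs_exist_ok=True)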
Example #2
    def save_results(self):
        """Save the results of evolution contains the information of pupulation and elitism."""
        _path = FileOps.join_path(self.local_output_path, General.step_name)
        FileOps.make_dir(_path)
        arch_file = FileOps.join_path(_path, 'arch.txt')
        arch_child = FileOps.join_path(_path, 'arch_child.txt')
        sel_arch_file = FileOps.join_path(_path, 'selected_arch.npy')
        sel_arch = []
        with open(arch_file, 'a') as fw_a, open(arch_child, 'a') as fw_ac:
            writer_a = csv.writer(fw_a, lineterminator='\n')
            writer_ac = csv.writer(fw_ac, lineterminator='\n')
            writer_ac.writerow(
                ['Population Iteration: ' + str(self.evolution_count + 1)])
            for c in range(self.individual_num):
                writer_ac.writerow(
                    self._log_data(net_info_type='active_only',
                                   pop=self.pop[c],
                                   value=self.pop[c].fitness))

            writer_a.writerow(
                ['Population Iteration: ' + str(self.evolution_count + 1)])
            for c in range(self.elitism_num):
                writer_a.writerow(
                    self._log_data(net_info_type='active_only',
                                   pop=self.elitism[c],
                                   value=self.elit_fitness[c]))
                sel_arch.append(self.elitism[c].gene)
        sel_arch = np.stack(sel_arch)
        np.save(sel_arch_file, sel_arch)
        if self.backup_base_path is not None:
            FileOps.copy_folder(self.local_output_path, self.backup_base_path)
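The elitism genes written with np.save above can be reloaded as one array; a minimal check, assuming the file path from the snippet:

import numpy as np

# np.stack gives shape (elitism_num, *gene_shape).
sel_arch = np.load('selected_arch.npy')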
Example #3
    def _save_checkpoint(self, epoch, best=False):
        """Save model weights.

        :param epoch: current epoch
        :type epoch: int
        :param best: whether to also save these weights as the best model
        :type best: bool
        """
        save_dir = os.path.join(self.worker_path, str(epoch))
        FileOps.make_dir(save_dir)
        for name in self.model.model_names:
            if isinstance(name, str):
                save_filename = '%s_net_%s.pth' % (epoch, name)
                save_path = FileOps.join_path(save_dir, save_filename)
                net = getattr(self.model, 'net' + name)
                best_file = FileOps.join_path(self.worker_path,
                                              "model_{}.pth".format(name))
                if vega.is_gpu_device() and torch.cuda.is_available():
                    # torch.save(net.module.cpu().state_dict(), save_path)
                    torch.save(net.module.state_dict(), save_path)
                    # net.cuda()
                    if best:
                        torch.save(net.module.state_dict(), best_file)
                elif vega.is_npu_device():
                    torch.save(net.state_dict(), save_path)
                    if best:
                        torch.save(net.state_dict(), best_file)
                else:
                    torch.save(net.cpu().state_dict(), save_path)
                    if best:
                        torch.save(net.cpu().state_dict(), best_file)
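Restoring one of these per-network files is the usual state_dict round trip; a minimal sketch (the function name and CPU mapping are assumptions, not part of the snippet):

import torch

def load_net_weights(net, save_path):
    """Load weights written by _save_checkpoint into net, mapping to CPU first."""
    state = torch.load(save_path, map_location='cpu')
    net.load_state_dict(state)
    return net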
Example #4
    def before_train(self, logs=None):
        """Be called before the whole train process."""
        self.trainer.config.call_metrics_on_train = False
        self.cfg = self.trainer.config
        self.worker_id = self.trainer.worker_id
        self.local_base_path = self.trainer.local_base_path
        self.local_output_path = self.trainer.local_output_path

        self.result_path = FileOps.join_path(self.trainer.local_base_path,
                                             "result")
        FileOps.make_dir(self.result_path)
        self.logger_patch()
Example #5
    def update(self, record):
        """Update current performance into hpo score board.

        :param hps: hyper parameters need to update
        :param performance:  trainer performance
        """
        super().update(record)
        config_id = str(record.get('worker_id'))
        step_name = record.get('step_name')
        worker_result_path = self.get_local_worker_path(step_name, config_id)
        new_worker_result_path = FileOps.join_path(self.local_base_path, 'cache', config_id, 'checkpoint')
        FileOps.make_dir(worker_result_path)
        FileOps.make_dir(new_worker_result_path)
        # shutil.copytree requires a non-existent destination, so the
        # directory just created is removed before copying.
        if os.path.exists(new_worker_result_path):
            shutil.rmtree(new_worker_result_path)
        shutil.copytree(worker_result_path, new_worker_result_path)
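On Python 3.8+ the remove-then-copy dance can be shortened with dirs_exist_ok, though with a caveat: it overwrites matching files but keeps stale extras, so the rmtree variant above is the only way to get a guaranteed clean copy. A sketch with illustrative paths:

import shutil

# Refresh the cached checkpoint in place (Python 3.8+).
shutil.copytree('local/workers/1/checkpoint', 'local/cache/1/checkpoint',
                dirs_exist_ok=True)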
Example #6
def load_master_ip():
    """Get the ip and port that write in a system path.

    here will not download anything from S3.
    """
    temp_folder = TaskOps().temp_path
    FileOps.make_dir(temp_folder)
    file_path = os.path.join(temp_folder, 'ip_address.txt')
    if os.path.isfile(file_path):
        with open(file_path, 'r') as f:
            ip = f.readline().strip()
            port = f.readline().strip()
            logging.info("get write ip, ip={}, port={}".format(ip, port))
            return ip, port
    else:
        return None, None
Example #7
def save_master_ip(ip_address, port, args):
    """Write the ip and port in a system path.

    :param str ip_address: the IP address to write.
    :param str port: the port to write.
    :param argparse.Namespace args: parsed arguments that should
         contain `init_method`, `rank` and `world_size`.

    """
    temp_folder = TaskOps().temp_path
    FileOps.make_dir(temp_folder)
    file_path = os.path.join(temp_folder, 'ip_address.txt')
    logging.info("write ip, file path={}".format(file_path))
    with open(file_path, 'w') as f:
        f.write(ip_address + "\n")
        f.write(port + "\n")
Example #8
 def _saved_multi_checkpoint(self, epoch):
     """Save multi tasks checkpoint."""
     FileOps.make_dir(self.trainer.get_local_worker_path(),
                      self.trainer.multi_task)
     checkpoint_file = FileOps.join_path(
         self.trainer.get_local_worker_path(), self.trainer.multi_task,
         self.trainer.checkpoint_file_name)
     logging.debug("Start Save Multi Task Model, model_file=%s",
                   self.trainer.model_pickle_file_name)
     if vega.is_torch_backend():
         ckpt = {
             'epoch': epoch,
             'weight': self.trainer.model.state_dict(),
             'optimizer': self.trainer.optimizer.state_dict(),
             'lr_scheduler': self.trainer.lr_scheduler.state_dict(),
         }
         torch.save(ckpt, checkpoint_file)
     self.trainer.checkpoint_file = checkpoint_file
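A checkpoint saved this way can be restored key by key; a minimal sketch assuming the dict layout above (the function name is illustrative):

import torch

def resume_multi_task(trainer, checkpoint_file):
    """Restore model, optimizer and scheduler state; return the saved epoch."""
    ckpt = torch.load(checkpoint_file, map_location='cpu')
    trainer.model.load_state_dict(ckpt['weight'])
    trainer.optimizer.load_state_dict(ckpt['optimizer'])
    trainer.lr_scheduler.load_state_dict(ckpt['lr_scheduler'])
    return ckpt['epoch']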
Example #9
 def search(self):
     """Search an id and hps from hpo."""
     sample = self.hpo.propose()
     if sample is None:
         return None
     re_hps = {}
     sample = copy.deepcopy(sample)
     sample_id = sample.get('config_id')
     trans_para = sample.get('configs')
     rung_id = sample.get('rung_id')
     all_para = sample.get('all_configs')
     re_hps['dataset.transforms'] = [{'type': 'PBATransformer', 'para_array': trans_para, 'all_para': all_para,
                                      'operation_names': self.operation_names}]
     checkpoint_path = FileOps.join_path(self.local_base_path, 'worker', 'cache', str(sample_id), 'checkpoint')
     FileOps.make_dir(checkpoint_path)
     # make_dir guarantees the path exists, so the branch below always
     # wires the checkpoint path into the trainer hyperparameters.
     if os.path.exists(checkpoint_path):
         re_hps['trainer.checkpoint_path'] = checkpoint_path
     if 'epoch' in sample:
         re_hps['trainer.epochs'] = sample.get('epoch')
     return dict(worker_id=sample_id, encoded_desc=re_hps, rung_id=rung_id)
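The keys of encoded_desc are dotted paths such as 'trainer.epochs'. One plausible way a consumer expands them into a nested config (illustrative only, not Vega's actual resolver):

def set_by_dotted_path(config, path, value):
    """Expand 'a.b.c' into nested dicts: config['a']['b']['c'] = value."""
    keys = path.split('.')
    node = config
    for key in keys[:-1]:
        node = node.setdefault(key, {})
    node[keys[-1]] = value

config = {}
set_by_dotted_path(config, 'trainer.epochs', 10)
assert config == {'trainer': {'epochs': 10}}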
Example #10
 def _init_next_rung(self):
     """Init next rung to search."""
     next_rung_id = self.rung_id + 1
     if next_rung_id >= self.total_rungs:
         self.rung_id = self.rung_id + 1
         return
     for i in range(self.config_count):
         self.all_config_dict[i][next_rung_id] = self.all_config_dict[i][
             self.rung_id]
     current_score = []
     for i in range(self.config_count):
         current_score.append((i, self.best_score_dict[self.rung_id][i]))
     current_score.sort(key=lambda item: item[1])
     # Truncation selection: the four best configs overwrite the four
     # worst ones (current_score is sorted in ascending order).
     for i in range(4):
         better_id = current_score[self.config_count - 1 - i][0]
         worse_id = current_score[i][0]
         better_worker_result_path = FileOps.join_path(
             self.local_base_path, 'cache', 'pba', str(better_id),
             'checkpoint')
         FileOps.make_dir(better_worker_result_path)
         worse_worker_result_path = FileOps.join_path(
             self.local_base_path, 'cache', 'pba', str(worse_id),
             'checkpoint')
         FileOps.make_dir(worse_worker_result_path)
         shutil.rmtree(worse_worker_result_path)
         shutil.copytree(better_worker_result_path,
                         worse_worker_result_path)
         self.all_config_dict[worse_id] = self.all_config_dict[better_id]
         policy_unchange = self.all_config_dict[worse_id][next_rung_id]
         policy_changed = self.explore(policy_unchange)
         self.all_config_dict[worse_id][next_rung_id] = policy_changed
     for config_id in range(self.config_count):
         self.best_score_dict[next_rung_id][config_id] = -1 * float('inf')
         tmp_row_data = {
             'config_id': config_id,
             'rung_id': next_rung_id,
             'status': StatusType.WAITTING
         }
         self._add_to_board(tmp_row_data)
     self.rung_id = self.rung_id + 1
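This is the exploit/explore step of population based training: the checkpoints and schedules of the best configurations replace those of the worst, and explore() then perturbs each copied schedule so the population keeps diversifying. As in Example #5, the destination is removed before shutil.copytree because copytree refuses to write into an existing directory.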
Example #11
 def __init__(self, update_func=None):
     """Init master attrs, setup and start dask distributed cluster and local multiprocess pool."""
     self._checkout_cluster_existed()
     self.cfg = General()
     self.task_count = 0
     self.eval_count = General.worker.eval_count
     self.__master_path__ = FileOps.join_path(TaskOps().temp_path, "master")
     FileOps.make_dir(self.__master_path__)
     self.dask_env = DaskEnv(General.env,
                             self.__master_path__,
                             General.devices_per_trainer,
                             TaskOps().temp_path)
     status = self.dask_env.start()
     if not status or not self.dask_env.is_master:
         sys.exit(0)
     self._start_cluster()
     self.t_queue = Queue()
     self.update_func = update_func
     self._thread_runing = True
     self._lock = Lock()
     self._thread = self._run_monitor_thread()
     return
Example #12
    def search(self):
        """Search an id and hps from hpo."""
        sample = self.hpo.propose()
        if sample is None:
            return None
        re_hps = {}
        sample = copy.deepcopy(sample)
        sample_id = sample.get('config_id')
        cur_configs = sample.get('configs')
        all_configs = sample.get("all_configs")
        rung_id = sample.get('rung_id')

        checkpoint_path = FileOps.join_path(self.local_base_path, 'cache', str(sample_id), 'checkpoint')
        FileOps.make_dir(checkpoint_path)
        # make_dir guarantees the path exists, so the checkpoint path is
        # always passed on to the trainer.
        if os.path.exists(checkpoint_path):
            re_hps['trainer.checkpoint_path'] = checkpoint_path
        if 'epoch' in sample:
            re_hps['trainer.epochs'] = sample.get('epoch')
        re_hps.update(cur_configs)
        re_hps['trainer.all_configs'] = all_configs
        logging.info("Current rung [ {} /{}] ".format(rung_id, self.config.policy.total_rungs))
        return dict(worker_id=sample_id, encoded_desc=re_hps, rung_id=rung_id)
Example #13
 def _save_best_model(self):
     """Save best model."""
     if vega.is_torch_backend():
         torch.save(self.trainer.model.state_dict(),
                    self.trainer.weights_file)
     elif vega.is_tf_backend():
         worker_path = self.trainer.get_local_worker_path()
         model_id = "model_{}".format(self.trainer.worker_id)
         weights_folder = FileOps.join_path(worker_path, model_id)
         FileOps.make_dir(weights_folder)
         checkpoint_file = tf.train.latest_checkpoint(worker_path)
         # latest_checkpoint returns only a path prefix; the real data lives
         # in "<prefix>.index" and "<prefix>.data-*" shards, hence the glob.
         ckpt_globs = glob.glob("{}.*".format(checkpoint_file))
         for _file in ckpt_globs:
             FileOps.copy_file(
                 _file,
                 FileOps.join_path(weights_folder,
                                   os.path.split(_file)[-1]))
         FileOps.copy_file(FileOps.join_path(worker_path, 'checkpoint'),
                           weights_folder)
         if self.trainer.save_ext_model:
             self._save_pb_model(weights_folder, model_id)
             self.trainer.ext_model = FileOps.join_path(
                 weights_folder, '{}.pb'.format(model_id))
     elif vega.is_ms_backend():
         worker_path = self.trainer.get_local_worker_path()
         save_path = os.path.join(
             worker_path, "model_{}.ckpt".format(self.trainer.worker_id))
         for file in os.listdir(worker_path):
             if file.startswith("CKP") and file.endswith(".ckpt"):
                 self.weights_file = FileOps.join_path(worker_path, file)
                 os.rename(self.weights_file, save_path)
         if self.trainer.save_ext_model:
             model_id = "model_{}".format(self.trainer.worker_id)
             self._save_om_model(worker_path, model_id)
             self.trainer.ext_model = FileOps.join_path(
                 worker_path, '{}.om'.format(model_id))
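Each backend ends up with its own export format: PyTorch writes a plain state_dict, TensorFlow copies the sharded checkpoint files (tf.train.latest_checkpoint returns only a path prefix; the data lives in "<prefix>.index" and "<prefix>.data-*" shards) and can freeze a .pb graph, and MindSpore renames its .ckpt file and can additionally convert it to an .om offline model for Ascend devices.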