Exemplo n.º 1
0
    def _save_checkpoint(self, epoch, best=False):
        """Write the weights of every named sub-network to disk.

        One ``<epoch>_net_<name>.pth`` file per network is stored in a folder
        named after the epoch under ``self.worker_path``. When ``best`` is set,
        the same weights are additionally stored as ``model_<name>.pth``
        directly under ``self.worker_path``.

        :param epoch: current epoch
        :type epoch: int
        :param best: also store the weights as the best model
        :type best: bool
        """
        epoch_dir = os.path.join(self.worker_path, str(epoch))
        FileOps.make_dir(epoch_dir)
        for name in self.model.model_names:
            # Entries that are not strings are ignored.
            if not isinstance(name, str):
                continue
            epoch_file = FileOps.join_path(epoch_dir,
                                           '%s_net_%s.pth' % (epoch, name))
            net = getattr(self.model, 'net' + name)
            best_file = FileOps.join_path(self.worker_path,
                                          "model_{}.pth".format(name))
            on_gpu = self.cfg.cuda and torch.cuda.is_available()
            if on_gpu:
                # NOTE(review): this path reads ``net.module``, so the network
                # is presumably wrapped (e.g. DataParallel) -- confirm.
                weights = net.module.state_dict()
            else:
                weights = net.cpu().state_dict()
            torch.save(weights, epoch_file)
            if best:
                torch.save(weights, best_file)
Exemplo n.º 2
0
Arquivo: pba.py Projeto: ylfzr/vega
 def _init_next_rung(self):
     """Init next rung to search.

     Every config first inherits its current policy for the next rung. The
     configs are then ranked by their best score of the current rung; each of
     the worst configs receives the checkpoint and the config record of its
     counterpart among the best configs, and its next-rung policy is replaced
     by ``self.explore`` applied to the inherited one. Finally the scores of
     the next rung are reset and a waiting row is added to the board for every
     config.

     When the next rung would be past ``self.total_rungs`` only ``self.rung_id``
     is advanced.
     """
     # Function-scope import so the block does not rely on the module header.
     import copy

     next_rung_id = self.rung_id + 1
     if next_rung_id >= self.total_rungs:
         self.rung_id = next_rung_id
         return
     for i in range(self.config_count):
         self.all_config_dict[i][next_rung_id] = self.all_config_dict[i][self.rung_id]
     # Ascending by score: worst configs first, best configs last.
     ranking = sorted(
         ((i, self.best_score_dict[self.rung_id][i])
          for i in range(self.config_count)),
         key=lambda item: item[1])
     # At most 4 pairs are exchanged; the cap keeps a config from being paired
     # with itself (or indexing past the ranking) when fewer than 8 configs
     # exist.
     exploit_count = min(4, self.config_count // 2)
     for i in range(exploit_count):
         better_id = ranking[self.config_count - 1 - i][0]
         worse_id = ranking[i][0]
         better_worker_result_path = FileOps.join_path(self.local_base_path, 'cache', 'pba',
                                                       str(better_id), 'checkpoint')
         FileOps.make_dir(better_worker_result_path)
         worse_worker_result_path = FileOps.join_path(self.local_base_path, 'cache', 'pba',
                                                      str(worse_id), 'checkpoint')
         FileOps.make_dir(worse_worker_result_path)
         # copytree creates the destination itself, so the old one is removed.
         shutil.rmtree(worse_worker_result_path)
         shutil.copytree(better_worker_result_path, worse_worker_result_path)
         # Copy instead of aliasing: with a shared object the explored policy
         # written below would also overwrite the better config's policy.
         self.all_config_dict[worse_id] = copy.deepcopy(self.all_config_dict[better_id])
         policy_unchange = self.all_config_dict[worse_id][next_rung_id]
         policy_changed = self.explore(policy_unchange)
         self.all_config_dict[worse_id][next_rung_id] = policy_changed
     for config_id in range(self.config_count):
         self.best_score_dict[next_rung_id][config_id] = -1 * float('inf')
         tmp_row_data = {'config_id': config_id,
                         'rung_id': next_rung_id,
                         'status': StatusType.WAITTING}
         self._add_to_board(tmp_row_data)
     self.rung_id = next_rung_id
Exemplo n.º 3
0
def dump_model_visual_info(trainer, epoch, model, inputs):
    """Write the model graph to a tensorboard event folder.

    Nothing is written unless the trainer's visual flag is ``True`` and
    ``epoch`` is a multiple of the trainer's interval. The events go to a
    sub-folder of the trainer's output path named after the worker id. An
    exception raised while writing the graph is logged instead of propagated.

    :param trainer: trainer whose settings are read through ``_get_trainer_info``.
    :type trainer: object
    :param epoch: current epoch.
    :type epoch: int
    :param model: model handed to ``SummaryWriter.add_graph``.
    :type model: model.
    :param inputs: input data handed to ``add_graph`` together with the model.
    :type inputs: data.

    """
    _, visual, interval, _, worker_id, output_path = _get_trainer_info(trainer)
    if visual is not True or epoch % interval != 0:
        return
    event_dir = FileOps.join_path(output_path, str(worker_id))
    FileOps.make_dir(event_dir)
    try:
        with SummaryWriter(event_dir) as writer:
            writer.add_graph(model, (inputs,))
    except Exception as e:
        message = "Failed to dump model visual info, worker id: {}, epoch: {}, error: {}"
        logging.error(message.format(worker_id, epoch, str(e)))
Exemplo n.º 4
0
 def search(self):
     """Ask the hpo for a sample and turn it into a worker description.

     :return: ``None`` when the hpo proposes nothing; otherwise a dict with
         the keys ``worker_id``, ``encoded_desc`` and ``rung_id``.
     """
     proposal = self.hpo.propose()
     if proposal is None:
         return None
     # Work on a private copy of the proposed sample.
     proposal = copy.deepcopy(proposal)
     config_id = proposal.get('config_id')
     transformer = {
         'type': 'PBATransformer',
         'para_array': proposal.get('configs'),
         'all_para': proposal.get('all_configs'),
         'operation_names': self.operation_names,
     }
     hps = {'dataset.transforms': [transformer]}
     ckpt_dir = FileOps.join_path(self.local_base_path, 'worker', 'cache',
                                  str(config_id), 'checkpoint')
     FileOps.make_dir(ckpt_dir)
     if os.path.exists(ckpt_dir):
         hps['trainer.checkpoint_path'] = ckpt_dir
     if 'epoch' in proposal:
         hps['trainer.epochs'] = proposal.get('epoch')
     return {
         'worker_id': config_id,
         'encoded_desc': hps,
         'rung_id': proposal.get('rung_id'),
     }
Exemplo n.º 5
0
 def _save_best_model(self):
     """Store the best model in the form used by the active backend.

     * torch: the model state dict is saved to the trainer's weights file.
     * tf: the files of the latest checkpoint found in the worker path are
       copied into a ``model_<worker_id>`` folder, renamed to
       ``model_<worker_id><extension>``, together with the ``checkpoint`` file.
     * ms: files in the worker path named ``CKP*.ckpt`` are renamed to
       ``model_<worker_id>.ckpt``.
     """
     trainer = self.trainer
     if zeus.is_torch_backend():
         torch.save(trainer.model.state_dict(), trainer.weights_file)
         return
     if zeus.is_tf_backend():
         worker_path = trainer.get_local_worker_path()
         model_id = "model_{}".format(trainer.worker_id)
         weights_folder = FileOps.join_path(worker_path, model_id)
         FileOps.make_dir(weights_folder)
         latest = tf.train.latest_checkpoint(worker_path)
         for src in glob.glob("{}.*".format(latest)):
             # Keep only the last extension of the checkpoint file.
             suffix = os.path.splitext(src)[-1]
             target = FileOps.join_path(weights_folder, model_id + suffix)
             FileOps.copy_file(src, target)
         # The 'checkpoint' file of the worker path is copied alongside.
         FileOps.copy_file(FileOps.join_path(worker_path, 'checkpoint'),
                           weights_folder)
         return
     if zeus.is_ms_backend():
         worker_path = trainer.get_local_worker_path()
         save_path = os.path.join(
             worker_path, "model_{}.ckpt".format(trainer.worker_id))
         for entry in os.listdir(worker_path):
             if not (entry.startswith("CKP") and entry.endswith(".ckpt")):
                 continue
             self.weights_file = FileOps.join_path(worker_path, entry)
             os.rename(self.weights_file, save_path)
Exemplo n.º 6
0
    def save_results(self):
        """Append population and elitism of this iteration to the result files.

        ``arch_child.txt`` receives one row per individual of the population,
        ``arch.txt`` one row per elite; both are opened in append mode and get
        a header row with the iteration number first. The genes of the elites
        are stacked and stored in ``selected_arch.npy``. If a backup base path
        is configured, the local output folder is copied there afterwards.
        """
        out_dir = FileOps.join_path(self.local_output_path, General.step_name)
        FileOps.make_dir(out_dir)
        elite_log = FileOps.join_path(out_dir, 'arch.txt')
        population_log = FileOps.join_path(out_dir, 'arch_child.txt')
        genes_file = FileOps.join_path(out_dir, 'selected_arch.npy')
        elite_genes = []
        with open(elite_log, 'a') as elite_fp, open(population_log, 'a') as population_fp:
            elite_writer = csv.writer(elite_fp, lineterminator='\n')
            population_writer = csv.writer(population_fp, lineterminator='\n')

            # Population section.
            population_writer.writerow(
                ['Population Iteration: ' + str(self.evolution_count + 1)])
            for idx in range(self.individual_num):
                individual = self.pop[idx]
                row = self._log_data(net_info_type='active_only',
                                     pop=individual,
                                     value=individual.fitness)
                population_writer.writerow(row)

            # Elitism section.
            elite_writer.writerow(
                ['Population Iteration: ' + str(self.evolution_count + 1)])
            for idx in range(self.elitism_num):
                elite = self.elitism[idx]
                row = self._log_data(net_info_type='active_only',
                                     pop=elite,
                                     value=self.elit_fitness[idx])
                elite_writer.writerow(row)
                elite_genes.append(elite.gene)
        np.save(genes_file, np.stack(elite_genes))
        if self.backup_base_path is not None:
            FileOps.copy_folder(self.local_output_path, self.backup_base_path)
Exemplo n.º 7
0
    def before_train(self, logs=None):
        """Be called before the whole train process.

        Switches off ``call_metrics_on_train`` in the trainer config, caches
        the trainer's config, worker id and paths on this object, creates the
        ``result`` folder under the trainer's local base path and calls
        ``self.logger_patch()``.

        :param logs: not read by this method.
        """
        trainer = self.trainer
        trainer.config.call_metrics_on_train = False
        self.cfg = trainer.config
        self.worker_id = trainer.worker_id
        self.local_base_path = trainer.local_base_path
        self.local_output_path = trainer.local_output_path

        result_dir = FileOps.join_path(trainer.local_base_path, "result")
        self.result_path = result_dir
        FileOps.make_dir(result_dir)
        self.logger_patch()
Exemplo n.º 8
0
 def copy_pareto_output(self, step_name=None, worker_ids=None):
     """Copy files related to pareto from worker to output.

     For every worker id a folder named after it is created under the step's
     local output folder, and ``copy_search_file`` is called with the worker's
     folder as source and that new folder as destination.

     :param step_name: name of the step; nothing is done when it is empty or
         ``None``, or when the step has no folder in the local output path.
     :param worker_ids: iterable of worker ids; ``None`` means no workers.
     """
     # Check step_name before building any path: joining a path with None
     # raises TypeError, which made the default arguments unusable.
     if not step_name:
         return
     if worker_ids is None:
         worker_ids = []
     taskops = TaskOps()
     local_output_path = os.path.join(taskops.local_output_path, step_name)
     if not os.path.exists(local_output_path):
         return
     for worker_id in worker_ids:
         dest_dir = os.path.join(local_output_path, str(worker_id))
         FileOps.make_dir(dest_dir)
         local_worker_path = taskops.get_worker_subpath(
             step_name, str(worker_id))
         src_dir = FileOps.join_path(taskops.local_base_path,
                                     local_worker_path)
         copy_search_file(src_dir, dest_dir)
Exemplo n.º 9
0
def save_master_ip(ip_address, port, args):
    """Store the master ip and port in ``ip_address.txt`` in the temp folder.

    The file is overwritten and holds the ip on its first line and the port on
    its second line.

    :param str ip_address: The `ip_address` to write.
    :param str port: The `port` to write.
    :param argparse.ArgumentParser args: not read by this function.

    """
    temp_dir = TaskOps().temp_path
    FileOps.make_dir(temp_dir)
    target = os.path.join(temp_dir, 'ip_address.txt')
    logging.info("write ip, file path={}".format(target))
    with open(target, 'w') as handle:
        for value in (ip_address, port):
            handle.write(value + "\n")
Exemplo n.º 10
0
def load_master_ip():
    """Read the master ip and port from ``ip_address.txt`` in the temp folder.

    Only the local temp folder is consulted; nothing is downloaded from S3.

    :return: ``(ip, port)`` as stripped strings, or ``(None, None)`` when the
        file does not exist.
    """
    temp_dir = TaskOps().temp_path
    FileOps.make_dir(temp_dir)
    source = os.path.join(temp_dir, 'ip_address.txt')
    if not os.path.isfile(source):
        return None, None
    with open(source, 'r') as handle:
        # First line holds the ip, second line the port.
        ip = handle.readline().strip()
        port = handle.readline().strip()
    logging.info("get write ip, ip={}, port={}".format(ip, port))
    return ip, port
Exemplo n.º 11
0
    def _save_performance(self, results):
        """Pickle the performance results and copy the latest checkpoint to the output dir.

        The results go to ``performance/performance.pkl`` under the worker
        path. ``latest.pth`` from the worker path is copied into a folder
        named after the worker id under the output path; the copy is named
        after the second ``_``-separated part of ``results['arch']``.

        :param results: performance results, must contain the key ``arch``
        :type results: dict
        """
        logging.info("performance=%s", str(results))
        perf_dir = os.path.join(self.worker_path, 'performance')
        FileOps.make_dir(perf_dir)
        pickle_file = os.path.join(perf_dir, 'performance.pkl')
        FileOps.dump_pickle(results, pickle_file)
        logging.info("performance save to %s", perf_dir)

        # Copy the latest checkpoint into the worker's output folder.
        target_dir = os.path.join(self.output_path, str(self._worker_id))
        FileOps.make_dir(target_dir)
        latest = os.path.join(self.worker_path, 'latest.pth')
        arch_tag = results['arch'].split('_')[1]
        shutil.copy(latest, os.path.join(target_dir, arch_tag + '.pth'))
        logging.info("Latest checkpoint save to %s", target_dir)
Exemplo n.º 12
0
 def _save_best_model(self):
     """Store the best model in the form used by the active backend.

     With the torch backend the model state dict is saved to the trainer's
     weights file. With the tf backend the files of the latest checkpoint in
     the worker path are copied into a ``model_<worker_id>`` folder, renamed
     to ``model_<worker_id><extension>``, followed by the ``checkpoint`` file.
     """
     if zeus.is_torch_backend():
         weights = self.trainer.model.state_dict()
         torch.save(weights, self.trainer.weights_file)
         return
     if not zeus.is_tf_backend():
         return
     worker_path = self.trainer.get_local_worker_path()
     model_id = "model_{}".format(self.trainer.worker_id)
     weights_folder = FileOps.join_path(worker_path, model_id)
     FileOps.make_dir(weights_folder)
     latest_prefix = tf.train.latest_checkpoint(worker_path)
     for ckpt_part in glob.glob("{}.*".format(latest_prefix)):
         # Keep only the last extension of the checkpoint file.
         renamed = model_id + os.path.splitext(ckpt_part)[-1]
         FileOps.copy_file(ckpt_part,
                           FileOps.join_path(weights_folder, renamed))
     # The 'checkpoint' file of the worker path is copied alongside.
     FileOps.copy_file(FileOps.join_path(worker_path, 'checkpoint'),
                       weights_folder)
Exemplo n.º 13
0
    def update(self, record):
        """Update the score board and refresh the cached checkpoint of the worker.

        After the parent class has handled the record, the worker's local
        result folder is copied to ``worker/cache/<worker_id>/checkpoint``
        under the local base path, replacing any previous content.

        :param record: record providing ``worker_id`` and ``step_name`` via ``get``
        """
        super().update(record)
        config_id = str(record.get('worker_id'))
        source_dir = self.get_local_worker_path(record.get('step_name'), config_id)
        cache_dir = FileOps.join_path(self.local_base_path,
                                      'worker', 'cache',
                                      config_id, 'checkpoint')
        for folder in (source_dir, cache_dir):
            FileOps.make_dir(folder)
        # Remove the destination first so that copytree can create it itself.
        if os.path.exists(cache_dir):
            shutil.rmtree(cache_dir)
        shutil.copytree(source_dir, cache_dir)
Exemplo n.º 14
0
 def _output_records(self,
                     step_name,
                     records,
                     desc=True,
                     weights_file=False,
                     performance=False):
     """Dump records to ``output.csv`` and collect the selected worker files.

     The ``worker_id``, ``performance`` and ``desc`` fields of every record
     are written to ``output.csv`` in the step's folder under the local output
     path; a failure to write the csv is logged. Afterwards the files and
     folders matching the enabled patterns are copied from each worker's
     local path into the step folder.

     :param step_name: name of the step, used as folder name.
     :param records: iterable of objects providing ``serialize()``.
     :param desc: copy ``desc_*.json`` of each worker.
     :param weights_file: copy ``model_*`` of each worker.
     :param performance: copy ``performance_*.json`` of each worker.
     """
     columns = ["worker_id", "performance", "desc"]
     outputs = []
     for item in records:
         serialized = item.serialize()
         outputs.append(deepcopy({key: serialized[key] for key in columns}))
     data = pd.DataFrame(outputs)
     step_path = FileOps.join_path(TaskOps().local_output_path, step_name)
     FileOps.make_dir(step_path)
     csv_file = FileOps.join_path(step_path, "output.csv")
     try:
         data.to_csv(csv_file, index=False)
     except Exception:
         logging.error("Failed to save output file, file={}".format(csv_file))
     # Glob patterns to collect, in the order desc, weights, performance.
     wanted = [pattern for enabled, pattern in (
         (desc, "desc_*.json"),
         (weights_file, "model_*"),
         (performance, "performance_*.json"),
     ) if enabled]
     for row in outputs:
         worker_path = TaskOps().get_local_worker_path(step_name, row["worker_id"])
         matches = []
         for pattern in wanted:
             matches.extend(glob.glob(FileOps.join_path(worker_path, pattern)))
         for match in matches:
             if os.path.isfile(match):
                 FileOps.copy_file(match, step_path)
             elif os.path.isdir(match):
                 # Folders keep their own name inside the step folder.
                 FileOps.copy_folder(
                     match,
                     FileOps.join_path(step_path, os.path.basename(match)))