def _get_current_step_records(self):
    """Collect report records for the current pipeline step.

    Records come either from an explicitly configured ``models_folder``
    (or the previous step's output folder), or — for the very first step
    with no folder configured — from the single model described in the
    pipe-step config.  Records without a weights file are dropped.
    """
    current_step = General.step_name
    step_index = PipelineConfig.steps.index(current_step)
    folder = PipeStepConfig.pipe_step.get("models_folder")
    if step_index < 1 and not folder:
        # First step, no explicit folder: fall back to the configured model.
        records = self._load_single_model_records()
    else:
        if not folder:
            folder = FileOps.join_path(
                TaskOps().local_output_path,
                PipelineConfig.steps[step_index - 1])
        folder = folder.replace("{local_base_path}", TaskOps().local_base_path)
        records = ReportServer().load_records_from_model_folder(folder)
    valid_records = []
    for rec in records:
        if not rec.weights_file:
            logger.error("Model file is not existed, id={}".format(rec.worker_id))
            continue
        rec.step_name = General.step_name
        valid_records.append(rec)
    logging.debug("Records: {}".format(valid_records))
    return valid_records
def _simulate_tiny_pipeline(self, cfg_tiny):
    """Simulate tiny pipeline by using one sample one epoch.

    Runs each ``SearchPipeStep`` of the pipeline with a single trial and a
    single epoch to estimate the per-epoch runtime (first search step only)
    and record the search-space budget (``max_samples``) of every search step.

    :param cfg_tiny: a copy of the pipeline configuration, mutated for the
        dry run.
    """
    import shutil

    report = ReportServer()
    for i, step_name in enumerate(PipelineConfig.steps):
        step_cfg = cfg_tiny.get(step_name)
        if step_cfg.pipe_step.type != 'SearchPipeStep':
            continue
        # Shrink the step to a minimal run: no DDP, one epoch, one trial.
        step_cfg.trainer.distributed = False
        step_cfg.trainer.epochs = 1
        self.restrict_config.trials[step_name] = 1
        General.step_name = step_name
        PipeStepConfig.from_dict(step_cfg)
        pipestep = PipeStep()
        if i == 0:
            # Only the first search step is actually executed, to time one epoch.
            pipestep.do()
            record = report.get_step_records(step_name)[-1]
            self.epoch_time = record.runtime
            _worker_path = TaskOps().local_base_path
            # shutil.rmtree instead of shelling out to `rm -rf` (portable,
            # no shell injection surface).
            if os.path.exists(_worker_path):
                shutil.rmtree(_worker_path, ignore_errors=True)
        # Guaranteed a SearchPipeStep here (filtered above), so record the
        # search-algorithm sample budget unconditionally.
        self.params_dict[step_name]['max_samples'] = \
            pipestep.generator.search_alg.max_samples
        _file = os.path.join(TaskOps().step_path, ".generator")
        if os.path.exists(_file):
            os.remove(_file)
def query_task_info():
    """Get task message.

    :return: dict carrying a success marker, the task id, and the absolute
        base path of the running task.
    """
    from vega.common.task_ops import TaskOps

    task = TaskOps()
    return {
        "result": "success",
        "task_id": task.task_id,
        "base_path": os.path.abspath(task.task_cfg.local_base_path),
    }
def _get_abs_path(cls, _path):
    """Expand the ``{local_base_path}`` placeholder into an absolute path.

    Paths without the placeholder are returned untouched.
    """
    if "{local_base_path}" not in _path:
        return _path
    from vega.common.task_ops import TaskOps
    expanded = _path.replace("{local_base_path}", TaskOps().local_base_path)
    return os.path.abspath(expanded)
def _backup_config(args):
    """Copy the user's config file into the task output folder for the record."""
    from vega.common.task_ops import TaskOps
    from vega.common.file_ops import FileOps

    src_file = args.config_file
    dst_file = FileOps.join_path(
        TaskOps().local_output_path, os.path.basename(src_file))
    FileOps.make_base_dir(dst_file)
    FileOps.copy_file(src_file, dst_file)
def __init__(self):
    """Initialize the worker: load config, select backend, set up logging."""
    self._load_config()
    # Backend must be selected before any logging/report machinery starts.
    vega.set_backend(General.backend, General.device_category)
    log_file = f"{General.step_name}_worker_{self.worker_id}.log"
    init_log(level=General.logger.level,
             log_file=log_file,
             log_path=TaskOps().local_log_path)
    self.report_client = ReportClient()
def restore(cls):
    """Restore generator from file.

    :return: the unpickled generator, or ``None`` when no snapshot exists.
    """
    snapshot = os.path.join(TaskOps().step_path, ".generator")
    if not os.path.exists(snapshot):
        return None
    # NOTE(review): pickle is only safe because this snapshot is written by
    # the matching ``dump``; never point this at untrusted files.
    with open(snapshot, "rb") as fh:
        return pickle.load(fh)
def _calc_forward_latency_davinci(model, input, sess_config=None, num=10, evaluate_config=None):
    """Model forward latency calculation.

    :param model: network model
    :type model: torch or tf module
    :param input: input tensor
    :type input: Tensor of torch or tf
    :param sess_config: tf session config (unused, kept for interface parity)
    :param num: forward number
    :type num: int
    :param evaluate_config: some config for evaluate in davinci
    :type evaluate_config: dict
    :return: forward latency
    :rtype: float
    """
    from vega.evaluator.tools.evaluate_davinci_bolt import evaluate
    from vega.common.task_ops import TaskOps
    # backend = evaluate_config.get("backend")
    hardware = evaluate_config.get("hardware")
    remote_host = evaluate_config.get("remote_host")
    worker_path = TaskOps().local_base_path
    save_data_file = os.path.join(worker_path, "input.bin")

    latency = 0.
    now_time = datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')
    job_id = "pre_evaluate_" + now_time
    logging.info("The job id of evaluate service is {}.".format(job_id))

    # Serialize the sample input to a .bin file the evaluate service reads.
    backend = None
    input_shape = None
    if vega.is_torch_backend():
        import torch
        input_shape = input.shape
        if torch.is_tensor(input):
            input = input.cpu().numpy()
        input.tofile(save_data_file)
        backend = "pytorch"
    elif vega.is_tf_backend():
        input_shape = input.shape.as_list()
        test_data = np.random.random(input_shape).astype(np.float32)
        test_data.tofile(save_data_file)
        backend = "tensorflow"

    if backend is not None:
        for index in range(num):
            # The model only needs to be uploaded/compiled on the first call.
            reuse_model = index != 0
            results = evaluate(backend, hardware, remote_host, model, None,
                               save_data_file, input_shape, reuse_model, job_id)
            # np.float was removed in NumPy 1.24; builtin float is equivalent.
            latency += float(results.get("latency"))
    return latency / num
def _show_performance():
    """Log the performance summary stored in the step's output.csv."""
    output_file = FileOps.join_path(
        TaskOps().local_output_path, General.step_name, "output.csv")
    try:
        frame = pd.read_csv(output_file)
    except Exception:
        logging.info(" Result file output.csv is not existed or empty.")
        return
    if frame.shape[1] < 2 or frame.shape[0] == 0:
        logging.info(" Result file output.csv is empty.")
        return
    logging.info("-" * 48)
    payload = json.loads(frame.to_json())
    logging.info(" result: {}".format(payload["performance"]["0"]))
    logging.info("-" * 48)
def convert_to_coco_api(ds):
    """Convert to coco dataset.

    Builds an in-memory ``COCO`` object from a torchvision-style detection
    dataset (``ds[i]`` returns ``(img, targets)``) and also dumps the
    resulting annotation dict to ``instances.json`` in the task output path.

    :param ds: detection dataset; each target dict is assumed to contain
        ``image_id``, ``boxes`` (x1, y1, x2, y2), ``labels``, ``area`` and
        ``iscrowd`` tensors, optionally ``masks``/``keypoints`` — TODO confirm
        against the dataset implementation.
    :return: a COCO object with its index built.
    """
    coco_ds = COCO()
    # COCO annotation ids conventionally start at 1.
    ann_id = 1
    dataset = {'images': [], 'categories': [], 'annotations': []}
    categories = set()
    for img_idx in range(len(ds)):
        img, targets = ds[img_idx]
        image_id = targets["image_id"].item()
        img_dict = {}
        img_dict['id'] = image_id
        img_dict['height'] = img.shape[-2]
        img_dict['width'] = img.shape[-1]
        dataset['images'].append(img_dict)
        bboxes = targets["boxes"]
        # Convert (x1, y1, x2, y2) to COCO (x, y, w, h).
        # NOTE(review): this mutates the target tensor in place, so repeated
        # conversion of the same dataset would corrupt the boxes — verify.
        bboxes[:, 2:] -= bboxes[:, :2]
        bboxes = bboxes.tolist()
        labels = targets['labels'].tolist()
        areas = targets['area'].tolist()
        iscrowd = targets['iscrowd'].tolist()
        if 'masks' in targets:
            masks = targets['masks']
            # Permute-and-permute-back leaves the shape unchanged; the
            # ``.contiguous()`` in between forces a fresh memory layout.
            # NOTE(review): ``masks`` is prepared but never written into the
            # annotations below (no 'segmentation' field) — confirm intended.
            masks = masks.permute(0, 2, 1).contiguous().permute(0, 2, 1)
        if 'keypoints' in targets:
            keypoints = targets['keypoints']
            # Flatten per-object keypoints to a single [x, y, v, ...] list.
            keypoints = keypoints.reshape(keypoints.shape[0], -1).tolist()
        num_objs = len(bboxes)
        for i in range(num_objs):
            ann = {}
            ann['image_id'] = image_id
            ann['bbox'] = bboxes[i]
            ann['category_id'] = labels[i]
            categories.add(labels[i])
            ann['area'] = areas[i]
            ann['iscrowd'] = iscrowd[i]
            ann['id'] = ann_id
            if 'keypoints' in targets:
                ann['keypoints'] = keypoints[i]
                # A keypoint counts as labeled when its visibility flag
                # (every third value) is non-zero.
                ann['num_keypoints'] = sum(k != 0 for k in keypoints[i][2::3])
            dataset['annotations'].append(ann)
            ann_id += 1
    dataset['categories'] = [{'id': i} for i in sorted(categories)]
    coco_ds.dataset = dataset
    coco_ds.createIndex()
    # Persist the generated annotations next to the other task outputs.
    instances_val = os.path.join(TaskOps().local_output_path, 'instances.json')
    json.dump(coco_ds.dataset, open(instances_val, 'w'))
    logging.info("dump detection instances json file: {}".format(instances_val))
    return coco_ds
def load_master_ip():
    """Get the ip and port that write in a system path.

    here will not download anything from S3.

    :return: ``(ip, port)`` strings, or ``(None, None)`` when the file
        does not exist.
    """
    temp_folder = TaskOps().temp_path
    FileOps.make_dir(temp_folder)
    ip_file = os.path.join(temp_folder, 'ip_address.txt')
    if not os.path.isfile(ip_file):
        return None, None
    with open(ip_file, 'r') as fh:
        ip = fh.readline().strip()
        port = fh.readline().strip()
    logging.info("get write ip, ip={}, port={}".format(ip, port))
    return ip, port
def save_master_ip(ip_address, port, args):
    """Write the ip and port in a system path.

    :param str ip_address: The `ip_address` need to write.
    :param str port: The `port` need to write.
    :param argparse.ArgumentParser args: `args` is a argparse that should
        contain `init_method`, `rank` and `world_size`.
        NOTE(review): ``args`` is unused in the body; kept for interface
        compatibility with callers.
    """
    temp_folder = TaskOps().temp_path
    FileOps.make_dir(temp_folder)
    ip_file = os.path.join(temp_folder, 'ip_address.txt')
    logging.info("write ip, file path={}".format(ip_file))
    with open(ip_file, 'w') as fh:
        fh.write(ip_address + "\n")
        fh.write(port + "\n")
def _init_env(cfg_path):
    """Init config and evn parameters.

    :param cfg_path: config file path
    """
    logging.getLogger().setLevel(logging.DEBUG)
    user_cfg = UserConfig()
    user_cfg.load(cfg_path)
    # load general
    General.from_dict(user_cfg.data.get("general"), skip_check=False)
    init_log(level=General.logger.level,
             log_file="pipeline.log",
             log_path=TaskOps().local_log_path)
    General.env = env_args()
    if not General.env:
        General.env = init_cluster_args()
    setattr(PipelineConfig, "steps", user_cfg.data.pipeline)
    set_backend(General.backend, General.device_category)
def summary(self):
    """Summary all record from result cache, and get performance.

    :return: dict of AP metrics scaled to percent; all sentinel ``-1``
        values when no results have been recorded.
    """
    if not self.result_record:
        return {"mAP": -1, "AP_small": -1, "AP_medium": -1, "AP_large": -1}
    det_json_file = os.path.join(TaskOps().local_output_path, 'det_json_file.json')
    with open(det_json_file, 'w') as fh:
        json.dump(self.result_record, fh)
    eval_result = self.print_scores(det_json_file, self.anno_path)
    bbox_ap = list(eval_result.pop('AP(bbox)'))
    # Indices follow the pycocotools stats order:
    # 0=mAP, 1=AP50, 3/4/5 = AP small/medium/large.
    ap_result = {
        "mAP": bbox_ap[0] * 100,
        "AP50": bbox_ap[1] * 100,
        "AP_small": bbox_ap[3] * 100,
        "AP_medium": bbox_ap[4] * 100,
        "AP_large": bbox_ap[5] * 100,
    }
    if eval_result:
        ap_result.update(eval_result)
    return ap_result
def _save_worker_record(self, record):
    """Persist a worker's desc/hps/performance entries as JSON files.

    For each of ``desc``/``hps``/``performance`` present in ``record``,
    writes ``<name>_<worker_id>.json`` under the worker's local path.
    A list-valued ``desc`` (cars/darts multi-desc) is fanned out to
    ``desc_<idx>.json`` files instead.  Failures are logged, never raised.

    :param record: report record dict; must contain ``step_name`` and
        ``worker_id``, optionally ``multi_task`` and the payload keys.
    """
    step_name = record.get('step_name')
    worker_id = record.get('worker_id')
    _path = TaskOps().get_local_worker_path(step_name, worker_id)
    for record_name in ["desc", "hps", "performance"]:
        record_value = remove_np_value(record.get(record_name))
        if record_value is None:
            if record_name == "desc":
                # A desc file is always written, even when empty.
                record_value = {}
            else:
                continue
        _file = None
        try:
            if isinstance(record_value, list) and record_name == "desc":
                # for cars/darts save multi-desc
                for idx, value in enumerate(record_value):
                    _file = FileOps.join_path(_path, "desc_{}.json".format(idx))
                    with open(_file, "w") as f:
                        json.dump(value, f)
            else:
                # Multi-task records are keyed by the task id instead of
                # the worker id.
                if 'multi_task' in record and record.get('multi_task') is not None:
                    worker_id = record.get('multi_task')
                # record_name is always one of desc/hps/performance, so a
                # single format string replaces the three per-name branches.
                _file = FileOps.join_path(
                    _path, "{}_{}.json".format(record_name, worker_id))
                with open(_file, "w") as f:
                    json.dump(record_value, f)
        except Exception as ex:
            logger.error(
                "Failed to save {}, file={}, desc={}, msg={}".format(
                    record_name, _file, record_value, str(ex)))
def _load_single_model_records(self):
    """Build a single report record from the configured model.

    Returns an empty list (after logging the reason) when the model
    description or the pretrained weights file is missing.
    """
    desc = PipeStepConfig.model.model_desc
    desc_file = PipeStepConfig.model.model_desc_file
    if desc_file:
        desc_file = desc_file.replace(
            "{local_base_path}", TaskOps().local_base_path)
        desc = Config(desc_file)
    if not desc:
        logger.error("Model desc or Model desc file is None.")
        return []
    weights = PipeStepConfig.model.pretrained_model_file
    if not weights:
        logger.error("Model file is None.")
        return []
    if not os.path.exists(weights):
        logger.error("Model file is not existed.")
        return []
    record = ReportRecord().load_dict(
        dict(worker_id="1", desc=desc, weights_file=weights))
    return [record]
def __init__(self, anno_path=None, category=None):
    """Initialize the metric with an annotation file and category filter.

    :param anno_path: path to a COCO instances json; defaults to the
        task-generated ``instances.json``.
    :param category: category ids to evaluate; defaults to an empty list.
    """
    if anno_path:
        self.anno_path = anno_path
    else:
        self.anno_path = os.path.join(
            TaskOps().local_output_path, 'instances.json')
    self.category = category if category else []
    # Accumulates per-image detection results until ``summary`` is called.
    self.result_record = []
def dump(self):
    """Dump generator to file.

    Pickles the whole generator into ``.generator`` under the step path,
    so that ``restore`` can resume it later.
    """
    snapshot = os.path.join(TaskOps().step_path, ".generator")
    with open(snapshot, "wb") as fh:
        pickle.dump(self, fh, protocol=pickle.HIGHEST_PROTOCOL)
def _init_env():
    """Check the interpreter version, then set up logging and cluster args."""
    if sys.version_info < (3, 6):
        sys.exit('Sorry, Python < 3.6 is not supported.')
    init_log(level=General.logger.level, log_path=TaskOps().local_log_path)
    General.env = init_cluster_args()
    _print_task_id()