def main():
    """CLI entry for training: build config, prepare the timestamped work
    dir and logger, pick devices, then run `worker` directly or spawn one
    process per GPU."""
    run_stamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    args = parse_args()
    cfg = Config.fromfile(args.config)
    cfg.gpus = args.gpus
    cfg.dynamic = args.dynamic

    # CLI-provided work_dir wins; otherwise the config file must supply one.
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    else:
        assert cfg.get(
            'work_dir', None
        ) is not None, 'if do not set work_dir in args, please set in config file'
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from

    # Each run writes into its own timestamped sub-directory.
    cfg.work_dir = os.path.join(cfg.work_dir, run_stamp)
    mkdir_or_exist(os.path.abspath(cfg.work_dir))

    # init the logger and record basic run info
    logger = get_root_logger(
        log_file=os.path.join(cfg.work_dir, 'root.log'),
        log_level=cfg.log_level)
    logger.info('training gpus num: {}'.format(args.gpus))
    logger.info('Config:\n{}'.format(cfg.text))

    world_size = args.gpus
    assert world_size <= mge.get_device_count("gpu")

    if world_size == 0:
        # world_size == 0 selects CPU execution
        mge.set_default_device(device='cpux')
    else:
        mge.set_default_device(device='gpu' + args.gpuid)

    if world_size <= 1:
        worker(0, 1, cfg)
        return

    # Multi-GPU: linearly scale every optimizer's learning rate.
    if not all(isinstance(opt_cfg, dict) for opt_cfg in cfg.optimizers.values()):
        raise RuntimeError(
            "please use 'dict of dict' style for optimizers config")
    for opt_cfg in cfg.optimizers.values():
        opt_cfg['lr'] = opt_cfg['lr'] * world_size

    # start distributed training, dispatch sub-processes
    mp.set_start_method("spawn")
    procs = []
    for rank in range(world_size):
        proc = mp.Process(target=worker, args=(rank, world_size, cfg))
        proc.start()
        procs.append(proc)
    for proc in procs:
        proc.join()
def inference(model_file, device, records, result_queue):
    """Run detection inference for `records` on one device and push one
    result dict per image (dt/gt boxes in eval format) into `result_queue`.
    """
    def val_func():
        # forward pass reading the values previously set on net.inputs
        pred_boxes = net(net.inputs)
        return pred_boxes
    mge.set_default_device('xpu{}'.format(device))
    net = network.Network()
    net = load_model(net, model_file)
    net.eval()
    for record in records:
        np.set_printoptions(precision=2, suppress=True)
        image, gt_boxes, im_info, ID = get_data(record, device)
        net.inputs["image"].set_value(image.astype(np.float32))
        net.inputs["im_info"].set_value(im_info)
        # drop references early to keep the worker's memory footprint small
        del record, image
        pred_boxes = val_func().numpy()
        # NOTE(review): `[:, 1]` presumably selects one slice of a
        # multi-axis prediction layout -- confirm against network.Network's
        # output shape; pred_bbox must still be 2-D below.
        pred_bbox = pred_boxes[:, 1]
        # undo the resize applied during preprocessing (im_info[0, 2] is the
        # scale factor written by get_data)
        scale = im_info[0, 2]
        cls_dets = pred_bbox[:, :4] / scale
        if config.test_nms_version == 'set_nms':
            # set-NMS: boxes come in pairs; each pair shares an identity id
            n = cls_dets.shape[0] // 2
            idents = np.tile(np.linspace(0, n-1, n).reshape(-1, 1), (1, 2)).reshape(-1, 1)
            pred_boxes = np.hstack([cls_dets, pred_bbox[:, 4:5], idents])
            # keep detections above the classification score threshold
            flag = pred_boxes[:, 4] >= config.test_cls_threshold
            cls_dets = pred_boxes[flag]
            keep = emd_cpu_nms(cls_dets, config.test_nms)
            cls_dets = cls_dets[keep, :5].astype(np.float64)
        elif config.test_nms_version == 'normal_nms':
            # plain NMS over (x1, y1, x2, y2, score) rows
            pred_boxes = np.hstack([cls_dets, pred_bbox[:, 4:5]])
            flag = pred_boxes[:, 4] >= config.test_cls_threshold
            cls_dets = pred_boxes[flag]
            keep = nms(cls_dets.astype(np.float32), config.test_nms)
            cls_dets = cls_dets[keep, :5].astype(np.float64)
        else:
            raise NotImplementedError('the results should be post processed.')
        # every surviving detection gets tag 1 (single foreground class)
        pred_tags = np.ones([cls_dets.shape[0],]).astype(np.float64)
        gt_boxes = gt_boxes.astype(np.float64)
        dtboxes = boxes_dump(cls_dets[:, :5], pred_tags, False)
        gtboxes = boxes_dump(gt_boxes, None, True)
        # im_info = im_info.astype(np.int32)
        # height, width = im_info[0, 3], im_info[0, 4]
        height, width = int(im_info[0, 3]), int(im_info[0, 4])
        result_dict = dict(ID=ID, height=height, width=width,
                           dtboxes = dtboxes, gtboxes = gtboxes)
        result_queue.put_nowait(result_dict)
def main():
    """CLI entry for (optionally distributed) testing: set up the work dir,
    logger and device list, then run `worker` in-process or one subprocess
    per GPU coordinated by a distributed server."""
    stamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    args = parse_args()
    cfg = Config.fromfile(args.config)
    cfg.dynamic = args.dynamic
    cfg.ensemble = args.ensemble

    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    else:
        assert cfg.get(
            'work_dir', None
        ) is not None, 'if do not set work_dir in args, please set in config file'

    cfg.work_dir = os.path.join(cfg.work_dir, stamp)
    mkdir_or_exist(os.path.abspath(cfg.work_dir))

    logger = get_root_logger(
        log_file=os.path.join(cfg.work_dir, 'root.log'),
        log_level=cfg.log_level)
    logger.info('Config:\n{}'.format(cfg.text))

    # "-1" selects CPU; anything else is a comma-separated GPU id list.
    gpu_list = [item.strip() for item in args.gpuids.split(",")]
    if gpu_list[0] == "-1":
        world_size = 0  # use cpu
        logger.info('test use only cpu')
    else:
        world_size = len(gpu_list)
        logger.info('test gpus num: {}'.format(world_size))
    # assert world_size <= mge.get_device_count("gpu")

    if world_size == 0:  # use cpu
        mge.set_default_device(device='cpux')
    elif world_size == 1:
        mge.set_default_device(device='gpu' + gpu_list[0])
    # world_size > 1: each worker picks its own device from gpu_list

    if world_size <= 1:
        worker(0, 1, cfg)
        return

    # Multi-GPU: one subprocess per rank, rendezvousing through the server.
    port = dist.util.get_free_ports(1)[0]
    server = dist.Server(port)
    procs = []
    for rank in range(world_size):
        logger.info("init distributed process group {} / {}".format(
            rank, world_size))
        proc = mp.Process(
            target=worker,
            args=(rank, world_size, cfg, gpu_list[rank], port))
        proc.start()
        procs.append(proc)
    for rank, proc in enumerate(procs):
        proc.join()
        assert proc.exitcode == 0, "subprocess {} exit with code {}".format(
            rank, proc.exitcode)
def main():
    """Parse quantization-training CLI options and dispatch `worker`
    in-process or across one subprocess per GPU."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-a", "--arch", default="resnet18", type=str)
    parser.add_argument("-d", "--data", default=None, type=str)
    parser.add_argument("-s", "--save", default="/data/models", type=str)
    parser.add_argument("-c", "--checkpoint", default=None, type=str,
                        help="pretrained model to finetune")
    parser.add_argument(
        "-m", "--mode", default="qat", type=str,
        choices=["normal", "qat", "quantized"],
        help="Quantization Mode\n"
        "normal: no quantization, using float32\n"
        "qat: quantization aware training, simulate int8\n"
        "quantized: convert mode to int8 quantized, inference only")
    parser.add_argument("-n", "--ngpus", default=None, type=int)
    parser.add_argument("-w", "--workers", default=4, type=int)
    parser.add_argument("--report-freq", default=50, type=int)
    args = parser.parse_args()

    # Default to every visible GPU unless -n overrides it.
    if args.ngpus is None:
        world_size = mge.get_device_count("gpu")
    else:
        world_size = args.ngpus

    if args.mode == "quantized":
        # Quantized inference runs single-process on CPU only.
        world_size = 1
        args.report_freq = 1  # test is slow on cpu
        mge.set_default_device("cpux")
        logger.warning("quantized mode use cpu only")

    if world_size <= 1:
        worker(0, 1, args)
        return

    # start distributed training, dispatch sub-processes
    mp.set_start_method("spawn")
    procs = []
    for rank in range(world_size):
        proc = mp.Process(target=worker, args=(rank, world_size, args))
        proc.start()
        procs.append(proc)
    for proc in procs:
        proc.join()
def main():
    """CLI entry for testing: build config, prepare the timestamped work
    dir and logger, pick devices, then run `worker` directly or spawn one
    process per GPU."""
    stamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    args = parse_args()
    cfg = Config.fromfile(args.config)
    cfg.gpus = args.gpus
    cfg.dynamic = args.dynamic
    cfg.ensemble = args.ensemble

    # CLI-provided work_dir wins; otherwise the config file must supply one.
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    else:
        assert cfg.get(
            'work_dir', None
        ) is not None, 'if do not set work_dir in args, please set in config file'

    cfg.work_dir = os.path.join(cfg.work_dir, stamp)
    mkdir_or_exist(os.path.abspath(cfg.work_dir))

    # init the logger
    logger = get_root_logger(
        log_file=os.path.join(cfg.work_dir, 'root.log'),
        log_level=cfg.log_level)
    # log some basic info
    logger.info('test gpus num: {}'.format(args.gpus))
    logger.info('Config:\n{}'.format(cfg.text))

    world_size = args.gpus
    assert world_size <= mge.get_device_count("gpu")

    if world_size == 0:  # use cpu
        mge.set_default_device(device='cpux')
    else:
        mge.set_default_device(device='gpu' + args.gpuid)

    if world_size <= 1:
        worker(0, 1, cfg)
        return

    # start distributed test, dispatch sub-processes
    mp.set_start_method("spawn")
    procs = []
    for rank in range(world_size):
        proc = mp.Process(target=worker, args=(rank, world_size, cfg))
        proc.start()
        procs.append(proc)
    for proc in procs:
        proc.join()
def test_save_load():
    """Round-trip a model + optimizer through mge.save / mge.load and make
    sure the restored copies can still run a training step."""
    net = Simple()
    optim = optimizer.SGD(net.parameters(), lr=1.0, momentum=0.9)
    optim.clear_grad()
    gm = ad.GradManager().attach(net.parameters())

    data = tensor([2.34])
    with gm:
        loss = net(data)
        gm.backward(loss)
    optim.step()

    model_name = "simple.pkl"
    print("save to {}".format(model_name))
    mge.save(
        {
            "name": "simple",
            "state_dict": net.state_dict(),
            "opt_state": optim.state_dict(),
        },
        model_name,
    )

    # Load param to cpu
    checkpoint = mge.load(model_name, map_location="cpu0")
    device_save = mge.get_default_device()
    mge.set_default_device("cpu0")
    net = Simple()
    net.load_state_dict(checkpoint["state_dict"])
    optim = optimizer.SGD(net.parameters(), lr=1.0, momentum=0.9)
    optim.load_state_dict(checkpoint["opt_state"])
    print("load done")
    os.remove("simple.pkl")

    # BUGFIX: the old GradManager was still attached to the *discarded*
    # model's parameters, so backward() recorded no grads for the reloaded
    # net and optim.step() had nothing to apply. Re-attach to the new params.
    gm = ad.GradManager().attach(net.parameters())
    with gm:
        loss = net([1.23])
        gm.backward(loss)
    optim.step()

    # Restore device
    mge.set_default_device(device_save)
def inference(model_file, device, records, result_queue):
    """Run detection inference for `records` on one device, post-process
    with set-NMS, and push one result dict per image (dt/gt boxes in eval
    format) into `result_queue`."""
    def val_func():
        # forward pass reading the values previously set on net.inputs
        pred_boxes = net(net.inputs)
        return pred_boxes
    mge.set_default_device('xpu{}'.format(device))
    net = network.Network()
    net = load_model(net, model_file)
    net.eval()
    for record in records:
        np.set_printoptions(precision=2, suppress=True)
        image, gt_boxes, im_info, ID = get_data(record, device)
        net.inputs["image"].set_value(image.astype(np.float32))
        net.inputs["im_info"].set_value(im_info)
        # drop references early to keep the worker's memory footprint small
        del record, image
        pred_boxes = val_func().numpy()
        # undo the resize applied during preprocessing (im_info[0, 2] is the
        # scale factor written by get_data)
        scale = im_info[0, 2]
        pred_bbox = pred_boxes[:, :4] / scale
        # NOTE(review): score = product of columns 4 and 5 -- presumably two
        # branch confidences multiplied together; confirm against the
        # network head's output layout.
        scores = np.prod(pred_boxes[:,4:6], axis=1)
        # boxes come in pairs; each pair shares an identity id for set-NMS
        n = scores.shape[0] // 2
        idents = np.tile(np.linspace(0, n-1, n).reshape(-1, 1),(1, 2)).reshape(-1, 1)
        pred_boxes = np.hstack([pred_bbox, scores.reshape(-1, 1), idents])
        # hard-coded score threshold 0.05 and NMS IoU threshold 0.5
        flag = pred_boxes[:, 4] >= 0.05
        cls_dets = pred_boxes[flag]
        keep = emd_cpu_nms(cls_dets, 0.5)
        cls_dets = cls_dets[keep].astype(np.float64)
        # every surviving detection gets tag 1 (single foreground class)
        pred_tags = np.ones([cls_dets.shape[0],]).astype(np.float64)
        gt_boxes = gt_boxes.astype(np.float64)
        dtboxes = boxes_dump(cls_dets[:, :5], pred_tags, False)
        gtboxes = boxes_dump(gt_boxes, None, True)
        height, width = int(im_info[0, 3]), int(im_info[0, 4])
        result_dict = dict(ID=ID, height=height, width=width,
                           dtboxes = dtboxes, gtboxes = gtboxes)
        result_queue.put_nowait(result_dict)
def test_tensor_serialization():
    """mge.save / mge.load round-trips for Tensor and Parameter, covering
    device mapping and quantization params."""
    with TemporaryFile() as f:
        data = np.random.randint(low=0, high=7, size=[233])
        a = Tensor(data, device="cpu0", dtype=np.int32)
        mge.save(a, f)
        f.seek(0)
        b = mge.load(f)
        # BUGFIX: verify the *loaded* tensor's values. The old assert
        # compared `a` against `data`, which is trivially true and never
        # checked what came back from mge.load.
        np.testing.assert_equal(b.numpy(), data)
        assert b.device.logical_name == "cpu0:0"
        assert b.dtype == np.int32

    with TemporaryFile() as f:
        # Parameter subclass must survive the round-trip
        a = Parameter(np.random.random(size=(233, 2)).astype(np.float32))
        mge.save(a, f)
        f.seek(0)
        b = mge.load(f)
        assert isinstance(b, Parameter)
        np.testing.assert_equal(a.numpy(), b.numpy())

    with TemporaryFile() as f:
        # plain Tensor must come back as exactly Tensor (not a subclass)
        a = Tensor(np.random.random(size=(2, 233)).astype(np.float32))
        mge.save(a, f)
        f.seek(0)
        b = mge.load(f)
        assert type(b) is Tensor
        np.testing.assert_equal(a.numpy(), b.numpy())

    with TemporaryFile() as f:
        # map_location as a string remaps the device on load
        a = Tensor(np.random.random(size=(2, 233)).astype(np.float32))
        mge.save(a, f)
        f.seek(0)
        b = mge.load(f, map_location="cpux")
        assert type(b) is Tensor
        assert "cpu" in str(b.device)
        np.testing.assert_equal(a.numpy(), b.numpy())

    with TemporaryFile() as f:
        # map_location as a dict remaps gpu0 -> cpu0 (GPU builds only)
        if mge.is_cuda_available():
            device_org = mge.get_default_device()
            mge.set_default_device("gpu0")
            a = Tensor(np.random.random(size=(2, 233)).astype(np.float32))
            mge.save(a, f)
            f.seek(0)
            mge.set_default_device("cpux")
            b = mge.load(f, map_location={"gpu0": "cpu0"})
            assert type(b) is Tensor
            assert "cpu0" in str(b.device)
            np.testing.assert_equal(a.numpy(), b.numpy())
            mge.set_default_device(device_org)

    with TemporaryFile() as f:
        # quantization params (qparams.scale) must survive the round-trip
        a = Tensor(0)
        a.qparams.scale = Tensor(1.0)
        mge.save(a, f)
        f.seek(0)
        b = mge.load(f)
        assert isinstance(b.qparams.scale, Tensor)
        np.testing.assert_equal(b.qparams.scale.numpy(), 1.0)
def test_tensor_serialization():
    """pickle and mge.save / mge.load round-trips for Tensor and Parameter.

    CLEANUP: removed the `tensor_eq` inner helper that was defined but
    never called anywhere in this test.
    """
    with TemporaryFile() as f:
        data = np.random.randint(low=0, high=7, size=[233])
        a = Tensor(data, device="xpux", dtype=np.int32)
        pickle.dump(a, f)
        f.seek(0)
        b = pickle.load(f)
        np.testing.assert_equal(a.numpy(), b.numpy())

    with TemporaryFile() as f:
        # Parameter subclass must survive the round-trip
        a = Parameter(np.random.random(size=(233, 2)).astype(np.float32))
        pickle.dump(a, f)
        f.seek(0)
        b = pickle.load(f)
        assert isinstance(b, Parameter)
        np.testing.assert_equal(a.numpy(), b.numpy())

    with TemporaryFile() as f:
        # plain Tensor must come back as exactly Tensor (not a subclass)
        a = Tensor(np.random.random(size=(2, 233)).astype(np.float32))
        pickle.dump(a, f)
        f.seek(0)
        b = pickle.load(f)
        assert type(b) is Tensor
        np.testing.assert_equal(a.numpy(), b.numpy())

    with TemporaryFile() as f:
        # map_location as a string remaps the device on load
        a = Tensor(np.random.random(size=(2, 233)).astype(np.float32))
        mge.save(a, f)
        f.seek(0)
        b = mge.load(f, map_location="cpux")
        assert type(b) is Tensor
        assert "cpu" in str(b.device)
        np.testing.assert_equal(a.numpy(), b.numpy())

    with TemporaryFile() as f:
        # map_location as a dict remaps gpu0 -> cpu0 (GPU builds only)
        if mge.is_cuda_available():
            device_org = mge.get_default_device()
            mge.set_default_device("gpu0")
            a = Tensor(np.random.random(size=(2, 233)).astype(np.float32))
            mge.save(a, f)
            f.seek(0)
            mge.set_default_device("cpux")
            b = mge.load(f, map_location={"gpu0": "cpu0"})
            assert type(b) is Tensor
            assert "cpu0" in str(b.device)
            np.testing.assert_equal(a.numpy(), b.numpy())
            mge.set_default_device(device_org)
def main():
    """Train a small XOR classifier with GradManager, report train/val
    loss, run a fixed test batch through a traced predictor, and dump the
    traced graph (with and without an embedded testcase)."""
    if not mge.is_cuda_available():
        mge.set_default_device("cpux")

    net = XORNet()
    gm = ad.GradManager().attach(net.parameters())
    opt = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
    batch_size = 64
    train_dataset = minibatch_generator(batch_size)
    val_dataset = minibatch_generator(batch_size)

    def train_fun(data, label):
        # one optimization step: clear grads, forward, backward, update
        opt.clear_grad()
        with gm:
            pred = net(data)
            loss = F.loss.cross_entropy(pred, label)
            gm.backward(loss)
        opt.step()
        return pred, loss

    def val_fun(data, label):
        # forward-only evaluation
        pred = net(data)
        loss = F.loss.cross_entropy(pred, label)
        return pred, loss

    @trace(symbolic=True, capture_as_const=True)
    def pred_fun(data):
        # traced inference graph with weights captured as constants
        pred = net(data)
        pred_normalized = F.softmax(pred)
        return pred_normalized

    data = np.random.random((batch_size, 2)).astype(np.float32)
    label = np.zeros((batch_size,)).astype(np.int32)
    train_loss = []
    val_loss = []
    for step, minibatch in enumerate(train_dataset):
        if step > 1000:
            break
        data = mge.tensor(minibatch["data"])
        label = mge.tensor(minibatch["label"])
        net.train()
        _, loss = train_fun(data, label)
        train_loss.append((step, loss.numpy()))
        if step % 50 == 0:
            minibatch = next(val_dataset)
            # BUGFIX: evaluate on the freshly drawn validation minibatch;
            # the old code fetched it but then re-used the training batch.
            data = mge.tensor(minibatch["data"])
            label = mge.tensor(minibatch["label"])
            net.eval()
            _, loss = val_fun(data, label)
            loss = loss.numpy()
            val_loss.append((step, loss))
            print("Step: {} loss={}".format(step, loss))
        # BUGFIX: removed the stray `opt.step()` here -- train_fun() already
        # applies the update, so stepping again re-applied the same
        # gradients (plus momentum) a second time per iteration.

    test_data = np.array(
        [
            (0.5, 0.5),
            (0.3, 0.7),
            (0.1, 0.9),
            (-0.5, -0.5),
            (-0.3, -0.7),
            (-0.9, -0.1),
            (0.5, -0.5),
            (0.3, -0.7),
            (0.9, -0.1),
            (-0.5, 0.5),
            (-0.3, 0.7),
            (-0.1, 0.9),
        ]
    )

    # tracing only accepts tensor as input
    data = mge.tensor(test_data, dtype=np.float32)
    net.eval()
    out = pred_fun(data)
    pred_output = out.numpy()
    pred_label = np.argmax(pred_output, 1)

    print("Test data")
    print(test_data)
    with np.printoptions(precision=4, suppress=True):
        print("Predicated probability:")
        print(pred_output)
    print("Predicated label")
    print(pred_label)

    model_name = "xornet_deploy.mge"
    print("Dump model as {}".format(model_name))
    pred_fun.dump(model_name, arg_names=["data"])

    model_with_testcase_name = "xornet_with_testcase.mge"
    print("Dump model with testcase as {}".format(model_with_testcase_name))
    pred_fun.dump(model_with_testcase_name, arg_names=["data"],
                  input_data=["#rand(0.1, 0.8, 4, 2)"])
def main():
    """Train a small XOR classifier (legacy static-graph API), report
    train/val loss, classify a fixed test batch, and dump the traced
    predictor when tracing is enabled."""
    if not mge.is_cuda_available():
        mge.set_default_device("cpux")

    net = XORNet()
    opt = optim.SGD(net.parameters(requires_grad=True), lr=0.01, momentum=0.9)
    batch_size = 64
    train_dataset = minibatch_generator(batch_size)
    val_dataset = minibatch_generator(batch_size)

    # reusable input tensors, refilled each iteration via set_value
    data = mge.tensor()
    label = mge.tensor(np.zeros((batch_size, )), dtype=np.int32)
    train_loss = []
    val_loss = []
    for step, minibatch in enumerate(train_dataset):
        if step > 1000:
            break
        data.set_value(minibatch["data"])
        label.set_value(minibatch["label"])
        opt.zero_grad()
        _, loss = train_fun(data, label, net=net, opt=opt)
        train_loss.append((step, loss.numpy()))
        if step % 50 == 0:
            minibatch = next(val_dataset)
            # BUGFIX: feed the validation minibatch to val_fun; the old code
            # drew it but then evaluated on the current training batch.
            data.set_value(minibatch["data"])
            label.set_value(minibatch["label"])
            _, loss = val_fun(data, label, net=net)
            loss = loss.numpy()[0]
            val_loss.append((step, loss))
            print("Step: {} loss={}".format(step, loss))
        # apply the gradients accumulated by train_fun (legacy API steps
        # outside the train function)
        opt.step()

    test_data = np.array([
        (0.5, 0.5),
        (0.3, 0.7),
        (0.1, 0.9),
        (-0.5, -0.5),
        (-0.3, -0.7),
        (-0.9, -0.1),
        (0.5, -0.5),
        (0.3, -0.7),
        (0.9, -0.1),
        (-0.5, 0.5),
        (-0.3, 0.7),
        (-0.1, 0.9),
    ])

    data.set_value(test_data)
    out = pred_fun(data, net=net)
    pred_output = out.numpy()
    pred_label = np.argmax(pred_output, 1)

    print("Test data")
    print(test_data)
    with np.printoptions(precision=4, suppress=True):
        print("Predicated probability:")
        print(pred_output)
    print("Predicated label")
    print(pred_label)

    model_name = "xornet_deploy.mge"
    if pred_fun.enabled:
        print("Dump model as {}".format(model_name))
        pred_fun.dump(model_name, arg_names=["data"])
    else:
        print("pred_fun must be run with trace enabled in order to dump model")
def main():
    """Classify one image with an (optionally quantized) model and print
    the top-5 ImageNet predictions; optionally dump the traced graph."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-a", "--arch", default="resnet18", type=str)
    parser.add_argument("-c", "--checkpoint", default=None, type=str)
    parser.add_argument("-i", "--image", default=None, type=str)
    parser.add_argument(
        "-m", "--mode", default="quantized", type=str,
        choices=["normal", "qat", "quantized"],
        help="Quantization Mode\n"
        "normal: no quantization, using float32\n"
        "qat: quantization aware training, simulate int8\n"
        "quantized: convert mode to int8 quantized, inference only")
    parser.add_argument("--dump", action="store_true", help="Dump quantized model")
    args = parser.parse_args()

    # int8 inference runs on CPU only
    if args.mode == "quantized":
        mge.set_default_device("cpux")

    model = models.__dict__[args.arch]()
    if args.mode != "normal":
        # both qat and quantized paths need fake-quant modules installed
        Q.quantize_qat(model, Q.ema_fakequant_qconfig)

    if args.checkpoint:
        logger.info("Load pretrained weights from %s", args.checkpoint)
        ckpt = mge.load(args.checkpoint)
        ckpt = ckpt["state_dict"] if "state_dict" in ckpt else ckpt
        model.load_state_dict(ckpt, strict=False)

    if args.mode == "quantized":
        # convert fake-quant modules into true int8 ones
        Q.quantize(model)

    path = "../assets/cat.jpg" if args.image is None else args.image
    image = cv2.imread(path, cv2.IMREAD_COLOR)
    transform = T.Compose([
        T.Resize(256),
        T.CenterCrop(224),
        T.Normalize(mean=128),
        T.ToMode("CHW"),
    ])

    @jit.trace(symbolic=True)
    def infer_func(processed_img):
        model.eval()
        logits = model(processed_img)
        probs = F.softmax(logits)
        return probs

    # add a leading batch axis, then cast to match the execution mode
    batch = transform.apply(image)[np.newaxis, :]
    if args.mode == "normal":
        batch = batch.astype("float32")
    elif args.mode == "quantized":
        batch = batch.astype("int8")
    probs = infer_func(batch)
    top_probs, classes = F.top_k(probs, k=5, descending=True)

    if args.dump:
        output_file = ".".join([args.arch, args.mode, "megengine"])
        logger.info("Dump to {}".format(output_file))
        infer_func.dump(output_file, arg_names=["data"])
        mge.save(model.state_dict(), output_file.replace("megengine", "pkl"))

    with open("../assets/imagenet_class_info.json") as fp:
        imagenet_class_index = json.load(fp)
    for rank, (prob, classid) in enumerate(
            zip(top_probs.numpy().reshape(-1), classes.numpy().reshape(-1))):
        print("{}: class = {:20s} with probability = {:4.1f} %".format(
            rank, imagenet_class_index[str(classid)][1], 100 * prob))