def main():
    """Parse CLI arguments and launch (optionally distributed) keypoint training.

    Spawns one worker process per GPU when more than one device is requested;
    otherwise runs the worker inline in this process.
    """

    def str2bool(value):
        # FIX: the original used `type=bool`, which is a known argparse trap --
        # bool("False") is True because any non-empty string is truthy, so
        # `--pretrained False` silently enabled the flag.  Parse explicitly.
        if isinstance(value, bool):
            return value
        if value.lower() in ("yes", "true", "t", "1"):
            return True
        if value.lower() in ("no", "false", "f", "0"):
            return False
        raise argparse.ArgumentTypeError("boolean value expected, got %r" % value)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-a",
        "--arch",
        default="simplebaseline_res50",
        type=str,
        choices=[
            "simplebaseline_res50",
            "simplebaseline_res101",
            "simplebaseline_res152",
        ],
    )
    parser.add_argument("--pretrained", default=True, type=str2bool)
    parser.add_argument("-s", "--save", default="/data/models", type=str)
    parser.add_argument("--data_root", default="/data/coco/images/", type=str)
    parser.add_argument(
        "--ann_file",
        default="/data/coco/annotations/person_keypoints_train2017.json",
        type=str,
    )
    # NOTE: "continue" is a Python keyword, so this value is only reachable via
    # getattr(args, "continue").  Flag name kept for CLI compatibility.
    parser.add_argument("--continue", default=None, type=str)
    parser.add_argument("-b", "--batch_size", default=64, type=int)
    parser.add_argument("--lr", default=6e-4, type=float)
    parser.add_argument("--epochs", default=200, type=int)
    parser.add_argument("--multi_scale_supervision", default=True, type=str2bool)
    parser.add_argument("-n", "--ngpus", default=8, type=int)
    parser.add_argument("-w", "--workers", default=8, type=int)
    parser.add_argument("--report-freq", default=10, type=int)
    args = parser.parse_args()

    # Checkpoints and the log file live under <save>/<arch>_<H>x<W>.
    model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0], cfg.input_shape[1])
    save_dir = os.path.join(args.save, model_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    # Auto-detect GPU count only when --ngpus is explicitly None.
    world_size = mge.get_device_count("gpu") if args.ngpus is None else args.ngpus

    if world_size > 1:
        # scale learning rate by number of gpus (linear scaling rule)
        args.lr *= world_size
        # start distributed training, dispatch sub-processes
        processes = []
        for rank in range(world_size):
            p = mp.Process(target=worker, args=(rank, world_size, args))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
    else:
        worker(0, 1, args)
def main():
    """Entry point: parse options, configure logging/config, and spawn workers."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-a",
        "--arch",
        default="simplebaseline_res50",
        type=str,
        choices=cfg.model_choices,
    )
    parser.add_argument("-s", "--save", default="/data/models", type=str)
    parser.add_argument("-b", "--batch_size", default=32, type=int)
    parser.add_argument("-lr", "--initial_lr", default=3e-4, type=float)
    parser.add_argument("--resume", default=None, type=str)
    parser.add_argument("--multi_scale_supervision", action="store_true")
    parser.add_argument("-n", "--ngpus", default=8, type=int)
    parser.add_argument("-w", "--workers", default=8, type=int)
    args = parser.parse_args()

    # Checkpoints and logs live under <save>/<arch>_<H>x<W>.
    model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0], cfg.input_shape[1])
    save_dir = os.path.join(args.save, model_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    # Propagate CLI overrides into the shared config object.
    if args.batch_size != cfg.batch_size:
        cfg.batch_size = args.batch_size
    if args.initial_lr != cfg.initial_lr:
        cfg.initial_lr = args.initial_lr

    world_size = mge.get_device_count("gpu") if args.ngpus is None else args.ngpus

    if world_size <= 1:
        # Single-device run: no coordination server, no sub-processes.
        worker(None, None, 0, 1, args)
        return

    # Distributed run: one sub-process per GPU, coordinated via a local server.
    master_ip = "localhost"
    port = dist.get_free_ports(1)[0]
    dist.Server(port)
    # NOTE(review): this scales weight decay with the number of GPUs, while the
    # original comment said "scale learning rate" -- confirm which is intended.
    cfg.weight_decay *= world_size
    children = []
    for rank in range(world_size):
        proc = mp.Process(
            target=worker, args=(master_ip, port, rank, world_size, args)
        )
        proc.start()
        children.append(proc)
    for proc in children:
        proc.join()
def main():
    """Parse command-line options and launch single- or multi-GPU training."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-a",
        "--arch",
        default="resnet50",
        type=str,
        choices=[
            "resnet18",
            "resnet34",
            "resnet50",
            "resnet101",
            "resnet152",
            "resnext50_32x4d",
            "resnext101_32x8d",
        ],
    )
    parser.add_argument("-d", "--data", default=None, type=str)
    parser.add_argument("-s", "--save", default="/data/models", type=str)
    parser.add_argument("-b", "--batch-size", default=32, type=int)
    parser.add_argument("--learning-rate", default=0.0125, type=float)
    parser.add_argument("--momentum", default=0.9, type=float)
    parser.add_argument("--weight-decay", default=1e-4, type=float)
    parser.add_argument("--epochs", default=90, type=int)
    parser.add_argument("-n", "--ngpus", default=None, type=int)
    parser.add_argument("-w", "--workers", default=4, type=int)
    parser.add_argument("--report-freq", default=50, type=int)
    args = parser.parse_args()

    # Checkpoints and the log file go under <save>/<arch>.
    save_dir = os.path.join(args.save, args.arch)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    # Auto-detect the GPU count unless the user pinned it via --ngpus.
    world_size = mge.get_device_count("gpu") if args.ngpus is None else args.ngpus

    if world_size <= 1:
        worker(0, 1, args)
        return

    # scale learning rate by number of gpus
    args.learning_rate *= world_size
    # start distributed training, dispatch sub-processes
    mp.set_start_method("spawn")
    children = []
    for rank in range(world_size):
        proc = mp.Process(target=worker, args=(rank, world_size, args))
        proc.start()
        children.append(proc)
    for proc in children:
        proc.join()
def main():
    """Parse CLI options, prepare output directories, and launch training.

    Spawns one worker process per GPU when more than one device is used;
    otherwise runs the worker inline in this process.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-a", "--arch", default="shufflenet_v2_x0_5", type=str)
    parser.add_argument("-d", "--data", default=None, type=str)
    parser.add_argument("-s", "--save", default="./models", type=str)
    parser.add_argument("-m", "--model", default=None, type=str)
    # FIX: the help string previously contained the invalid escape sequence
    # "\w" ("checkpoints \w tensorboard"), which triggers a SyntaxWarning /
    # DeprecationWarning on modern Python.  Spelled out as "with".
    parser.add_argument(
        "-o",
        "--output",
        type=str,
        required=True,
        help="set path for checkpoints with tensorboard",
    )
    parser.add_argument("-b", "--batch-size", default=128, type=int)
    parser.add_argument("--learning-rate", default=0.0625, type=float)
    parser.add_argument("--momentum", default=0.9, type=float)
    parser.add_argument("--weight-decay", default=4e-5, type=float)
    parser.add_argument("--steps", default=300000, type=int)
    parser.add_argument("-n", "--ngpus", default=None, type=int)
    parser.add_argument("-w", "--workers", default=4, type=int)
    parser.add_argument("--report-freq", default=50, type=int)
    args = parser.parse_args()

    world_size = mge.get_device_count("gpu") if args.ngpus is None else args.ngpus

    # The save directory encodes the *total* batch size across all GPUs.
    save_dir = os.path.join(
        args.save, args.arch, "b{}".format(args.batch_size * world_size)
    )
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    if not os.path.exists(args.output):
        os.makedirs(args.output)

    if world_size > 1:
        # scale learning rate by number of gpus (linear scaling rule)
        args.learning_rate *= world_size
        # start distributed training, dispatch sub-processes
        mp.set_start_method("spawn")
        processes = []
        for rank in range(world_size):
            p = mp.Process(target=worker, args=(rank, world_size, args))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
    else:
        worker(0, 1, args)
def main(args):
    """Evaluate a trained model on MegaFace and log its top-1 score."""
    configs = load_config_from_path(args.config_file)
    # Default to evaluating the final epoch when --epoch is not given.
    if args.epoch is not None:
        configs["evaluate_epoch"] = args.epoch
    else:
        configs["evaluate_epoch"] = configs["num_epoch"]

    # write log to worklog.txt
    os.makedirs(configs["base_dir"], exist_ok=True)
    mge.set_log_file(os.path.join(configs["base_dir"], "worklog.txt"))

    infer_fn = get_inference_func(configs)
    fs_feature, fs_label, mf_feature = extract_feature_and_clean_noise(
        configs, infer_fn
    )
    score = calculate_score(configs, fs_feature, fs_label, mf_feature)
    logger.info("Epoch: %d", configs["evaluate_epoch"])
    logger.info("MegaFace Top1: %.2f", score)
def main():
    """Parse options and launch training through dist.launcher (one proc/GPU)."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-a",
        "--arch",
        default="simplebaseline_res50",
        type=str,
        choices=cfg.model_choices,
    )
    parser.add_argument("-s", "--save", default="/data/models", type=str)
    parser.add_argument("-b", "--batch_size", default=32, type=int)
    parser.add_argument("-lr", "--initial_lr", default=3e-4, type=float)
    parser.add_argument("--resume", default=None, type=str)
    parser.add_argument("--multi_scale_supervision", action="store_true")
    parser.add_argument("-n", "--ngpus", default=8, type=int)
    parser.add_argument("-w", "--workers", default=8, type=int)
    args = parser.parse_args()

    # Checkpoints and logs live under <save>/<arch>_<H>x<W>.
    model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0], cfg.input_shape[1])
    save_dir = os.path.join(args.save, model_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    # Push CLI overrides into the shared config object.
    if args.batch_size != cfg.batch_size:
        cfg.batch_size = args.batch_size
    if args.initial_lr != cfg.initial_lr:
        cfg.initial_lr = args.initial_lr

    if args.ngpus is None:
        args.ngpus = dist.helper.get_device_count_by_fork("gpu")

    if args.ngpus <= 1:
        worker(args)
        return

    # NOTE(review): this scales weight decay with the GPU count, while the
    # original comment said "scale learning rate" -- confirm which is intended.
    cfg.weight_decay *= args.ngpus
    # dist.launcher forks one worker per GPU and handles process-group setup.
    dist_worker = dist.launcher(n_gpus=args.ngpus)(worker)
    dist_worker(args)
def worker(rank, world_size, args):
    # pylint: disable=too-many-statements
    # Per-process training entry point (old MegEngine jit.trace API).
    # Trains `args.arch` on ImageNet for `args.steps` iterations with linear
    # LR decay, periodically saving checkpoints and running validation.
    mge.set_log_file(os.path.join(args.save, args.arch, "log.txt"))
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(
            rank, world_size))
        dist.init_process_group(
            master_ip="localhost",
            master_port=23456,
            world_size=world_size,
            rank=rank,
            dev=rank,
        )
    save_dir = os.path.join(args.save, args.arch)

    # Look the architecture up by name in the models module M.
    model = getattr(M, args.arch)()

    step_start = 0
    if args.model:
        # Resume: weights come from the checkpoint, and the starting step is
        # parsed out of the filename ("checkpoint-<step>.pkl").
        logger.info("load weights from %s", args.model)
        model.load_state_dict(mge.load(args.model))
        step_start = int(args.model.split("-")[1].split(".")[0])

    optimizer = optim.SGD(
        get_parameters(model),
        lr=args.learning_rate,
        momentum=args.momentum,
        weight_decay=args.weight_decay,
    )

    # Define train and valid graph
    @jit.trace(symbolic=True)
    def train_func(image, label):
        model.train()
        logits = model(image)
        loss = F.cross_entropy_with_softmax(logits, label, label_smooth=0.)
        acc1, acc5 = F.accuracy(logits, label, (1, 5))
        optimizer.backward(loss)  # compute gradients
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.all_reduce_sum(loss, "train_loss") / dist.get_world_size()
            acc1 = dist.all_reduce_sum(acc1, "train_acc1") / dist.get_world_size()
            acc5 = dist.all_reduce_sum(acc5, "train_acc5") / dist.get_world_size()
        return loss, acc1, acc5

    @jit.trace(symbolic=True)
    def valid_func(image, label):
        model.eval()
        logits = model(image)
        loss = F.cross_entropy_with_softmax(logits, label, label_smooth=0.)
        acc1, acc5 = F.accuracy(logits, label, (1, 5))
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.all_reduce_sum(loss, "valid_loss") / dist.get_world_size()
            acc1 = dist.all_reduce_sum(acc1, "valid_acc1") / dist.get_world_size()
            acc5 = dist.all_reduce_sum(acc5, "valid_acc5") / dist.get_world_size()
        return loss, acc1, acc5

    # Build train and valid datasets
    logger.info("preparing dataset..")
    train_dataset = data.dataset.ImageNet(args.data, train=True)
    # Infinite sampler: the step loop below, not the dataset size, bounds training.
    train_sampler = data.Infinite(
        data.RandomSampler(train_dataset, batch_size=args.batch_size,
                           drop_last=True))
    train_queue = data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        transform=T.Compose([
            T.RandomResizedCrop(224),
            T.RandomHorizontalFlip(),
            T.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4),
            T.ToMode("CHW"),
        ]),
        num_workers=args.workers,
    )
    valid_dataset = data.dataset.ImageNet(args.data, train=False)
    valid_sampler = data.SequentialSampler(valid_dataset, batch_size=100,
                                           drop_last=False)
    valid_queue = data.DataLoader(
        valid_dataset,
        sampler=valid_sampler,
        transform=T.Compose([
            T.Resize(256),
            T.CenterCrop(224),
            T.ToMode("CHW"),
        ]),
        num_workers=args.workers,
    )

    # Start training
    objs = AverageMeter("Loss")
    top1 = AverageMeter("Acc@1")
    top5 = AverageMeter("Acc@5")
    total_time = AverageMeter("Time")
    t = time.time()
    for step in range(step_start, args.steps + 1):
        # Linear learning rate decay
        decay = 1.0  # NOTE: dead assignment -- immediately overwritten below
        decay = 1 - float(step) / args.steps if step < args.steps else 0
        for param_group in optimizer.param_groups:
            param_group["lr"] = args.learning_rate * decay

        # NOTE(review): next() is called on the DataLoader without an explicit
        # iter(); presumably this DataLoader version is directly iterable --
        # confirm against the installed MegEngine release.
        image, label = next(train_queue)
        time_data = time.time() - t
        image = image.astype("float32")
        label = label.astype("int32")
        n = image.shape[0]
        optimizer.zero_grad()
        loss, acc1, acc5 = train_func(image, label)
        optimizer.step()
        top1.update(100 * acc1.numpy()[0], n)
        top5.update(100 * acc5.numpy()[0], n)
        objs.update(loss.numpy()[0], n)
        total_time.update(time.time() - t)
        time_iter = time.time() - t
        t = time.time()
        if step % args.report_freq == 0 and rank == 0:
            # The meters' averages are recovered by parsing their str() form
            # ("<name> <avg>") -- fragile, but kept as-is.
            logger.info(
                "TRAIN Iter %06d: lr = %f,\tloss = %f,\twc_loss = 1,\tTop-1 err = %f,\tTop-5 err = %f,\tdata_time = %f,\ttrain_time = %f,\tremain_hours=%f",
                step,
                args.learning_rate * decay,
                float(objs.__str__().split()[1]),
                1 - float(top1.__str__().split()[1]) / 100,
                1 - float(top5.__str__().split()[1]) / 100,
                time_data,
                time_iter - time_data,
                time_iter * (args.steps - step) / 3600,
            )
            objs.reset()
            top1.reset()
            top5.reset()
            total_time.reset()
        # Rank 0 checkpoints every 10k steps (skipping step 0).
        if step % 10000 == 0 and rank == 0 and step != 0:
            logger.info("SAVING %06d", step)
            mge.save(
                model.state_dict(),
                os.path.join(save_dir, "checkpoint-{:06d}.pkl".format(step)),
            )
        # Full validation pass every 50k steps (all ranks participate).
        if step % 50000 == 0 and step != 0:
            _, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args)
            logger.info(
                "TEST Iter %06d: loss = %f,\tTop-1 err = %f,\tTop-5 err = %f",
                step, _, 1 - valid_acc / 100, 1 - valid_acc5 / 100)

    # Final checkpoint and validation after the loop completes.
    mge.save(model.state_dict(),
             os.path.join(save_dir, "checkpoint-{:06d}.pkl".format(step)))
    _, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args)
    logger.info("TEST Iter %06d: loss=%f,\tTop-1 err = %f,\tTop-5 err = %f",
                step, _, 1 - valid_acc / 100, 1 - valid_acc5 / 100)
import numpy as np import tensorflow as tf from datetime import datetime import megengine as mge import megengine.functional as F from megengine.data import RandomSampler, SequentialSampler, DataLoader from megengine.data.dataset import MNIST from megengine.data.transform import RandomResizedCrop, Normalize, ToMode, Pad, Compose import megengine.optimizer as optim mge.set_log_file('log.txt') logger = mge.get_logger(__name__) #logdir = "logs/scalars/" + datetime.now().strftime("%Y%m%d-%H%M%S") # dataset root_dir = '/data/.cache/dataset/MNIST' mnist_train_dataset = MNIST(root=root_dir, train=True, download=False) mnist_test_dataset = MNIST(root=root_dir, train=False, download=False) random_sampler = RandomSampler(dataset=mnist_train_dataset, batch_size=256) sequential_sampler = SequentialSampler(dataset=mnist_test_dataset, batch_size=256) mnist_train_dataloader = DataLoader( dataset=mnist_train_dataset, sampler=random_sampler, transform=Compose([ RandomResizedCrop(output_size=28),
def worker(master_ip, port, world_size, rank, configs):
    # Per-process training entry point for the face recognition model.
    # Joins the process group (multi-GPU case), builds data/model/optimizer,
    # then trains epoch by epoch, checkpointing from rank 0 after each epoch.
    if world_size > 1:
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,
        )
        logger.info("init process group for gpu{} done".format(rank))

    # set up logger
    os.makedirs(configs["base_dir"], exist_ok=True)
    worklog_path = os.path.join(configs["base_dir"], "worklog.txt")
    mge.set_log_file(worklog_path)

    # prepare model-related components
    model = FaceRecognitionModel(configs)

    # prepare data-related components
    preprocess = T.Compose([T.Normalize(mean=127.5, std=128), T.ToMode("CHW")])
    augment = T.Compose([T.RandomHorizontalFlip()])

    train_dataset = get_train_dataset(configs["dataset"],
                                      dataset_dir=configs["dataset_dir"])
    train_sampler = data.RandomSampler(train_dataset,
                                       batch_size=configs["batch_size"],
                                       drop_last=True)
    train_queue = data.DataLoader(train_dataset, sampler=train_sampler,
                                  transform=T.Compose([augment, preprocess]))

    # prepare optimize-related components
    # Linear LR scaling with the number of participating processes.
    configs["learning_rate"] = configs["learning_rate"] * dist.get_world_size()
    if dist.get_world_size() > 1:
        # Sync initial parameters, then average gradients across ranks.
        dist.bcast_list_(model.parameters())
        gm = ad.GradManager().attach(
            model.parameters(), callbacks=[dist.make_allreduce_cb("mean")])
    else:
        gm = ad.GradManager().attach(model.parameters())
    opt = optim.SGD(
        model.parameters(),
        lr=configs["learning_rate"],
        momentum=configs["momentum"],
        weight_decay=configs["weight_decay"],
    )

    # try to load checkpoint
    model, start_epoch = try_load_latest_checkpoint(model, configs["base_dir"])

    # do training
    def train_one_epoch():
        # Runs one full pass over train_queue; reads `epoch` from the
        # enclosing loop for logging.
        def train_func(images, labels):
            opt.clear_grad()
            with gm:
                loss, accuracy, _ = model(images, labels)
                gm.backward(loss)
                if dist.is_distributed():
                    # all_reduce_mean (for reporting only -- gradients are
                    # already averaged by the GradManager callback)
                    loss = dist.functional.all_reduce_sum(
                        loss) / dist.get_world_size()
                    accuracy = dist.functional.all_reduce_sum(
                        accuracy) / dist.get_world_size()
            opt.step()
            return loss, accuracy

        model.train()
        average_loss = AverageMeter("loss")
        average_accuracy = AverageMeter("accuracy")
        data_time = AverageMeter("data_time")
        train_time = AverageMeter("train_time")

        total_step = len(train_queue)
        data_iter = iter(train_queue)
        for step in range(total_step):
            # get next batch of data
            data_tic = time.time()
            images, labels = next(data_iter)
            data_toc = time.time()

            # forward pass & backward pass
            train_tic = time.time()
            images = mge.tensor(images, dtype="float32")
            labels = mge.tensor(labels, dtype="int32")
            loss, accuracy = train_func(images, labels)
            train_toc = time.time()

            # do the statistics and logging
            n = images.shape[0]
            average_loss.update(loss.item(), n)
            average_accuracy.update(accuracy.item() * 100, n)
            data_time.update(data_toc - data_tic)
            train_time.update(train_toc - train_tic)
            if step % configs["log_interval"] == 0 and dist.get_rank() == 0:
                logger.info(
                    "epoch: %d, step: %d, %s, %s, %s, %s",
                    epoch,
                    step,
                    average_loss,
                    average_accuracy,
                    data_time,
                    train_time,
                )

    for epoch in range(start_epoch, configs["num_epoch"]):
        adjust_learning_rate(opt, epoch, configs)
        train_one_epoch()

        # Only rank 0 writes checkpoints; one file per completed epoch.
        if dist.get_rank() == 0:
            checkpoint_path = os.path.join(
                configs["base_dir"], f"epoch-{epoch+1}-checkpoint.pkl")
            mge.save(
                {
                    "epoch": epoch + 1,
                    "state_dict": model.state_dict()
                },
                checkpoint_path,
            )
def worker(rank, world_size, args):
    # pylint: disable=too-many-statements
    # Per-process entry point for quantization-aware fine-tuning on ImageNet
    # (old MegEngine jit.trace API).  In non-"normal" modes the model is
    # converted to fake-quantized form before training.
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(
            rank, world_size))
        dist.init_process_group(
            master_ip="localhost",
            master_port=23456,
            world_size=world_size,
            rank=rank,
            dev=rank,
        )

    save_dir = os.path.join(args.save, args.arch + "." + args.mode)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir, exist_ok=True)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    model = models.__dict__[args.arch]()
    cfg = config.get_finetune_config(args.arch)

    cfg.LEARNING_RATE *= world_size  # scale learning rate in distributed training
    total_batch_size = cfg.BATCH_SIZE * world_size
    # 1280000 ~= the number of ImageNet train images; defines epoch length.
    steps_per_epoch = 1280000 // total_batch_size
    total_steps = steps_per_epoch * cfg.EPOCHS

    if args.mode != "normal":
        # Swap float modules for fake-quantized ones (EMA observer config).
        Q.quantize_qat(model, Q.ema_fakequant_qconfig)

    if args.checkpoint:
        logger.info("Load pretrained weights from %s", args.checkpoint)
        ckpt = mge.load(args.checkpoint)
        ckpt = ckpt["state_dict"] if "state_dict" in ckpt else ckpt
        model.load_state_dict(ckpt, strict=False)

    if args.mode == "quantized":
        raise ValueError("mode = quantized only used during inference")
        Q.quantize(model)  # NOTE: unreachable after the raise; kept from original

    optimizer = optim.SGD(
        get_parameters(model, cfg),
        lr=cfg.LEARNING_RATE,
        momentum=cfg.MOMENTUM,
    )

    # Define train and valid graph
    @jit.trace(symbolic=True)
    def train_func(image, label):
        model.train()
        logits = model(image)
        loss = F.cross_entropy_with_softmax(logits, label, label_smooth=0.1)
        acc1, acc5 = F.accuracy(logits, label, (1, 5))
        optimizer.backward(loss)  # compute gradients
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.all_reduce_sum(loss, "train_loss") / dist.get_world_size()
            acc1 = dist.all_reduce_sum(acc1, "train_acc1") / dist.get_world_size()
            acc5 = dist.all_reduce_sum(acc5, "train_acc5") / dist.get_world_size()
        return loss, acc1, acc5

    @jit.trace(symbolic=True)
    def valid_func(image, label):
        model.eval()
        logits = model(image)
        loss = F.cross_entropy_with_softmax(logits, label, label_smooth=0.1)
        acc1, acc5 = F.accuracy(logits, label, (1, 5))
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.all_reduce_sum(loss, "valid_loss") / dist.get_world_size()
            acc1 = dist.all_reduce_sum(acc1, "valid_acc1") / dist.get_world_size()
            acc5 = dist.all_reduce_sum(acc5, "valid_acc5") / dist.get_world_size()
        return loss, acc1, acc5

    # Build train and valid datasets
    logger.info("preparing dataset..")
    train_dataset = data.dataset.ImageNet(args.data, train=True)
    train_sampler = data.Infinite(
        data.RandomSampler(train_dataset, batch_size=cfg.BATCH_SIZE,
                           drop_last=True))
    train_queue = data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        transform=T.Compose([
            T.RandomResizedCrop(224),
            T.RandomHorizontalFlip(),
            cfg.COLOR_JITTOR,
            T.Normalize(mean=128),
            T.ToMode("CHW"),
        ]),
        num_workers=args.workers,
    )
    train_queue = iter(train_queue)
    valid_dataset = data.dataset.ImageNet(args.data, train=False)
    valid_sampler = data.SequentialSampler(valid_dataset, batch_size=100,
                                           drop_last=False)
    valid_queue = data.DataLoader(
        valid_dataset,
        sampler=valid_sampler,
        transform=T.Compose([
            T.Resize(256),
            T.CenterCrop(224),
            T.Normalize(mean=128),
            T.ToMode("CHW"),
        ]),
        num_workers=args.workers,
    )

    def adjust_learning_rate(step, epoch):
        # Applies the configured schedule ("Linear" or "Multistep") to every
        # param group and returns the LR actually set (for logging).
        learning_rate = cfg.LEARNING_RATE
        if cfg.SCHEDULER == "Linear":
            learning_rate *= 1 - float(step) / total_steps
        elif cfg.SCHEDULER == "Multistep":
            learning_rate *= cfg.SCHEDULER_GAMMA**bisect.bisect_right(
                cfg.SCHEDULER_STEPS, epoch)
        else:
            raise ValueError(cfg.SCHEDULER)
        for param_group in optimizer.param_groups:
            param_group["lr"] = learning_rate
        return learning_rate

    # Start training
    objs = AverageMeter("Loss")
    top1 = AverageMeter("Acc@1")
    top5 = AverageMeter("Acc@5")
    total_time = AverageMeter("Time")
    t = time.time()
    for step in range(0, total_steps):
        # Linear learning rate decay
        epoch = step // steps_per_epoch
        learning_rate = adjust_learning_rate(step, epoch)

        image, label = next(train_queue)
        image = image.astype("float32")
        label = label.astype("int32")

        n = image.shape[0]
        optimizer.zero_grad()
        loss, acc1, acc5 = train_func(image, label)
        optimizer.step()

        top1.update(100 * acc1.numpy()[0], n)
        top5.update(100 * acc5.numpy()[0], n)
        objs.update(loss.numpy()[0], n)
        total_time.update(time.time() - t)
        t = time.time()
        if step % args.report_freq == 0 and rank == 0:
            logger.info("TRAIN e%d %06d %f %s %s %s %s", epoch, step,
                        learning_rate, objs, top1, top5, total_time)
            objs.reset()
            top1.reset()
            top5.reset()
            total_time.reset()
        # Rank 0 overwrites a rolling checkpoint every 10k steps (incl. step 0).
        if step % 10000 == 0 and rank == 0:
            logger.info("SAVING %06d", step)
            mge.save(
                {
                    "step": step,
                    "state_dict": model.state_dict()
                },
                os.path.join(save_dir, "checkpoint.pkl"),
            )
        # Validation every 10k steps (all ranks participate), skipping step 0.
        if step % 10000 == 0 and step != 0:
            _, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args)
            logger.info("TEST %06d %f, %f", step, valid_acc, valid_acc5)

    # Final checkpoint and validation after the loop completes.
    mge.save({
        "step": step,
        "state_dict": model.state_dict()
    }, os.path.join(save_dir, "checkpoint-final.pkl"))
    _, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args)
    logger.info("TEST %06d %f, %f", step, valid_acc, valid_acc5)
def worker(world_size, args):
    # pylint: disable=too-many-statements
    # Per-process training entry point (new MegEngine GradManager API).
    # Assumes the process group is already set up by the launcher; trains
    # args.arch on ImageNet with optional quantization-aware training.
    rank = dist.get_rank()
    if world_size > 1:
        logger.info("init distributed process group {} / {}".format(
            rank, world_size))

    save_dir = os.path.join(args.save, args.arch + "." + args.mode)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir, exist_ok=True)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    model = models.__dict__[args.arch]()
    cfg = config.get_config(args.arch)

    cfg.LEARNING_RATE *= world_size  # scale learning rate in distributed training
    total_batch_size = cfg.BATCH_SIZE * world_size
    # 1280000 ~= the number of ImageNet train images; defines epoch length.
    steps_per_epoch = 1280000 // total_batch_size
    total_steps = steps_per_epoch * cfg.EPOCHS

    if args.mode != "normal":
        # Swap float modules for fake-quantized ones (EMA observer config).
        quantize_qat(model, qconfig=Q.ema_fakequant_qconfig)

    if world_size > 1:
        # Sync parameters
        dist.bcast_list_(model.parameters(), dist.WORLD)

    # Autodiff gradient manager
    gm = autodiff.GradManager().attach(
        model.parameters(),
        callbacks=dist.make_allreduce_cb("MEAN") if world_size > 1 else None,
    )

    optimizer = optim.SGD(
        get_parameters(model, cfg),
        lr=cfg.LEARNING_RATE,
        momentum=cfg.MOMENTUM,
    )

    # Define train and valid graph
    def train_func(image, label):
        with gm:
            model.train()
            logits = model(image)
            loss = F.loss.cross_entropy(logits, label, label_smooth=0.1)
            acc1, acc5 = F.topk_accuracy(logits, label, (1, 5))
            gm.backward(loss)
            optimizer.step().clear_grad()
        return loss, acc1, acc5

    def valid_func(image, label):
        model.eval()
        logits = model(image)
        loss = F.loss.cross_entropy(logits, label, label_smooth=0.1)
        acc1, acc5 = F.topk_accuracy(logits, label, (1, 5))
        return loss, acc1, acc5

    # Build train and valid datasets
    logger.info("preparing dataset..")
    train_dataset = data.dataset.ImageNet(args.data, train=True)
    train_sampler = data.Infinite(
        data.RandomSampler(train_dataset, batch_size=cfg.BATCH_SIZE,
                           drop_last=True))
    train_queue = data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        transform=T.Compose([
            T.RandomResizedCrop(224),
            T.RandomHorizontalFlip(),
            cfg.COLOR_JITTOR,
            T.Normalize(mean=128),
            T.ToMode("CHW"),
        ]),
        num_workers=args.workers,
    )
    train_queue = iter(train_queue)
    valid_dataset = data.dataset.ImageNet(args.data, train=False)
    valid_sampler = data.SequentialSampler(valid_dataset, batch_size=100,
                                           drop_last=False)
    valid_queue = data.DataLoader(
        valid_dataset,
        sampler=valid_sampler,
        transform=T.Compose([
            T.Resize(256),
            T.CenterCrop(224),
            T.Normalize(mean=128),
            T.ToMode("CHW")
        ]),
        num_workers=args.workers,
    )

    def adjust_learning_rate(step, epoch):
        # Applies the configured schedule ("Linear" or "Multistep") to every
        # param group and returns the LR actually set (for logging).
        learning_rate = cfg.LEARNING_RATE
        if cfg.SCHEDULER == "Linear":
            learning_rate *= 1 - float(step) / total_steps
        elif cfg.SCHEDULER == "Multistep":
            learning_rate *= cfg.SCHEDULER_GAMMA**bisect.bisect_right(
                cfg.SCHEDULER_STEPS, epoch)
        else:
            raise ValueError(cfg.SCHEDULER)
        for param_group in optimizer.param_groups:
            param_group["lr"] = learning_rate
        return learning_rate

    # Start training
    objs = AverageMeter("Loss")
    top1 = AverageMeter("Acc@1")
    top5 = AverageMeter("Acc@5")
    total_time = AverageMeter("Time")
    t = time.time()
    for step in range(0, total_steps):
        # Linear learning rate decay
        epoch = step // steps_per_epoch
        learning_rate = adjust_learning_rate(step, epoch)

        image, label = next(train_queue)
        image = mge.tensor(image, dtype="float32")
        label = mge.tensor(label, dtype="int32")

        n = image.shape[0]
        loss, acc1, acc5 = train_func(image, label)

        top1.update(100 * acc1.numpy()[0], n)
        top5.update(100 * acc5.numpy()[0], n)
        objs.update(loss.numpy()[0], n)
        total_time.update(time.time() - t)
        t = time.time()
        if step % args.report_freq == 0 and rank == 0:
            logger.info(
                "TRAIN e%d %06d %f %s %s %s %s",
                epoch,
                step,
                learning_rate,
                objs,
                top1,
                top5,
                total_time,
            )
            objs.reset()
            top1.reset()
            top5.reset()
            total_time.reset()
        # Rank 0 overwrites a rolling checkpoint every 10k steps.
        if step != 0 and step % 10000 == 0 and rank == 0:
            logger.info("SAVING %06d", step)
            mge.save(
                {
                    "step": step,
                    "state_dict": model.state_dict()
                },
                os.path.join(save_dir, "checkpoint.pkl"),
            )
        # Validation every 10k steps (all ranks participate).
        if step % 10000 == 0 and step != 0:
            _, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args)
            logger.info("TEST %06d %f, %f", step, valid_acc, valid_acc5)

    # Final checkpoint and validation after the loop completes.
    mge.save(
        {
            "step": step,
            "state_dict": model.state_dict()
        },
        os.path.join(save_dir, "checkpoint-final.pkl"),
    )
    _, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args)
    logger.info("TEST %06d %f, %f", step, valid_acc, valid_acc5)
def worker(world_size, args):
    # pylint: disable=too-many-statements
    # Post-training calibration pipeline (new MegEngine API): load a float
    # checkpoint, collect activation statistics on the validation set via
    # observers, convert to a truly quantized model, evaluate, and save it.
    rank = dist.get_rank()
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(rank, world_size))

    save_dir = os.path.join(args.save, args.arch + "." + "calibration")
    if not os.path.exists(save_dir):
        os.makedirs(save_dir, exist_ok=True)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    model = models.__dict__[args.arch]()

    # load calibration model
    # NOTE: assert is stripped under `python -O`; a pretrained checkpoint is
    # mandatory here since calibration never trains weights.
    assert args.checkpoint
    logger.info("Load pretrained weights from %s", args.checkpoint)
    ckpt = mge.load(args.checkpoint)
    ckpt = ckpt["state_dict"] if "state_dict" in ckpt else ckpt
    model.load_state_dict(ckpt, strict=False)

    # Build valid datasets
    valid_dataset = data.dataset.ImageNet(args.data, train=False)
    valid_sampler = data.SequentialSampler(
        valid_dataset, batch_size=100, drop_last=False
    )
    valid_queue = data.DataLoader(
        valid_dataset,
        sampler=valid_sampler,
        transform=T.Compose(
            [T.Resize(256), T.CenterCrop(224), T.Normalize(mean=128), T.ToMode("CHW")]
        ),
        num_workers=args.workers,
    )

    # calibration
    # The final classifier stays in float; only the backbone is quantized.
    model.fc.disable_quantize()
    model = quantize_qat(model, qconfig=Q.calibration_qconfig)

    # calculate scale
    def calculate_scale(image, label):
        # Forward-only pass with observers enabled to record activation ranges.
        model.eval()
        enable_observer(model)
        logits = model(image)
        loss = F.loss.cross_entropy(logits, label, label_smooth=0.1)
        acc1, acc5 = F.topk_accuracy(logits, label, (1, 5))
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.functional.all_reduce_sum(loss) / dist.get_world_size()
            acc1 = dist.functional.all_reduce_sum(acc1) / dist.get_world_size()
            acc5 = dist.functional.all_reduce_sum(acc5) / dist.get_world_size()
        return loss, acc1, acc5

    infer(calculate_scale, valid_queue, args)

    # quantized
    model = quantize(model)

    # eval quantized model
    def eval_func(image, label):
        model.eval()
        logits = model(image)
        loss = F.loss.cross_entropy(logits, label, label_smooth=0.1)
        acc1, acc5 = F.topk_accuracy(logits, label, (1, 5))
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.functional.all_reduce_sum(loss) / dist.get_world_size()
            acc1 = dist.functional.all_reduce_sum(acc1) / dist.get_world_size()
            acc5 = dist.functional.all_reduce_sum(acc5) / dist.get_world_size()
        return loss, acc1, acc5

    _, valid_acc, valid_acc5 = infer(eval_func, valid_queue, args)
    logger.info("TEST %f, %f", valid_acc, valid_acc5)

    # save quantized model
    mge.save(
        {"step": -1, "state_dict": model.state_dict()},
        os.path.join(save_dir, "checkpoint-calibration.pkl"),
    )
    logger.info(
        "save in {}".format(os.path.join(save_dir, "checkpoint-calibration.pkl"))
    )
from megengine import optimizer as optim import megengine.autodiff as autodiff from megengine import jit # import dataset import network from config import config as cfg from dataset.CrowdHuman import CrowdHuman from misc_utils import ensure_dir from megengine.core._imperative_rt.utils import Logger from megengine import data import pdb ensure_dir(cfg.output_dir) logger = mge.get_logger(__name__) log_path = osp.join(cfg.output_dir, 'logger.log') mge.set_log_file(log_path, mode='a') Logger.set_log_level(Logger.LogLevel.Error) def find_free_port(): import socket sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) # Binding to port 0 will cause the OS to find an available port for us sock.bind(("", 0)) port = sock.getsockname()[1] sock.close() # NOTE: there is still a chance the port could be taken by other processes. return port def allreduce_cb(param, grad, group=dist.WORLD): return dist.functional.all_reduce_sum(grad, group) / group.size def train_one_epoch(model, gm, data_iter, opt, max_steps, rank, epoch_id, gpu_num):
logger = mge.get_logger(__name__) if __name__ == '__main__': import argparse parser = argparse.ArgumentParser() parser.add_argument("-e", "--embedding", required=True, type=int, help="") parser.add_argument("-i", "--input", required=True, type=int, help="") parser.add_argument("-s", "--save", required=True, type=str, help="") parser.add_argument("--epoch", required=True, type=int, help="") args = parser.parse_args() mkdir_p(args.save) logfile = os.path.join(args.save, "log.txt") #open(logfile, 'w') mge.set_log_file(logfile) train_dataset = PCADataset(args.input, setting.points_num, setting.batch_size) test_dataset = PCADataset(args.input, setting.points_num, batch_size=setting.batch_size, istrain=False) model = TwolayerFC(args.embedding, args.input) data = mge.tensor(dtype='float32') label = mge.tensor(dtype="float32") optimizer = optim.SGD( model.parameters(), # 参数列表,将指定参数与优化器绑定 lr=setting.learning_rate, # 学习速率 ) total_epochs = args.epoch for epoch in range(total_epochs): model.train() train_batch_generator = train_dataset.batch_generator()
def worker(rank, world_size, args):
    """Per-process entry point: calibrate, quantize and evaluate one model.

    Loads pretrained weights, runs observer-based calibration over the
    ImageNet validation set, converts the model to a quantized one, evaluates
    it, and saves the quantized checkpoint. When ``world_size > 1`` each
    process joins a distributed group and metrics are all-reduced.
    """
    # pylint: disable=too-many-statements
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(rank, world_size))
        # NOTE(review): fixed port may collide when several jobs share a host — confirm
        dist.init_process_group(
            master_ip="localhost",
            master_port=23456,
            world_size=world_size,
            rank=rank,
            dev=rank,
        )

    # Per-run output directory: <save>/<arch>.<mode>; all logs go to log.txt there.
    save_dir = os.path.join(args.save, args.arch + "." + args.mode)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir, exist_ok=True)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    model = models.__dict__[args.arch]()
    cfg = config.get_finetune_config(args.arch)

    cfg.LEARNING_RATE *= world_size  # scale learning rate in distributed training
    total_batch_size = cfg.BATCH_SIZE * world_size
    # 1280000 is presumably the ImageNet train-set size used for step accounting — TODO confirm
    steps_per_epoch = 1280000 // total_batch_size
    total_steps = steps_per_epoch * cfg.EPOCHS

    # load calibration model (float checkpoint is required)
    assert args.checkpoint
    logger.info("Load pretrained weights from %s", args.checkpoint)
    ckpt = mge.load(args.checkpoint)
    ckpt = ckpt["state_dict"] if "state_dict" in ckpt else ckpt
    model.load_state_dict(ckpt, strict=False)

    # Build valid datasets
    valid_dataset = data.dataset.ImageNet(args.data, train=False)
    valid_sampler = data.SequentialSampler(
        valid_dataset, batch_size=100, drop_last=False
    )
    valid_queue = data.DataLoader(
        valid_dataset,
        sampler=valid_sampler,
        transform=T.Compose(
            [
                T.Resize(256),
                T.CenterCrop(224),
                T.Normalize(mean=128),
                T.ToMode("CHW"),
            ]
        ),
        num_workers=args.workers,
    )

    # calibration: keep the final fc in float, fake-quantize the rest
    model.fc.disable_quantize()
    model = quantize_qat(model, qconfig=Q.calibration_qconfig)

    # calculate scale: run the valid set once with observers enabled so the
    # quantization scales can be collected
    @jit.trace(symbolic=True)
    def calculate_scale(image, label):
        model.eval()
        enable_observer(model)
        logits = model(image)
        loss = F.cross_entropy_with_softmax(logits, label, label_smooth=0.1)
        acc1, acc5 = F.accuracy(logits, label, (1, 5))
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.all_reduce_sum(loss, "valid_loss") / dist.get_world_size()
            acc1 = dist.all_reduce_sum(acc1, "valid_acc1") / dist.get_world_size()
            acc5 = dist.all_reduce_sum(acc5, "valid_acc5") / dist.get_world_size()
        return loss, acc1, acc5

    infer(calculate_scale, valid_queue, args)

    # quantized: convert the calibrated QAT model to a true quantized model
    model = quantize(model)

    # eval quantized model
    @jit.trace(symbolic=True)
    def eval_func(image, label):
        model.eval()
        logits = model(image)
        loss = F.cross_entropy_with_softmax(logits, label, label_smooth=0.1)
        acc1, acc5 = F.accuracy(logits, label, (1, 5))
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.all_reduce_sum(loss, "valid_loss") / dist.get_world_size()
            acc1 = dist.all_reduce_sum(acc1, "valid_acc1") / dist.get_world_size()
            acc5 = dist.all_reduce_sum(acc5, "valid_acc5") / dist.get_world_size()
        return loss, acc1, acc5

    _, valid_acc, valid_acc5 = infer(eval_func, valid_queue, args)
    logger.info("TEST %f, %f", valid_acc, valid_acc5)

    # save quantized model
    mge.save(
        {"step": -1, "state_dict": model.state_dict()},
        os.path.join(save_dir, "checkpoint-calibration.pkl")
    )
    logger.info("save in {}".format(os.path.join(save_dir, "checkpoint-calibration.pkl")))
def main():
    """Evaluate keypoint models on COCO val2017 over a range of epochs.

    For each epoch checkpoint, runs distributed inference via ``worker``,
    dumps the predictions to JSON, and scores them with COCOeval against
    ``person_keypoints_val2017.json``, logging the standard keypoint metrics.
    """
    from pycocotools.coco import COCO
    from pycocotools.cocoeval import COCOeval

    parser = make_parser()
    args = parser.parse_args()

    model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0], cfg.input_shape[1])
    save_dir = os.path.join(args.save_dir, model_name)
    # exist_ok avoids the check-then-create race of the old exists()/makedirs pair
    os.makedirs(save_dir, exist_ok=True)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    # Fall back to the forked-probe GPU count only when --ngpus was not given.
    args.ngpus = (
        dist.helper.get_device_count_by_fork("gpu")
        if args.ngpus is None
        else args.ngpus
    )
    cfg.batch_size = cfg.batch_size if args.batch_size is None else args.batch_size

    # Load person detection results; `with` guarantees the handle is closed
    # (the old json.load(open(...)) leaked it).
    dt_path = os.path.join(cfg.data_root, "person_detection_results", args.dt_file)
    with open(dt_path, "r") as f:
        dets = json.load(f)

    gt_path = os.path.join(
        cfg.data_root, "annotations", "person_keypoints_val2017.json"
    )
    eval_gt = COCO(gt_path)
    gt = eval_gt.dataset

    # Keep only person detections (category_id == 1) on images present in the GT set.
    dets = [
        i for i in dets if (i["image_id"] in eval_gt.imgs and i["category_id"] == 1)
    ]
    ann_file = {"images": gt["images"], "annotations": dets}

    if args.end_epoch == -1:
        args.end_epoch = args.start_epoch

    for epoch_num in range(args.start_epoch, args.end_epoch + 1, args.test_freq):
        # An explicit --model path overrides the per-epoch checkpoint naming.
        if args.model:
            model_file = args.model
        else:
            model_file = "{}/epoch_{}.pkl".format(args.model_dir, epoch_num)
        logger.info("Load Model : %s completed", model_file)

        # Fan inference out over the GPUs; each worker returns a result list.
        dist_worker = dist.launcher(n_gpus=args.ngpus)(worker)
        all_results = dist_worker(args.arch, model_file, cfg.data_root, ann_file)
        all_results = sum(all_results, [])  # flatten per-GPU lists

        json_name = "log-of-{}_epoch_{}.json".format(args.arch, epoch_num)
        json_path = os.path.join(save_dir, json_name)
        all_results = json.dumps(all_results)
        with open(json_path, "w") as fo:
            fo.write(all_results)
        logger.info("Save to %s finished, start evaluation!", json_path)

        # Standard COCO keypoint evaluation protocol.
        eval_dt = eval_gt.loadRes(json_path)
        cocoEval = COCOeval(eval_gt, eval_dt, iouType="keypoints")
        cocoEval.evaluate()
        cocoEval.accumulate()
        cocoEval.summarize()

        metrics = [
            "AP",
            "[email protected]",
            "[email protected]",
            "APm",
            "APl",
            "AR",
            "[email protected]",
            "[email protected]",
            "ARm",
            "ARl",
        ]
        logger.info("mmAP".center(32, "-"))
        for i, m in enumerate(metrics):
            logger.info("|\t%s\t|\t%.03f\t|", m, cocoEval.stats[i])
        logger.info("-" * 32)