Example #1
def main():
	parser = argparse.ArgumentParser(
		description='FCOS Detector Training With PyTorch')

	parser.add_argument(
		'--dataset-style', type=str, required=True,
		help="style of dataset (supported are 'pascal-voc' and 'coco')")
	parser.add_argument('--dataset', required=True, help='dataset path')
	parser.add_argument(
		'--train-image-set', type=str, default="train",
		help='image set (annotation file basename for COCO) '
		'to use for training')
	parser.add_argument(
		'--val-image-set', type=str, default="val",
		help='image set (annotation file basename for COCO) '
		'to use for validation')
	parser.add_argument(
		'--val-dataset', default=None,
		help='separate validation dataset directory path')

	parser.add_argument(
		'--net-config',
		help="path to network architecture configuration file "
		"(take a look into 'preset' directory for the reference)")

	# Params for optimizer
	parser.add_argument(
		'--optimizer', default="ranger",
		help="optimizer to use ('sgd', 'diffgrad', 'adamw', or 'ranger')")
	parser.add_argument(
		'--lr', '--learning-rate', default=1e-3, type=float,
		help='initial learning rate')
	parser.add_argument(
		'--momentum', default=0.9, type=float,
		help='optional momentum for SGD optimizer (default is 0.9)')
	parser.add_argument(
		'--weight-decay', default=5e-4, type=float,
		help='optional weight decay (L2 penalty) '
		'for SGD optimizer (default is 5e-4)')

	parser.add_argument('--backbone-pretrained', action='store_true')
	parser.add_argument(
		'--backbone-weights',
		help='pretrained weights for the backbone model')
	parser.add_argument('--freeze-backbone', action='store_true')

	# Scheduler
	parser.add_argument(
		'--scheduler', default="cosine-wr", type=str,
		help="scheduler for SGD. It can one of 'multi-step' and 'cosine-wr'")

	# Params for Scheduler
	parser.add_argument(
		'--milestones', default="70,100", type=str,
		help="milestones for MultiStepLR")
	parser.add_argument(
		'--t0', default=10, type=int,
		help='T_0 value for Cosine Annealing Warm Restarts.')
	parser.add_argument(
		'--t-mult', default=2, type=int,
		help='T_mult value for Cosine Annealing Warm Restarts.')

	# Train params
	parser.add_argument('--batch-size', default=32, type=int, help='batch size')
	parser.add_argument(
		'--num-epochs', default=120, type=int, help='number of epochs to train')
	parser.add_argument(
		'--num-workers', default=4, type=int,
		help='number of workers used in dataloading')
	parser.add_argument(
		'--val-epochs', default=5, type=int,
		help='perform validation every this many epochs')
	parser.add_argument(
		'--device', type=str,
		help='device to use for training')

	parser.add_argument(
		'--checkpoint-path', default='output',
		help='directory for saving checkpoint models')


	logging.basicConfig(
		stream=sys.stdout, level=logging.INFO,
		format='%(asctime)s - %(levelname)s - %(message)s')

	args = parser.parse_args()
	logging.info(args)

	if args.device is None:
		device = "cuda" if torch.cuda.is_available() else "cpu"
	else:
		device = args.device

	if device.startswith("cuda"):
		logging.info("Use CUDA")

	timer = Timer()

	arch = get_arch(args.net_config)

	bbox_format = dataset_bbox_format(args.dataset_style)

	train_mean, train_std = mean_std(
		args.dataset_style,
		args.dataset,
		args.train_image_set)

	train_transform = processing.train.Pipeline(
		[arch.image_size] * 2,
		train_mean, train_std,
		bbox_format=bbox_format)

	if args.val_dataset is not None:
		val_dataset_root = args.val_dataset
	else:
		val_dataset_root = args.dataset

	val_mean, val_std = mean_std(
		args.dataset_style,
		val_dataset_root,
		args.val_image_set)

	val_transform = processing.test.Pipeline(
		[arch.image_size] * 2,
		val_mean, val_std,
		bbox_format=bbox_format)

	logging.info("Loading datasets...")

	dataset = load_dataset(
			args.dataset_style,
			args.dataset,
			args.train_image_set,
			train_transform)

	num_classes = len(dataset.class_names)

	logging.info("Train dataset size: {}".format(len(dataset)))

	# drop an incomplete final batch so that a batch of size 1
	# never reaches the BatchNorm layers (they raise an error on it)
	drop_last = len(dataset) % args.batch_size > 0

	train_loader = DataLoader(
		dataset, args.batch_size, collate_fn=collate,
		num_workers=args.num_workers,
		shuffle=True, drop_last=drop_last)

	val_dataset = load_dataset(
			args.dataset_style,
			val_dataset_root,
			args.val_image_set,
			val_transform)

	logging.info("Validation dataset size: {}".format(len(val_dataset)))

	val_loader = DataLoader(
		val_dataset, args.batch_size, collate_fn=collate,
		num_workers=args.num_workers,
		shuffle=False, drop_last=drop_last)

	logging.info("Building network")
	backbone_pretrained = args.backbone_pretrained
	net = arch.build(num_classes, backbone_pretrained, args.batch_size)

	if backbone_pretrained and args.backbone_weights is not None:
		logging.info(f"Load backbone weights from {args.backbone_weights}")
		timer.start("Loading backbone model")
		net.load_backbone_weights(args.backbone_weights)
		logging.info(f'Took {timer.end("Loading backbone model"):.2f}s.')

	if args.freeze_backbone:
		net.freeze_backbone()

	net.to(device)

	last_epoch = -1

	criterion = arch.loss(net, device)
	mapper = arch.mapper(net, device)

	optim_kwargs = {
		"lr": args.lr,
		"weight_decay": args.weight_decay
	}

	if args.optimizer == "sgd":
		optim_class = torch.optim.SGD
		optim_kwargs.update({
			"momentum": args.momentum
		})
	elif args.optimizer == "adamw":
		optim_class = torch.optim.AdamW
	elif args.optimizer == "diffgrad":
		optim_class = DiffGrad
	else:
		optim_class = Ranger

	optimizer = optim_class(net.parameters(), **optim_kwargs)
	logging.info(f"Optimizer parameters used: {optim_kwargs}")

	if args.scheduler == 'multi-step':
		logging.info("Uses MultiStepLR scheduler.")
		milestones = [int(v.strip()) for v in args.milestones.split(",")]
		scheduler = MultiStepLR(
			optimizer, milestones=milestones, gamma=0.1, last_epoch=last_epoch)
	else:
		logging.info("Uses Cosine annealing warm restarts scheduler.")
		scheduler = CosineAnnealingWarmRestarts(
			optimizer, T_0=args.t0, T_mult=args.t_mult, eta_min=1e-5)

	os.makedirs(args.checkpoint_path, exist_ok=True)

	logging.info(f"Start training from epoch {last_epoch + 1}.")
	for epoch in range(last_epoch + 1, args.num_epochs):
		loop(
			train_loader, net, mapper, criterion,
			optimizer, device=device, epoch=epoch)
		scheduler.step()

		if (epoch > 0 and epoch % args.val_epochs == 0 or
				epoch == args.num_epochs - 1):
			val_loss = loop(
				val_loader, net, mapper, criterion,
				device=device, epoch=epoch)

			filename = f"{arch.name}-Epoch-{epoch}-Loss-{val_loss}.pth"
			model_path = os.path.join(args.checkpoint_path, filename)
			save(arch, net, dataset.class_names, model_path)
			logging.info(f"Saved model {model_path}")
Example #2
generator = Generator()
discriminator = Discriminator()

generator_writer.add_graph(generator,
                           [torch.rand([1, 1, 16384], dtype=torch.float32)])
discriminator_writer.add_graph(discriminator, [
    torch.rand([1, 2, 16384], dtype=torch.float32),
    torch.rand([1, 2, 16384], dtype=torch.float32),
])

g_optimizer = Adam(generator.parameters(), cfg=cfg['hparas']['optim'])
d_optimizer = Adam(discriminator.parameters(), cfg=cfg['hparas']['optim'])

g_lr_change = CosineAnnealingWarmRestarts(optimizer=g_optimizer,
                                          T_0=10,
                                          T_mult=2,
                                          eta_min=0,
                                          last_epoch=-1)
d_lr_change = CosineAnnealingWarmRestarts(optimizer=d_optimizer,
                                          T_0=10,
                                          T_mult=2,
                                          eta_min=0,
                                          last_epoch=-1)

if not os.path.exists(SAVE_PATH):
    os.makedirs(SAVE_PATH)

if cfg['hparas']['train_continue']:
    print('loading models ...')
    generator.load_state_dict(
        torch.load(
Example #3
    #------------------------------------------------------#
    #   The backbone feature extractor's features are generic, so freezing
    #   it speeds up training and can also keep the weights from being
    #   damaged early in training.
    #   Init_Epoch is the starting epoch.
    #   Freeze_Epoch is the number of epochs trained with the backbone frozen.
    #   Epoch is the total number of training epochs.
    #   If you hit OOM or run out of GPU memory, reduce Batch_size.
    #------------------------------------------------------#

    if True:
        initial_lr = 1e-3
        Init_Epoch = 15
        Freeze_Epoch = 20
        optimizer = optim.SGD(net.parameters(), lr=initial_lr)
        if Cosine_lr:
            lr_scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=5)
        else:
            lr_scheduler = StepLR(optimizer, step_size=1, gamma=0.92)

        # Retrieve the hyperparameters stored in the runs
        runs = runBuilder.get_runs(parameters)
        print(runs)
        train_set = get_train_set(classes_path, 'train')
        train_loader = DataLoader(train_set,
                                  batch_size=runs[0].batch_size,
                                  num_workers=runs[0].num_workers,
                                  shuffle=runs[0].shuffle)
        print("data successfully loaded!")

        # Freeze part of the network for training
        print("start to freeze the backbone!")
Example #4
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--from_pretrained",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--output_dir",
        default="save",
        type=str,
        help=
        "The output directory where the model checkpoints will be written.",
    )
    parser.add_argument(
        "--config_file",
        default="config/bert_base_6layer_6conect.json",
        type=str,
        help="The config file which specified the model details.",
    )
    parser.add_argument(
        "--num_train_epochs",
        default=20,
        type=int,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--train_iter_multiplier",
        default=1.0,
        type=float,
        help="multiplier for the multi-task training.",
    )
    parser.add_argument(
        "--train_iter_gap",
        default=4,
        type=int,
        help=
        "forward every n iteration is the validation score is not improving over the last 3 epoch, -1 means will stop",
    )
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--do_lower_case",
        default=True,
        type=bool,
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models.",
    )
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="local_rank for distributed training on gpus",
    )
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="random seed for initialization")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass.",
    )
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit float precision instead of 32-bit",
    )
    parser.add_argument(
        "--loss_scale",
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n",
    )
    parser.add_argument(
        "--num_workers",
        type=int,
        default=4,
        help="Number of workers in the dataloader.",
    )
    parser.add_argument("--save_name",
                        default="",
                        type=str,
                        help="save name for training.")
    parser.add_argument(
        "--in_memory",
        default=False,
        type=bool,
        help="whether use chunck for parallel training.",
    )
    parser.add_argument("--optim",
                        default="AdamW",
                        type=str,
                        help="what to use for the optimization.")
    parser.add_argument("--tasks",
                        default="",
                        type=str,
                        help="1-2-3... training task separate by -")
    parser.add_argument(
        "--freeze",
        default=-1,
        type=int,
        help="till which layer of textual stream of vilbert need to fixed.",
    )
    parser.add_argument(
        "--vision_scratch",
        action="store_true",
        help="whether pre-trained the image or not.",
    )
    parser.add_argument("--evaluation_interval",
                        default=1,
                        type=int,
                        help="evaluate very n epoch.")
    parser.add_argument(
        "--lr_scheduler",
        default="mannul",
        type=str,
        help="whether use learning rate scheduler.",
    )
    parser.add_argument("--baseline",
                        action="store_true",
                        help="whether use single stream baseline.")
    parser.add_argument("--resume_file",
                        default="",
                        type=str,
                        help="Resume from checkpoint")
    parser.add_argument(
        "--dynamic_attention",
        action="store_true",
        help="whether use dynamic attention.",
    )
    parser.add_argument(
        "--clean_train_sets",
        default=True,
        type=bool,
        help="whether clean train sets for multitask data.",
    )
    parser.add_argument(
        "--visual_target",
        default=0,
        type=int,
        help="which target to use for visual branch. \
        0: soft label, \
        1: regress the feature, \
        2: NCE loss.",
    )
    parser.add_argument(
        "--task_specific_tokens",
        action="store_true",
        help="whether to use task specific tokens for the multi-task learning.",
    )

    args = parser.parse_args()
    with open("vilbert_tasks.yml", "r") as f:
        task_cfg = edict(yaml.safe_load(f))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if args.baseline:
        from pytorch_transformers.modeling_bert import BertConfig
        from vilbert.basebert import BaseBertForVLTasks
    else:
        from vilbert.vilbert import BertConfig
        from vilbert.vilbert import VILBertForVLTasks

    task_names = []
    task_lr = []
    for i, task_id in enumerate(args.tasks.split("-")):
        task = "TASK" + task_id
        name = task_cfg[task]["name"]
        task_names.append(name)
        task_lr.append(task_cfg[task]["lr"])

    base_lr = min(task_lr)
    loss_scale = {}
    for i, task_id in enumerate(args.tasks.split("-")):
        task = "TASK" + task_id
        loss_scale[task] = task_lr[i] / base_lr

    if args.save_name:
        prefix = "-" + args.save_name
    else:
        prefix = ""
    timeStamp = ("-".join(task_names) + "_" +
                 args.config_file.split("/")[1].split(".")[0] + prefix)
    savePath = os.path.join(args.output_dir, timeStamp)

    bert_weight_name = json.load(
        open("config/" + args.bert_model + "_weight_name.json", "r"))

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        torch.distributed.init_process_group(backend="nccl")

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    default_gpu = False
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True

    if default_gpu:
        if not os.path.exists(savePath):
            os.makedirs(savePath)

    config = BertConfig.from_json_file(args.config_file)
    if default_gpu:
        # save all the hidden parameters.
        with open(os.path.join(savePath, "command.txt"), "w") as f:
            print(args, file=f)  # Python 3.x
            print("\n", file=f)
            print(config, file=f)

    task_batch_size, task_num_iters, task_ids, task_datasets_train, task_datasets_val, task_dataloader_train, task_dataloader_val = LoadDatasets(
        args, task_cfg, args.tasks.split("-"))

    logdir = os.path.join(savePath, "logs")
    tbLogger = utils.tbLogger(
        logdir,
        savePath,
        task_names,
        task_ids,
        task_num_iters,
        args.gradient_accumulation_steps,
    )

    if args.visual_target == 0:
        config.v_target_size = 1601
        config.visual_target = args.visual_target
    else:
        config.v_target_size = 2048
        config.visual_target = args.visual_target

    if args.task_specific_tokens:
        config.task_specific_tokens = True

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_ave_iter = {}
    task_stop_controller = {}
    for task_id, num_iter in task_num_iters.items():
        task_ave_iter[task_id] = int(task_cfg[task_id]["num_epoch"] * num_iter *
                                     args.train_iter_multiplier /
                                     args.num_train_epochs)
        task_stop_controller[task_id] = utils.MultiTaskStopOnPlateau(
            mode="max",
            patience=1,
            continue_threshold=0.005,
            cooldown=1,
            threshold=0.001,
        )

    task_ave_iter_list = sorted(task_ave_iter.values())
    median_num_iter = task_ave_iter_list[-1]
    num_train_optimization_steps = (median_num_iter * args.num_train_epochs //
                                    args.gradient_accumulation_steps)
    num_labels = max(
        [dataset.num_labels for dataset in task_datasets_train.values()])

    if args.dynamic_attention:
        config.dynamic_attention = True
    if "roberta" in args.bert_model:
        config.model = "roberta"

    if args.baseline:
        model = BaseBertForVLTasks.from_pretrained(
            args.from_pretrained,
            config=config,
            num_labels=num_labels,
            default_gpu=default_gpu,
        )
    else:
        model = VILBertForVLTasks.from_pretrained(
            args.from_pretrained,
            config=config,
            num_labels=num_labels,
            default_gpu=default_gpu,
        )

    task_losses = LoadLosses(args, task_cfg, args.tasks.split("-"))

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    if args.freeze != -1:
        bert_weight_name_filtered = []
        for name in bert_weight_name:
            if "embeddings" in name:
                bert_weight_name_filtered.append(name)
            elif "encoder" in name:
                layer_num = name.split(".")[2]
                if int(layer_num) <= args.freeze:
                    bert_weight_name_filtered.append(name)

        optimizer_grouped_parameters = []
        for key, value in dict(model.named_parameters()).items():
            if key[12:] in bert_weight_name_filtered:
                value.requires_grad = False

        if default_gpu:
            print("filtered weight")
            print(bert_weight_name_filtered)

    optimizer_grouped_parameters = []
    for key, value in dict(model.named_parameters()).items():
        if value.requires_grad:
            if "vil_" in key:
                lr = 1e-4
            else:
                if args.vision_scratch:
                    if key[12:] in bert_weight_name:
                        lr = base_lr
                    else:
                        lr = 1e-4
                else:
                    lr = base_lr
            if any(nd in key for nd in no_decay):
                optimizer_grouped_parameters += [{
                    "params": [value],
                    "lr": lr,
                    "weight_decay": 0.0
                }]
            if not any(nd in key for nd in no_decay):
                optimizer_grouped_parameters += [{
                    "params": [value],
                    "lr": lr,
                    "weight_decay": 0.01
                }]

    if default_gpu:
        print(len(list(model.named_parameters())),
              len(optimizer_grouped_parameters))

    if args.optim == "AdamW":
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=base_lr,
                          correct_bias=False)
    elif args.optim == "RAdam":
        optimizer = RAdam(optimizer_grouped_parameters, lr=base_lr)

    warmpu_steps = args.warmup_proportion * num_train_optimization_steps

    if args.lr_scheduler == "warmup_linear":
        warmup_scheduler = WarmupLinearSchedule(
            optimizer,
            warmup_steps=warmpu_steps,
            t_total=num_train_optimization_steps)
    else:
        warmup_scheduler = WarmupConstantSchedule(optimizer,
                                                  warmup_steps=warmpu_steps)

    lr_reduce_list = np.array([5, 7])
    if args.lr_scheduler == "automatic":
        lr_scheduler = ReduceLROnPlateau(optimizer,
                                         mode="max",
                                         factor=0.2,
                                         patience=1,
                                         cooldown=1,
                                         threshold=0.001)
    elif args.lr_scheduler == "cosine":
        lr_scheduler = CosineAnnealingLR(optimizer,
                                         T_max=median_num_iter *
                                         args.num_train_epochs)
    elif args.lr_scheduler == "cosine_warm":
        lr_scheduler = CosineAnnealingWarmRestarts(optimizer,
                                                   T_0=median_num_iter *
                                                   args.num_train_epochs)
    elif args.lr_scheduler == "mannul":

        def lr_lambda_fun(epoch):
            return pow(0.2, np.sum(lr_reduce_list <= epoch))

        lr_scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda_fun)

    startIterID = 0
    global_step = 0
    start_epoch = 0

    if args.resume_file != "" and os.path.exists(args.resume_file):
        checkpoint = torch.load(args.resume_file, map_location="cpu")
        new_dict = {}
        for attr in checkpoint["model_state_dict"]:
            if attr.startswith("module."):
                new_dict[attr.replace(
                    "module.", "", 1)] = checkpoint["model_state_dict"][attr]
            else:
                new_dict[attr] = checkpoint["model_state_dict"][attr]
        model.load_state_dict(new_dict)
        warmup_scheduler.load_state_dict(
            checkpoint["warmup_scheduler_state_dict"])
        # lr_scheduler.load_state_dict(checkpoint['lr_scheduler_state_dict'])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        global_step = checkpoint["global_step"]
        start_epoch = int(checkpoint["epoch_id"]) + 1
        task_stop_controller = checkpoint["task_stop_controller"]
        tbLogger = checkpoint["tb_logger"]
        del checkpoint

    model.to(device)

    for state in optimizer.state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                state[k] = v.cuda()

    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model, delay_allreduce=True)

    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if default_gpu:
        print("***** Running training *****")
        print("  Num Iters: ", task_num_iters)
        print("  Batch size: ", task_batch_size)
        print("  Num steps: %d" % num_train_optimization_steps)

    task_iter_train = {name: None for name in task_ids}
    task_count = {name: 0 for name in task_ids}
    for epochId in tqdm(range(start_epoch, args.num_train_epochs),
                        desc="Epoch"):
        model.train()
        for step in range(median_num_iter):
            iterId = startIterID + step + (epochId * median_num_iter)
            first_task = True
            for task_id in task_ids:
                is_forward = False
                if (not task_stop_controller[task_id].in_stop) or (
                        iterId % args.train_iter_gap == 0):
                    is_forward = True

                if is_forward:
                    loss, score = ForwardModelsTrain(
                        args,
                        task_cfg,
                        device,
                        task_id,
                        task_count,
                        task_iter_train,
                        task_dataloader_train,
                        model,
                        task_losses,
                    )

                    loss = loss * loss_scale[task_id]
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps

                    loss.backward()
                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        if args.fp16:
                            lr_this_step = args.learning_rate * warmup_linear(
                                global_step / num_train_optimization_steps,
                                args.warmup_proportion,
                            )
                            for param_group in optimizer.param_groups:
                                param_group["lr"] = lr_this_step

                        if first_task and (global_step < warmpu_steps
                                           or args.lr_scheduler
                                           == "warmup_linear"):
                            warmup_scheduler.step()

                        optimizer.step()
                        model.zero_grad()
                        if first_task:
                            global_step += 1
                            first_task = False

                        if default_gpu:
                            tbLogger.step_train(
                                epochId,
                                iterId,
                                float(loss),
                                float(score),
                                optimizer.param_groups[0]["lr"],
                                task_id,
                                "train",
                            )

            if "cosine" in args.lr_scheduler and global_step > warmpu_steps:
                lr_scheduler.step()

            if (step % (20 * args.gradient_accumulation_steps) == 0
                    and step != 0 and default_gpu):
                tbLogger.showLossTrain()

            # decide whether to evaluate on each task.
            for task_id in task_ids:
                if (iterId != 0 and iterId % task_num_iters[task_id]
                        == 0) or (epochId == args.num_train_epochs - 1
                                  and step == median_num_iter - 1):
                    evaluate(
                        args,
                        task_dataloader_val,
                        task_stop_controller,
                        task_cfg,
                        device,
                        task_id,
                        model,
                        task_losses,
                        epochId,
                        default_gpu,
                        tbLogger,
                    )

        if args.lr_scheduler == "automatic":
            lr_scheduler.step(sum(val_scores.values()))
            logger.info("best average score is %3f" % lr_scheduler.best)
        elif args.lr_scheduler == "mannul":
            lr_scheduler.step()

        if epochId in lr_reduce_list:
            for task_id in task_ids:
                # reset the task_stop_controller once the lr drop
                task_stop_controller[task_id]._reset()

        if default_gpu:
            # Save a trained model
            logger.info("** ** * Saving fine - tuned model ** ** * ")
            model_to_save = (
                model.module if hasattr(model, "module") else model
            )  # Only save the model it-self
            output_model_file = os.path.join(
                savePath, "pytorch_model_" + str(epochId) + ".bin")
            output_checkpoint = os.path.join(savePath,
                                             "pytorch_ckpt_latest.tar")
            torch.save(model_to_save.state_dict(), output_model_file)
            torch.save(
                {
                    "model_state_dict": model_to_save.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "warmup_scheduler_state_dict":
                    warmup_scheduler.state_dict(),
                    # 'lr_scheduler_state_dict': lr_scheduler.state_dict(),
                    "global_step": global_step,
                    "epoch_id": epochId,
                    "task_stop_controller": task_stop_controller,
                    "tb_logger": tbLogger,
                },
                output_checkpoint,
            )
    tbLogger.txt_close()
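
A compact sketch of the pattern behind the 'cosine_warm' branch above: warm the learning rate up for the first fraction of the run, then step CosineAnnealingWarmRestarts once per optimizer update with T_0 set to the total number of update steps. The model, step counts and warmup fraction below are illustrative, not taken from the example.

import torch
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

model = torch.nn.Linear(8, 2)
base_lr = 4e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=base_lr)

total_steps = 1000
warmup_steps = int(0.1 * total_steps)  # stands in for warmup_proportion
cosine = CosineAnnealingWarmRestarts(optimizer, T_0=total_steps)

for step in range(total_steps):
    loss = model(torch.randn(4, 8)).sum()
    loss.backward()
    if step < warmup_steps:
        # simple linear warmup in place of the WarmupLinear/Constant schedules
        for group in optimizer.param_groups:
            group["lr"] = base_lr * (step + 1) / warmup_steps
    optimizer.step()
    optimizer.zero_grad()
    if step >= warmup_steps:
        cosine.step()  # per-update cosine annealing after warmup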
Example #5
def run_training(data_type="screw",
                 model_dir="models",
                 epochs=256,
                 pretrained=True,
                 test_epochs=10,
                 freeze_resnet=20,
                 learninig_rate=0.03,
                 optim_name="SGD",
                 batch_size=64,
                 head_layer=8):
    torch.multiprocessing.freeze_support()
    # TODO: use script params for hyperparameter
    # Temperature Hyperparameter currently not used
    temperature = 0.2
    device = "cuda"

    weight_decay = 0.00003
    momentum = 0.9
    #TODO: use f strings also for the date LOL
    model_name = f"model-{data_type}" + '-{date:%Y-%m-%d_%H_%M_%S}'.format(
        date=datetime.datetime.now())

    #augmentation:
    size = 256
    min_scale = 0.5

    # create Training Dataset and Dataloader
    after_cutpaste_transform = transforms.Compose([])
    after_cutpaste_transform.transforms.append(transforms.ToTensor())
    after_cutpaste_transform.transforms.append(
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]))

    train_transform = transforms.Compose([])
    # train_transform.transforms.append(transforms.RandomResizedCrop(size, scale=(min_scale,1)))
    # train_transform.transforms.append(transforms.GaussianBlur(int(size/10), sigma=(0.1,2.0)))
    train_transform.transforms.append(transforms.Resize((256, 256)))
    train_transform.transforms.append(
        CutPaste(transform=after_cutpaste_transform))
    # train_transform.transforms.append(transforms.ToTensor())

    train_data = MVTecAT("Data",
                         data_type,
                         transform=train_transform,
                         size=int(size * (1 / min_scale)))
    dataloader = DataLoader(train_data,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=8,
                            collate_fn=cut_paste_collate_fn,
                            persistent_workers=True,
                            pin_memory=True,
                            prefetch_factor=5)

    # Writer will output to ./runs/ directory by default
    writer = SummaryWriter(Path("logdirs") / model_name)

    # create Model:
    head_layers = [512] * head_layer + [128]
    print(head_layers)
    model = ProjectionNet(pretrained=pretrained, head_layers=head_layers)
    model.to(device)

    if freeze_resnet > 0:
        model.freeze_resnet()

    loss_fn = torch.nn.CrossEntropyLoss()
    if optim_name == "sgd":
        optimizer = optim.SGD(model.parameters(),
                              lr=learninig_rate,
                              momentum=momentum,
                              weight_decay=weight_decay)
        scheduler = CosineAnnealingWarmRestarts(optimizer, epochs)
        #scheduler = None
    elif optim_name == "adam":
        optimizer = optim.Adam(model.parameters(),
                               lr=learninig_rate,
                               weight_decay=weight_decay)
        scheduler = None
    else:
        print(f"ERROR unkown optimizer: {optim_name}")

    step = 0
    import torch.autograd.profiler as profiler
    num_batches = len(dataloader)

    def get_data_inf():
        while True:
            for out in enumerate(dataloader):
                yield out

    dataloader_inf = get_data_inf()
    # From paper: "Note that, unlike conventional definition for an epoch,
    #              we define 256 parameter update steps as one epoch."
    for step in tqdm(range(epochs * 256)):
        epoch = int(step / 256)
        if epoch == freeze_resnet:
            model.unfreeze()

        batch_embeds = []
        batch_idx, data = next(dataloader_inf)
        x1, x2 = data
        x1 = x1.to(device)
        x2 = x2.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        xc = torch.cat((x1, x2), axis=0)
        embeds, logits = model(xc)

        #         embeds = F.normalize(embeds, p=2, dim=1)
        #         embeds1, embeds2 = torch.split(embeds,x1.size(0),dim=0)
        #         ip = torch.matmul(embeds1, embeds2.T)
        #         ip = ip / temperature

        #         y = torch.arange(0,x1.size(0), device=device)
        #         loss = loss_fn(ip, torch.arange(0,x1.size(0), device=device))

        y = torch.tensor([0, 1], device=device)
        y = y.repeat_interleave(x1.size(0))
        loss = loss_fn(logits, y)

        # regulize weights:
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step(epoch + batch_idx / num_batches)

        writer.add_scalar('loss', loss.item(), step)

        #         predicted = torch.argmax(ip,axis=0)
        predicted = torch.argmax(logits, axis=1)
        #         print(logits)
        #         print(predicted)
        #         print(y)
        accuracy = torch.true_divide(torch.sum(predicted == y),
                                     predicted.size(0))
        writer.add_scalar('acc', accuracy, step)
        if scheduler is not None:
            writer.add_scalar('lr', scheduler.get_last_lr()[0], step)

        # save embed for validation:
        if test_epochs > 0 and epoch % test_epochs == 0:
            batch_embeds.append(embeds.cpu().detach())

        writer.add_scalar('epoch', epoch, step)

        # run tests
        if test_epochs > 0 and epoch % test_epochs == 0:
            # run auc calculation
            #TODO: create dataset only once.
            #TODO: train predictor here or in the model class itself. Should not be in the eval part
            #TODO: we might not want to use the training data because of dropout etc., but it should give an indication of the model performance
            # batch_embeds = torch.cat(batch_embeds)
            # print(batch_embeds.shape)
            model.eval()
            roc_auc = eval_model(model_name,
                                 data_type,
                                 device=device,
                                 save_plots=False,
                                 size=size,
                                 show_training_data=False,
                                 model=model)
            #train_embed=batch_embeds)
            model.train()
            writer.add_scalar('eval_auc', roc_auc, step)

    torch.save(model.state_dict(), Path(model_dir) / f"{model_name}.tch")
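
The call scheduler.step(epoch + batch_idx / num_batches) above uses the fractional-epoch form that CosineAnnealingWarmRestarts supports, so the learning rate is annealed within an epoch rather than only at epoch boundaries. A minimal sketch with illustrative sizes:

import torch
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.03, momentum=0.9)
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=10)  # first cycle: 10 "epochs"

num_epochs, num_batches = 3, 5
for epoch in range(num_epochs):
    for batch_idx in range(num_batches):
        loss = model(torch.randn(8, 4)).sum()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # e.g. epoch=1, batch_idx=2 -> 1.4; the schedule advances smoothly
        scheduler.step(epoch + batch_idx / num_batches)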
Example #6
def main():
    args = get_args()

    # archLoader
    arch_loader = ArchLoader(args.path)

    # Log
    log_format = '[%(asctime)s] %(message)s'
    logging.basicConfig(stream=sys.stdout,
                        level=logging.INFO,
                        format=log_format,
                        datefmt='%m-%d %I:%M:%S')
    t = time.time()
    local_time = time.localtime(t)
    if not os.path.exists('./log'):
        os.mkdir('./log')
    fh = logging.FileHandler(
        os.path.join('log/train-{}-{:02}-{:02}-{:.3f}'.format(
            local_time.tm_year % 2000, local_time.tm_mon, local_time.tm_mday,
            t)))
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)

    use_gpu = False
    if torch.cuda.is_available():
        use_gpu = True

    train_dataset, val_dataset = get_dataset('cifar100')

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.num_workers,
                                               pin_memory=True)

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.num_workers,
                                             pin_memory=True)

    model = mutableResNet20()

    logging.info('load model successfully')

    optimizer = torch.optim.SGD(get_parameters(model),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    criterion_smooth = CrossEntropyLabelSmooth(1000, 0.1)

    if use_gpu:
        model = nn.DataParallel(model)
        loss_function = criterion_smooth.cuda()
        device = torch.device("cuda")
    else:
        loss_function = criterion_smooth
        device = torch.device("cpu")

    # scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,
    #                                               lambda step: (1.0-step/args.total_iters) if step <= args.total_iters else 0, last_epoch=-1)
    scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=5)
    # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    #     optimizer, T_max=200)

    model = model.to(device)

    all_iters = 0

    if args.auto_continue:  # resume automatically?
        lastest_model, iters = get_lastest_model()
        if lastest_model is not None:
            all_iters = iters
            checkpoint = torch.load(lastest_model,
                                    map_location=None if use_gpu else 'cpu')
            model.load_state_dict(checkpoint['state_dict'], strict=True)
            logging.info('load from checkpoint')
            for i in range(iters):
                scheduler.step()

    # parameter setup
    args.optimizer = optimizer
    args.loss_function = loss_function
    args.scheduler = scheduler
    args.train_loader = train_loader
    args.val_loader = val_loader

    if args.eval:
        if args.eval_resume is not None:
            checkpoint = torch.load(args.eval_resume,
                                    map_location=None if use_gpu else 'cpu')
            model.load_state_dict(checkpoint, strict=True)
            validate(model,
                     device,
                     args,
                     all_iters=all_iters,
                     arch_loader=arch_loader)
        exit(0)

    # warmup weights
    if args.warmup > 0:
        logging.info("begin warmup weights")
        while all_iters < args.warmup:
            all_iters = train_supernet(model,
                                       device,
                                       args,
                                       bn_process=False,
                                       all_iters=all_iters)

        validate(model,
                 device,
                 args,
                 all_iters=all_iters,
                 arch_loader=arch_loader)

    while all_iters < args.total_iters:
        logging.info("=" * 50)
        all_iters = train_subnet(model,
                                 device,
                                 args,
                                 bn_process=False,
                                 all_iters=all_iters,
                                 arch_loader=arch_loader)

        if all_iters % 200 == 0:
            logging.info("validate iter {}".format(all_iters))

            validate(model,
                     device,
                     args,
                     all_iters=all_iters,
                     arch_loader=arch_loader)
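
The resume branch above fast-forwards the schedule by calling scheduler.step() once per already-completed iteration. An alternative sketch (the checkpoint keys below are hypothetical, not the ones this script writes) is to checkpoint and restore the scheduler state directly:

import torch
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=5)

# saving: store the scheduler state next to the weights
torch.save({'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict()}, 'checkpoint.pth.tar')

# resuming: restoring the state replaces the replayed step() loop
checkpoint = torch.load('checkpoint.pth.tar', map_location='cpu')
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])
scheduler.load_state_dict(checkpoint['scheduler'])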
Example #7
class Learner:
    def __init__(self, model, train_loader, valid_loader, config):
        self.config = config
        self.train_loader = train_loader
        self.valid_loader = valid_loader
        self.model = model.to(self.config.device)

        self.logger = init_logger(self.config.log_dir, 'train_main.log')
        self.tb_logger = init_tb_logger(self.config.log_dir, 'train_main')
        self.log('\n'.join(
            [f"{k} = {v}" for k, v in self.config.__dict__.items()]))

        self.summary_loss = AverageMeter()
        self.evaluator = Evaluator()

        self.criterion = torch.nn.CrossEntropyLoss(
            ignore_index=self.config.ignore_index)
        self.u_criterion = torch.nn.CrossEntropyLoss(
            ignore_index=self.config.ignore_index)
        train_params = [{
            'params': getattr(model, 'encoder').parameters(),
            'lr': self.config.lr
        }, {
            'params': getattr(model, 'decoder').parameters(),
            'lr': self.config.lr * 10
        }]
        self.optimizer = RAdam(train_params,
                               weight_decay=self.config.weight_decay)

        self.scheduler = CosineAnnealingWarmRestarts(self.optimizer,
                                                     T_0=2,
                                                     T_mult=2,
                                                     eta_min=1e-6)

        self.n_ensemble = 0
        self.epoch = 0
        self.best_epoch = 0
        self.best_loss = np.inf
        self.best_score = -np.inf

    def train_one_epoch(self):
        self.model.train()
        self.summary_loss.reset()
        iters = len(self.train_loader)
        for step, (images, scribbles, weights) in enumerate(self.train_loader):
            self.tb_logger.add_scalar('Train/lr',
                                      self.optimizer.param_groups[0]['lr'],
                                      iters * self.epoch + step)
            scribbles = scribbles.to(self.config.device).long()
            images = images.to(self.config.device)
            batch_size = images.shape[0]

            self.optimizer.zero_grad()
            outputs = self.model(images)
            if self.epoch < self.config.thr_epoch:
                loss = self.criterion(outputs, scribbles)
            else:
                x_loss = self.criterion(outputs, scribbles)

                scribbles = scribbles.cpu()
                mean = weights[..., 0]
                u_labels = torch.where(
                    ((mean < (1 - self.config.thr_conf)) |
                     (mean > self.config.thr_conf)) &
                    (scribbles == self.config.ignore_index),
                    mean.round().long(),
                    self.config.ignore_index * torch.ones_like(scribbles)).to(
                        self.config.device)
                u_loss = self.u_criterion(outputs, u_labels)
                loss = x_loss + 0.5 * u_loss

            loss.backward()
            self.summary_loss.update(loss.detach().item(), batch_size)
            self.optimizer.step()
            if self.scheduler.__class__.__name__ != 'ReduceLROnPlateau':
                self.scheduler.step()

        return self.summary_loss.avg

    def validation(self):
        self.model.eval()
        self.summary_loss.reset()
        self.evaluator.reset()
        for step, (_, images, _, targets) in enumerate(self.valid_loader):
            with torch.no_grad():
                targets = targets.to(self.config.device).long()
                batch_size = images.shape[0]
                images = images.to(self.config.device)
                outputs = self.model(images)
                loss = self.criterion(outputs, targets)

                targets = targets.cpu().numpy()
                outputs = torch.argmax(outputs, dim=1)
                outputs = outputs.data.cpu().numpy()
                self.evaluator.add_batch(targets, outputs)
                self.summary_loss.update(loss.detach().item(), batch_size)

        if self.scheduler.__class__.__name__ == 'ReduceLROnPlateau':
            self.scheduler.step(self.evaluator.IoU)
        return self.summary_loss.avg, self.evaluator.IoU

    def ensemble_prediction(self):
        ds = self.train_loader.dataset
        transforms = Compose([Normalize(), ToTensorV2()])
        for idx, images in tqdm(ds.images.items(), total=len(ds)):
            augmented = transforms(image=images['image'])
            img = augmented['image'].unsqueeze(0).to(self.config.device)
            with torch.no_grad():
                pred = torch.nn.functional.softmax(self.model(img), dim=1)
            weight = torch.tensor(images['weight'])
            pred = pred.squeeze(0).cpu()
            x = pred[1]
            weight[..., 0] = self.config.alpha * x + (
                1 - self.config.alpha) * weight[..., 0]
            self.train_loader.dataset.images[idx]['weight'] = weight.numpy()
        self.n_ensemble += 1

    def fit(self, epochs):
        for e in range(epochs):
            t = time.time()
            loss = self.train_one_epoch()

            self.log(
                f'[Train] \t Epoch: {self.epoch}, loss: {loss:.5f}, time: {(time.time() - t):.2f}'
            )
            self.tb_log(loss, None, 'Train', self.epoch)

            t = time.time()
            loss, score = self.validation()

            self.log(
                f'[Valid] \t Epoch: {self.epoch}, loss: {loss:.5f}, IoU: {score:.4f}, time: {(time.time() - t):.2f}'
            )
            self.tb_log(loss, score, 'Valid', self.epoch)
            self.post_processing(loss, score)

            if (self.epoch + 1) % self.config.period_epoch == 0:
                self.log(
                    f'[Ensemble] \t the {self.n_ensemble}th Prediction Ensemble ...'
                )
                self.ensemble_prediction()

            self.epoch += 1
        self.log(
            f'best epoch: {self.best_epoch}, best loss: {self.best_loss}, best_score: {self.best_score}'
        )

    def post_processing(self, loss, score):
        if loss < self.best_loss:
            self.best_loss = loss

        if score > self.best_score:
            self.best_score = score
            self.best_epoch = self.epoch

            self.model.eval()
            torch.save(
                {
                    'model_state_dict': self.model.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'scheduler_state_dict': self.scheduler.state_dict(),
                    'best_score': self.best_score,
                    'epoch': self.epoch,
                }, f'{os.path.join(self.config.log_dir, "best_model.pth")}')
            self.log(f'best model: {self.epoch} epoch - {score:.4f}')

    def load(self, path):
        checkpoint = torch.load(path)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        self.best_score = checkpoint['best_score']
        self.epoch = checkpoint['epoch'] + 1

    def log(self, text):
        self.logger.info(text)

    def tb_log(self, loss, IoU, split, step):
        if loss: self.tb_logger.add_scalar(f'{split}/Loss', loss, step)
        if IoU: self.tb_logger.add_scalar(f'{split}/IoU', IoU, step)
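
In train_one_epoch() above, scheduler.step() is called once per batch with no epoch argument, so the T_0=2, T_mult=2 cycle lengths are counted in step() calls rather than in dataset epochs. A minimal sketch of that behaviour with an illustrative optimizer:

import torch
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

optimizer = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=1e-3)
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=2, T_mult=2, eta_min=1e-6)

for call in range(14):
    scheduler.step()
    # cycle lengths are 2, 4, 8, ... step() calls (restarts after calls 2, 6, 14)
    print(call, scheduler.get_last_lr()[0])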
Example #8
class CreateModel(nn.Module):
    def __init__(self, args, class_num):
        super(CreateModel, self).__init__()
        self.args = args
        self.feature_dim = args.feature_dim
        self.device = args.device
        self.gpu_ids = args.gpu_ids
        ## Backbone
        if 'spherenet' in args.backbone:
            num_layers = int(args.backbone.split('spherenet')[-1])
            self.backbone = getattr(networks, 'spherenet')(
                num_layers, args.feature_dim, args.image_size,
                args.double_depth, args.use_batchnorm, args.use_pool,
                args.use_dropout)
        elif 'mobilenet' in args.backbone:
            self.backbone = getattr(networks, 'MobileNetV2')(args.feature_dim)
        else:
            self.backbone = getattr(networks, args.backbone)(args.feature_dim,
                                                             args.use_pool,
                                                             args.use_dropout)
        self.backbone.to(self.device)

        ## Objective function
        self.criterion = getattr(losses, self.args.loss_type)
        self.criterion = self.criterion(class_num, self.args)
        self.criterion.to(self.device)

        self.model_names = ['backbone', 'criterion']
        self.state_names = ['loss_ce', 'acc', 'lr']

    def train_setup(self):
        ## Setup nn.DataParallel if necessary
        if self.device.type != 'cpu':
            if len(self.gpu_ids) > 1:
                self.backbone = nn.DataParallel(self.backbone)

        ## Setup optimizer
        self.lr = self.args.lr
        self.save_dir = os.path.join(self.args.checkpoints_dir, self.args.name)
        params = list(self.backbone.parameters()) + list(
            self.criterion.parameters())
        self.optimizer = optim.SGD(params,
                                   lr=self.args.lr,
                                   momentum=0.9,
                                   weight_decay=5e-4)
        #         self.scheduler = MultiStepLR(self.optimizer, milestones=self.args.decay_steps, gamma=0.5)
        #         self.scheduler = CosineAnnealingLR(self.optimizer, self.args.epochs)
        self.scheduler = CosineAnnealingWarmRestarts(self.optimizer, 20, 1)

        ## Weight initialization
        self.backbone.apply(weights_init)
        self.criterion.apply(weights_init)

        ## Switch to training mode
        self.train()

    def update_learning_rate(self):
        self.scheduler.step()
        self.lr = self.optimizer.param_groups[0]['lr']

    def optimize_parameters(self, input, target):
        # input, target = data
        input, target = input.to(self.device), target.to(self.device)
        self.score, self.loss_ce = self.forward(input, target)
        self.optimizer.zero_grad()
        self.loss_ce.backward()
        self.optimizer.step()

        _, pred_labels = torch.max(F.softmax(self.score, dim=1), 1)
        self.acc = torch.sum(torch.eq(pred_labels,
                                      target.view(-1))).item() / len(target)

    def get_current_states(self):
        errors_ret = OrderedDict()
        for name in self.state_names:
            if isinstance(name, str):
                # float(...) works for both scalar tensor and float number
                errors_ret[name] = float(getattr(self, name))
        return errors_ret

    def save_networks(self, which_epoch):
        for name in self.model_names:
            if isinstance(name, str):
                save_filename = '%s_net_%s.pth' % (which_epoch, name)
                save_path = os.path.join(self.save_dir, save_filename)
                net = getattr(self, name)

                if self.gpu_ids and torch.cuda.is_available():
                    try:
                        torch.save(net.module.cpu().state_dict(), save_path)
                    except:
                        torch.save(net.cpu().state_dict(), save_path)
                else:
                    torch.save(net.cpu().state_dict(), save_path)
            net.to(self.device)

    def forward(self, input, target=None, is_feature=False):
        features = self.backbone(input)
        if is_feature:
            return features
        else:
            return self.criterion(features, target)

    def eval(self):
        for name in self.model_names:
            try:
                if isinstance(name, str):
                    getattr(self, name).eval()
            except:
                print('{}.eval() cannot be implemented as {} does not exist.'.
                      format(name, name))

    def train(self):
        for name in self.model_names:
            try:
                if isinstance(name, str):
                    getattr(self, name).train()
            except:
                print('{}.train() cannot be implemented as {} does not exist.'.
                      format(name, name))
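
For reference, the positional call in train_setup() above, CosineAnnealingWarmRestarts(self.optimizer, 20, 1), is equivalent to T_0=20, T_mult=1: the learning rate restarts every 20 scheduler steps, which is every 20 epochs if update_learning_rate() is called once per epoch. A short sketch with an illustrative optimizer:

import torch
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

optimizer = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
# positional form as used above; equivalent to T_0=20, T_mult=1
scheduler = CosineAnnealingWarmRestarts(optimizer, 20, 1)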
Example #9
def main():
    args = get_args()
    num_gpus = torch.cuda.device_count()
    args.gpu = args.local_rank % num_gpus
    torch.cuda.set_device(args.gpu)

    torch.distributed.init_process_group(backend='nccl', init_method='env://')
    args.world_size = torch.distributed.get_world_size()
    args.batch_size = args.batch_size // args.world_size

    # archLoader
    arch_loader = ArchLoader(args.path)

    # Log
    log_format = '[%(asctime)s] %(message)s'
    logging.basicConfig(stream=sys.stdout,
                        level=logging.INFO,
                        format=log_format,
                        datefmt='%m-%d %I:%M:%S')
    t = time.time()
    local_time = time.localtime(t)
    if not os.path.exists('./log'):
        os.mkdir('./log')
    fh = logging.FileHandler(
        os.path.join('log/train-{}-{:02}-{:02}-{:.3f}'.format(
            local_time.tm_year % 2000, local_time.tm_mon, local_time.tm_mday,
            t)))
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)

    use_gpu = False
    if torch.cuda.is_available():
        use_gpu = True

    train_loader = get_train_loader(args.batch_size, args.local_rank,
                                    args.num_workers, args.total_iters)

    val_loader = get_val_loader(args.batch_size, args.num_workers)

    model = mutableResNet20()

    logging.info('load model successfully')

    optimizer = torch.optim.SGD(get_parameters(model),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    criterion_smooth = CrossEntropyLabelSmooth(1000, 0.1)

    if use_gpu:
        # model = nn.DataParallel(model)
        model = model.cuda(args.gpu)
        model = nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)
        loss_function = criterion_smooth.cuda()
    else:
        loss_function = criterion_smooth

    scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=5)

    all_iters = 0

    if args.auto_continue:  # resume automatically?
        lastest_model, iters = get_lastest_model()
        if lastest_model is not None:
            all_iters = iters
            checkpoint = torch.load(lastest_model,
                                    map_location=None if use_gpu else 'cpu')
            model.load_state_dict(checkpoint['state_dict'], strict=True)
            logging.info('load from checkpoint')
            for i in range(iters):
                scheduler.step()

    # parameter setup
    args.optimizer = optimizer
    args.loss_function = loss_function
    args.scheduler = scheduler
    args.train_loader = train_loader
    args.val_loader = val_loader

    if args.eval:
        if args.eval_resume is not None:
            checkpoint = torch.load(args.eval_resume,
                                    map_location=None if use_gpu else 'cpu')
            model.load_state_dict(checkpoint, strict=True)
            validate(model, args, all_iters=all_iters, arch_loader=arch_loader)
        exit(0)

    # warmup weights
    if args.warmup > 0:
        logging.info("begin warmup weights")
        while all_iters < args.warmup:
            all_iters = train_supernet(model,
                                       args,
                                       bn_process=False,
                                       all_iters=all_iters)

        validate(model, args, all_iters=all_iters, arch_loader=arch_loader)

    while all_iters < args.total_iters:
        logging.info("=" * 50)
        all_iters = train_subnet(model,
                                 args,
                                 bn_process=False,
                                 all_iters=all_iters,
                                 arch_loader=arch_loader)

        if all_iters % 200 == 0 and args.local_rank == 0:
            logging.info("validate iter {}".format(all_iters))

            validate(model, args, all_iters=all_iters, arch_loader=arch_loader)
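
Not from the original script: as an alternative to replaying scheduler.step() in a loop when resuming, the scheduler state can be checkpointed and restored directly via its state_dict. A minimal sketch with hypothetical helper names (save_checkpoint / load_checkpoint):

import torch

def save_checkpoint(path, model, optimizer, scheduler, iters):
    # scheduler.state_dict() records last_epoch / T_cur, so no replay is needed on resume.
    torch.save({
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'scheduler': scheduler.state_dict(),
        'iters': iters,
    }, path)

def load_checkpoint(path, model, optimizer, scheduler, device='cpu'):
    checkpoint = torch.load(path, map_location=device)
    model.load_state_dict(checkpoint['state_dict'], strict=True)
    optimizer.load_state_dict(checkpoint['optimizer'])
    scheduler.load_state_dict(checkpoint['scheduler'])
    return checkpoint['iters']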
Example #10
0
        freeze_model.load_state_dict(torch.load(pretrained_weights),
                                     strict=False)

        # initialize the optimizer
        fre_optimizer = torch.optim.Adam(freeze_model.parameters(),
                                         lr=warmup_lr,
                                         weight_decay=TRAIN["WEIGHT_DECAY"])

        # learning-rate schedule: cosine annealing
        if freeze_lr == 'cosineAnn':
            fre_scheduler = CosineAnnealingLR(fre_optimizer,
                                              T_max=5,
                                              eta_min=0)
        elif freeze_lr == 'cosineAnnWarm':
            fre_scheduler = CosineAnnealingWarmRestarts(fre_optimizer,
                                                        T_0=freeze_epochs,
                                                        T_mult=1)
        elif freeze_lr == 'steplr':
            fre_scheduler = StepLR(
                fre_optimizer,
                step_size=(freeze_epochs * (len(frozen_dataloader) - 2)) // 2,
                gamma=0.1)

        for epoch in range(freeze_epochs):

            # mloss = torch.zeros(1).to(device)
            mloss = 0.
            val_loss = 0.

            freeze_model.train()
            start_time = time.time()
Example #11
0
                                      weight_decay=args.weight_decay)

        if args.find_lr:
            lr_finder = LRFinder(model, optimizer, criterion, device=device)
            lr_finder.range_test(trn_loader,
                                 start_lr=args.start_lr,
                                 end_lr=args.end_lr,
                                 num_iter=100,
                                 accumulation_steps=args.accum_iter)
            fig_name = 'lr_curve.png'
            lr_finder.plot(fig_name)
            lr_finder.reset()
            break

        scheduler = CosineAnnealingWarmRestarts(optimizer,
                                                T_0=epochs,
                                                T_mult=1,
                                                eta_min=1e-6)
        scaler = GradScaler()
        for epoch in range(epochs):
            train_one_epoch(fold,
                            epoch,
                            model,
                            criterion,
                            optimizer,
                            trn_loader,
                            device,
                            scheduler=scheduler)
            valid_one_epoch(fold,
                            epoch,
                            model,
                            criterion,
Example #12
0
class model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=3, kernel_size=3)

    def forward(self, x):
        pass


net = model()

optimizer = torch.optim.Adam(net.parameters(), lr=initial_lr)
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)
# scheduler = StepLR(optimizer, initial_lr, total_epoch)
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=5)
# scheduler = LambdaLR(optimizer, lambda step : (1.0-step/total_epoch), last_epoch=-1)
# scheduler = CosineAnnealingWarmRestarts(optimizer,T_0=5,T_mult=2)
# scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,
#                                               lambda step: (1.0-step/total_epoch) if step <= total_epoch else 0, last_epoch=-1)
print("初始化的学习率:", optimizer.defaults['lr'])

lr_list = []  # record every learning rate used so the schedule can be plotted later

for epoch in range(1, total_epoch):
    optimizer.zero_grad()
    optimizer.step()
    print("Learning rate at epoch %d: %f" % (epoch, optimizer.param_groups[0]['lr']))
    print(scheduler.get_last_lr())
    lr_list.append(optimizer.param_groups[0]['lr'])
    # lr_list.append(scheduler.get_last_lr()[0])
    scheduler.step()  # advance the warm-restart schedule once per epoch
Example #13
0
class WarmRestartsCustomScheduler(_LRScheduler):
    """Custom Learning Rate Scheduler based on the 3rd Place Solution.

    This sets the learning rate schedule:
    warm restarts for epochs (1-28),
    LR=1e-5 (29-32), LR=5e-6 (33-35)

    The general version looks like this:
    # from:
    # https://github.com/naivelamb/kaggle-cloud-organization/blob/master/main_seg.py
    if epoch < start_epoch + n_epochs - 1:
        if epoch != 0:
            scheduler.step()
            scheduler=warm_restart(scheduler, T_mult=2)
    elif (epoch < start_epoch + n_epochs + 2 and
          epoch >= start_epoch + n_epochs - 1):
        optimizer.param_groups[0]['lr'] = 1e-5
    else:
        optimizer.param_groups[0]['lr'] = 5e-6

    """
    def __init__(self,
                 optimizer,
                 T_0,
                 T_mult=2,
                 eta_min=0,
                 num_wr_epochs=28,
                 mid_const_lr_epochs_range=[29, 32],
                 constant_lrs=[1e-5, 5e-6],
                 last_epoch=-1):
        """
        Args:
            optimizer (torch.optim.Optimizer): wrapped optimizer
            T_0 (int): number of epochs before the first warm restart
                (passed through to CosineAnnealingWarmRestarts)
            T_mult (int): factor by which the restart period grows after
                each restart
            eta_min (float): minimum learning rate during annealing
            num_wr_epochs (int): The number of warm restart epochs to do
            mid_const_lr_epochs_range (list-like[int]): [min, max] where max
                is not included. This is the epoch interval where the first
                lr of constant_lr is used
            constant_lrs (list-like[float]): the learning rates to use for the
                mid and end intervals after warm restarts ends.
        """
        self.num_wr_epochs = num_wr_epochs
        assert len(mid_const_lr_epochs_range) == 2, \
            "`constant_lrs` must be a list-like with length 2."
        self.mid_const_lr_epochs_range = mid_const_lr_epochs_range
        assert len(constant_lrs) == 2, \
            "`constant_lrs` must be a list-like with length 2."
        self.constant_lrs = constant_lrs

        self.optimizer = optimizer
        self.warm_restarts = CosineAnnealingWarmRestarts(
            self.optimizer, T_0, T_mult, eta_min)
        super().__init__(optimizer, last_epoch=last_epoch)

    def get_lr(self):
        """No calculation done here.
        """
        return self.get_last_lr()

    def step(self, epoch=None):
        """Computes a step for the learning rate scheduler.

        Here, a step is an epoch. This is where the learning rates are set
        and the last_epoch counter is updated.
        """
        # warm restarts
        if self.last_epoch < self.num_wr_epochs + 1:
            self.warm_restarts.step()
            self.last_epoch = self.warm_restarts.last_epoch
            self._last_lr = self.warm_restarts.get_last_lr()
        # constant LR (first round)
        elif (self.last_epoch >= self.mid_const_lr_epochs_range[0]
              and self.last_epoch < self.mid_const_lr_epochs_range[1]):
            self.last_epoch += 1
            for param_group in self.optimizer.param_groups:
                param_group['lr'] = self.constant_lrs[0]
            self._last_lr = [
                group['lr'] for group in self.optimizer.param_groups
            ]
        # constant LR (second round)
        else:
            for param_group in self.optimizer.param_groups:
                param_group['lr'] = self.constant_lrs[1]
            self.last_epoch += 1
            self._last_lr = [
                group['lr'] for group in self.optimizer.param_groups
            ]
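
Not part of the original solution: a minimal usage sketch, assuming the WarmRestartsCustomScheduler class above is in scope and using a toy nn.Linear model; the optimizer, T_0/T_mult values, and epoch count are illustrative only.

import torch
import torch.nn as nn

# Toy model and optimizer (illustrative names and values, not from the source).
net = nn.Linear(10, 2)
optimizer = torch.optim.SGD(net.parameters(), lr=1e-3)

# With the default arguments this is intended to give warm restarts for roughly
# the first 28 epochs, then a constant 1e-5 for epochs 29-31 and 5e-6 afterwards.
scheduler = WarmRestartsCustomScheduler(optimizer, T_0=7, T_mult=2)

for epoch in range(35):
    # ... one epoch of training would go here ...
    scheduler.step()  # one scheduler step per epoch
    print(epoch, scheduler.get_last_lr())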
Example #14
0
conv_lr, layer_lr, head_lr = lrs
opt = torch.optim.AdamW(
    params=[
        # {'params': model.conv1.parameters(), 'lr': conv_lr},
        {
            'params': model.layer4.parameters(),
            'lr': layer_lr
        },
        {
            'params': model.last_linear.parameters(),
            'lr': head_lr
        }
    ],
    weight_decay=0.01)
sched = CosineAnnealingWarmRestarts(opt,
                                    T_0=len(loaders['train']),
                                    T_mult=2,
                                    eta_min=1e-6)
loss_fn = nn.CrossEntropyLoss()
runner = SupervisedRunner()

runner.train(model=model,
             num_epochs=epochs,
             criterion=loss_fn,
             optimizer=opt,
             scheduler=sched,
             logdir='/tmp/cells_split/',
             loaders=loaders,
             callbacks=[
                 AccuracyCallback(num_classes=num_classes),
                 BatchMetricsPlotCallback(use_env_creds=True),
                 EpochMetricsPlotCallback(use_env_creds=True)
Example #15
0
def cos_lr_scheduler(optimizer, t_0=5):
    # CosineAnnealingWarmRestarts takes T_0 as its second positional argument,
    # so name the parameter accordingly and pass it by keyword.
    return CosineAnnealingWarmRestarts(optimizer, T_0=t_0)
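
Not part of the original snippet: a minimal sketch of stepping such a scheduler per batch with a fractional epoch value, following the pattern from the PyTorch documentation for CosineAnnealingWarmRestarts; the model, loop sizes, and helper call are illustrative and assume the cos_lr_scheduler function above is in scope.

import torch

model = torch.nn.Linear(10, 2)                    # illustrative model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = cos_lr_scheduler(optimizer)           # helper defined above

num_epochs, iters_per_epoch = 10, 100             # illustrative sizes
for epoch in range(num_epochs):
    for i in range(iters_per_epoch):
        optimizer.zero_grad()
        # ... forward/backward would go here ...
        optimizer.step()
        # Passing a fractional epoch lets warm restarts land mid-epoch.
        scheduler.step(epoch + i / iters_per_epoch)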