def train_experiment(device, engine=None):
    with TemporaryDirectory() as logdir:
        # sample data
        num_samples, num_features, num_classes = int(1e4), int(1e1), 4
        X = torch.rand(num_samples, num_features)
        y = (torch.rand(num_samples) * num_classes).to(torch.int64)

        # pytorch loaders
        dataset = TensorDataset(X, y)
        loader = DataLoader(dataset, batch_size=32, num_workers=1)
        loaders = {"train": loader, "valid": loader}

        # model, criterion, optimizer, scheduler
        model = torch.nn.Linear(num_features, num_classes)
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters())
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [2])

        # model training
        runner = dl.SupervisedRunner(
            input_key="features", output_key="logits", target_key="targets", loss_key="loss"
        )
        callbacks = [
            dl.AccuracyCallback(
                input_key="logits", target_key="targets", num_classes=num_classes
            ),
            dl.PrecisionRecallF1SupportCallback(
                input_key="logits", target_key="targets", num_classes=num_classes
            ),
            dl.ConfusionMatrixCallback(
                input_key="logits", target_key="targets", num_classes=num_classes
            ),
        ]
        # AUCCallback is skipped for the AMP-based engines
        if engine is None or not isinstance(
            engine,
            (dl.AMPEngine, dl.DataParallelAMPEngine, dl.DistributedDataParallelAMPEngine),
        ):
            callbacks.append(dl.AUCCallback(input_key="logits", target_key="targets"))

        runner.train(
            engine=engine or dl.DeviceEngine(device),
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            loaders=loaders,
            logdir=logdir,
            num_epochs=1,
            valid_loader="valid",
            valid_metric="accuracy03",
            minimize_valid_metric=False,
            verbose=False,
            callbacks=callbacks,
        )
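# A minimal sketch of invoking the experiment above; the device strings are
# illustrative assumptions, not from the source:
train_experiment("cpu")
if torch.cuda.is_available():
    train_experiment("cuda:0")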
def test_resume_with_missing_file():
    old_stdout = sys.stdout
    sys.stdout = str_stdout = StringIO()

    # experiment_setup
    logdir = "./logs/checkpoint_callback"
    checkpoint = logdir + "/checkpoints"
    logfile = checkpoint + "/_metrics.json"
    num_epochs = 5

    # data
    num_samples, num_features = int(1e4), int(1e1)
    X = torch.rand(num_samples, num_features)
    y = torch.randint(0, 5, size=[num_samples])
    dataset = TensorDataset(X, y)
    loader = DataLoader(dataset, batch_size=32, num_workers=1)
    loaders = {"train": loader, "valid": loader}

    # model, criterion, optimizer, scheduler
    model = torch.nn.Linear(num_features, 5)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())
    runner = dl.SupervisedRunner()

    # resuming from a non-existing checkpoint must fail
    with pytest.raises(FileNotFoundError):
        runner.train(
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            loaders=loaders,
            logdir=logdir,
            num_epochs=num_epochs,
            verbose=False,
            valid_loader="valid",
            valid_metric="loss",
            minimize_valid_metric=True,
            callbacks=[
                dl.CheckpointCallback(
                    logdir=logdir,
                    loader_key="valid",
                    metric_key="loss",
                    minimize=True,
                    save_n_best=2,
                    load_on_stage_end={
                        "model": "best",
                        "criterion": "best",
                        "optimizer": "last",
                    },
                    resume="not_existing_file.pth",
                ),
                dl.CheckRunCallback(num_epoch_steps=num_epochs),
            ],
        )

    sys.stdout = old_stdout
    exp_output = str_stdout.getvalue()
    shutil.rmtree(logdir, ignore_errors=True)
def objective(trial):
    lr = trial.suggest_loguniform("lr", 1e-3, 1e-1)
    num_hidden = int(trial.suggest_loguniform("num_hidden", 32, 128))

    loaders = {
        "train": DataLoader(
            MNIST(os.getcwd(), train=False, download=True, transform=ToTensor()),
            batch_size=32,
        ),
        "valid": DataLoader(
            MNIST(os.getcwd(), train=False, download=True, transform=ToTensor()),
            batch_size=32,
        ),
    }
    model = nn.Sequential(
        nn.Flatten(), nn.Linear(784, num_hidden), nn.ReLU(), nn.Linear(num_hidden, 10)
    )
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    runner = dl.SupervisedRunner(
        input_key="features", output_key="logits", target_key="targets"
    )
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        loaders=loaders,
        callbacks={
            "optuna": dl.OptunaPruningCallback(
                loader_key="valid", metric_key="accuracy01", minimize=False, trial=trial
            ),
            "accuracy": dl.AccuracyCallback(
                input_key="logits", target_key="targets", num_classes=10
            ),
        },
        num_epochs=2,
    )
    score = runner.callbacks["optuna"].best_score
    return score
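# A hedged sketch of running the objective above as an Optuna study
# (assumes the `optuna` package; the trial budget is illustrative):
import optuna

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)
print("best accuracy01:", study.best_value, "with params:", study.best_params)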
def test_pruning_str_structured():
    dataloader = prepare_experiment()
    model = nn.Linear(100, 10, bias=False)
    runner = dl.SupervisedRunner()
    criterion = nn.CrossEntropyLoss()
    runner.train(
        model=model,
        optimizer=torch.optim.Adam(model.parameters()),
        criterion=criterion,
        loaders={"train": dataloader},
        callbacks=[PruningCallback("ln_structured", dim=1, l_norm=2)],
        num_epochs=1,
    )
    assert np.isclose(pruning_factor(model), 0.5)
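# `pruning_factor` is used by the pruning tests but not defined in this
# section; a plausible sketch (an assumption, not the source's helper) that
# returns the fraction of weights zeroed out by pruning:
def pruning_factor(model):
    weight = model.weight.detach()
    return (weight == 0).float().mean().item()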
def test_batch_overfit():
    loaders, model, criterion, optimizer, scheduler = _prepare_experiment()
    runner = dl.SupervisedRunner()
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=loaders,
        logdir="./logs/batch_overfit",
        num_epochs=1,
        verbose=False,
        callbacks=[dl.BatchOverfitCallback(train=1, valid=0.1)],
    )
    assert runner.epoch_metrics["train"]["loss"] < 1.4
    assert runner.epoch_metrics["valid"]["loss"] < 1.3
def train_model(self, config, trainset, sampler, cut_layer=None):  # pylint: disable=unused-argument
    """A custom training loop."""
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(self.model.parameters(), lr=0.02)
    train_loader = DataLoader(
        dataset=trainset, batch_size=config["batch_size"], sampler=sampler
    )

    # training the model using Catalyst's SupervisedRunner
    runner = dl.SupervisedRunner()
    runner.train(
        model=self.model,
        criterion=criterion,
        optimizer=optimizer,
        loaders={"train": train_loader},
        num_epochs=1,
        logdir="./logs",
        verbose=True,
    )
def test_classification_pipeline():
    """
    Test if the classification pipeline can run and compute metrics.

    In this test we check that BatchMetricCallback works with
    AccuracyMetric (an ICallbackBatchMetric).
    """
    x = torch.rand(NUM_SAMPLES, NUM_FEATURES)
    y = (torch.rand(NUM_SAMPLES) * NUM_CLASSES).long()
    dataset = TensorDataset(x, y)
    loader = DataLoader(dataset, batch_size=64, num_workers=1)

    model = DummyModel(num_features=NUM_FEATURES, num_classes=NUM_CLASSES)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())
    runner = dl.SupervisedRunner(
        input_key="features", output_key="logits", target_key="targets"
    )

    with TemporaryDirectory() as logdir:
        runner.train(
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            loaders=OrderedDict({"train": loader, "valid": loader}),
            logdir=logdir,
            num_epochs=3,
            verbose=False,
            valid_loader="valid",
            valid_metric="loss",
            minimize_valid_metric=True,
            callbacks=OrderedDict({
                "classification": dl.BatchMetricCallback(
                    metric=AccuracyMetric(num_classes=NUM_CLASSES),
                    input_key="logits",
                    target_key="targets",
                ),
            }),
        )

    assert "accuracy01" in runner.batch_metrics
    assert "accuracy01" in runner.loader_metrics
def train_experiment(device):
    with TemporaryDirectory() as logdir:
        # sample data
        num_samples, num_features, num_classes = int(1e4), int(1e1), 4
        X = torch.rand(num_samples, num_features)
        y = (torch.rand(num_samples, num_classes) > 0.5).to(torch.float32)

        # pytorch loaders
        dataset = TensorDataset(X, y)
        loader = DataLoader(dataset, batch_size=32, num_workers=1)
        loaders = {"train": loader, "valid": loader}

        # model, criterion, optimizer, scheduler
        model = torch.nn.Linear(num_features, num_classes)
        criterion = torch.nn.BCEWithLogitsLoss()
        optimizer = torch.optim.Adam(model.parameters())
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [2])

        # model training
        runner = dl.SupervisedRunner(
            input_key="features", output_key="logits", target_key="targets", loss_key="loss"
        )
        runner.train(
            engine=dl.DeviceEngine(device),
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            loaders=loaders,
            logdir=logdir,
            num_epochs=1,
            valid_loader="valid",
            valid_metric="accuracy",
            minimize_valid_metric=False,
            verbose=False,
            callbacks=[
                dl.AUCCallback(input_key="logits", target_key="targets"),
                dl.MultilabelAccuracyCallback(
                    input_key="logits", target_key="targets", threshold=0.5
                ),
            ],
        )
def objective(trial):
    lr = trial.suggest_loguniform("lr", 1e-3, 1e-1)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # `model` and `loaders` come from the enclosing scope; this snippet uses
    # the older Catalyst API with `main_metric`/`minimize_metric`
    runner = dl.SupervisedRunner()
    runner.train(
        model=model,
        loaders=loaders,
        criterion=criterion,
        optimizer=optimizer,
        callbacks=[
            OptunaCallback(trial),
            AccuracyCallback(num_classes=10),
        ],
        num_epochs=10,
        main_metric="accuracy01",
        minimize_metric=False,
    )
    return runner.best_valid_metrics[runner.main_metric]
def test_aggregation_2():
    """Aggregation with a custom function."""
    loaders, model, criterion, optimizer = prepare_experiment()
    runner = dl.SupervisedRunner()

    def aggregation_function(metrics, runner):
        epoch = runner.stage_epoch_step
        loss = (3 / 2 - epoch / 2) * metrics["loss_focal"] + (
            1 / 2 * epoch - 1 / 2
        ) * metrics["loss_bce"]
        return loss

    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        loaders=loaders,
        logdir="./logs/aggregation_2/",
        num_epochs=3,
        callbacks=[
            dl.CriterionCallback(
                input_key="logits",
                target_key="targets",
                metric_key="loss_bce",
                criterion_key="bce",
            ),
            dl.CriterionCallback(
                input_key="logits",
                target_key="targets",
                metric_key="loss_focal",
                criterion_key="focal",
            ),
            # loss aggregation
            dl.MetricAggregationCallback(metric_key="loss", mode=aggregation_function),
        ],
    )

    # at the final epoch (epoch 3) the weights are 0 for loss_focal and 1 for
    # loss_bce, so the aggregated loss must equal loss_bce
    for loader in ["train", "valid"]:
        metrics = runner.epoch_metrics[loader]
        loss_1 = metrics["loss_bce"]
        loss_2 = metrics["loss"]
        assert np.abs(loss_1 - loss_2) < 1e-5
def test_aggregation_1():
    """Aggregation as weighted_sum."""
    loaders, model, criterion, optimizer = prepare_experiment()
    runner = dl.SupervisedRunner()
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        loaders=loaders,
        logdir="./logs/aggregation_1/",
        num_epochs=3,
        callbacks=[
            dl.CriterionCallback(
                input_key="logits",
                target_key="targets",
                metric_key="loss_bce",
                criterion_key="bce",
            ),
            dl.CriterionCallback(
                input_key="logits",
                target_key="targets",
                metric_key="loss_focal",
                criterion_key="focal",
            ),
            # loss aggregation
            dl.MetricAggregationCallback(
                metric_key="loss",
                metrics={"loss_focal": 0.6, "loss_bce": 0.4},
                mode="weighted_sum",
            ),
        ],
    )

    for loader in ["train", "valid"]:
        metrics = runner.epoch_metrics[loader]
        loss_1 = metrics["loss_bce"] * 0.4 + metrics["loss_focal"] * 0.6
        loss_2 = metrics["loss"]
        assert np.abs(loss_1 - loss_2) < 1e-5
def test_model(self, config, testset):  # pylint: disable=unused-argument
    """A custom testing loop."""
    test_loader = torch.utils.data.DataLoader(
        testset, batch_size=config["batch_size"], shuffle=False
    )

    # using Catalyst's SupervisedRunner and AccuracyCallback to compute accuracy
    runner = dl.SupervisedRunner()
    runner.train(
        model=self.model,
        num_epochs=1,
        loaders={"valid": test_loader},
        logdir="./logs",
        verbose=True,
        callbacks=[
            dl.AccuracyCallback(input_key="logits", target_key="targets", num_classes=10)
        ],
    )

    # retrieving the top-1 accuracy from the runner's epoch metrics
    accuracy = runner.epoch_metrics["valid"]["accuracy"]
    return accuracy
def test_parametrization():
    dataloader = prepare_experiment()
    model = nn.Linear(100, 10, bias=False)
    runner = dl.SupervisedRunner()
    criterion = nn.CrossEntropyLoss()
    runner.train(
        model=model,
        optimizer=torch.optim.Adam(model.parameters()),
        criterion=criterion,
        loaders={"train": dataloader},
        callbacks=[
            PruningCallback(l1_unstructured, remove_reparametrization_on_stage_end=False)
        ],
        num_epochs=1,
    )
    assert np.isclose(pruning_factor(model), 0.5)

    # with the reparametrization kept, the pruning mask must still be attached
    try:
        _mask = model.weight_mask
        mask_applied = True
    except AttributeError:
        mask_applied = False
    assert mask_applied
def objective(trial):
    lr = trial.suggest_loguniform("lr", 1e-3, 1e-1)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    runner = dl.SupervisedRunner()
    runner.train(
        model=model,
        loaders=loaders,
        criterion=criterion,
        optimizer=optimizer,
        callbacks={
            "optuna": OptunaPruningCallback(
                loader_key="valid", metric_key="loss", minimize=True, trial=trial
            ),
            "accuracy": AccuracyCallback(
                num_classes=10, input_key="logits", target_key="targets"
            ),
        },
        num_epochs=2,
        valid_metric="accuracy01",
        minimize_valid_metric=False,
    )
    return trial.best_score
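# A hedged sketch of driving this pruning-enabled objective; MedianPruner is
# one of Optuna's built-in pruners, but its use here is an assumption:
import optuna

study = optuna.create_study(
    direction="maximize",
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=1),
)
study.optimize(objective, n_trials=5)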
def train():
    num_features = 10
    model = Projector(num_features)
    runner = dl.SupervisedRunner()
    runner.train(
        model=model,
        # loaders={"train": loader, "valid": loader},
        datasets={
            "batch_size": 32,
            "num_workers": 1,
            "get_datasets_fn": datasets_fn,
            "num_features": num_features,
        },
        criterion=nn.MSELoss(),
        optimizer=optim.Adam(model.parameters()),
        logdir="logs/log_example_14",
        num_epochs=10,
        verbose=True,
        check=True,
        fp16=False,
        distributed=False,
    )
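# `datasets_fn` and `Projector` are referenced above but not defined in this
# snippet; a hedged sketch of a compatible dataset factory (its exact
# signature in the source is an assumption; assumes the torch/TensorDataset
# imports used by the surrounding examples):
def datasets_fn(num_features: int):
    X = torch.rand(int(1e4), num_features)
    y = torch.rand(X.shape[0], 1)
    dataset = TensorDataset(X, y)
    return {"train": dataset, "valid": dataset}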
def test_batch_balance_class_sampler_with_prefetch():
    train_data = MNIST(os.getcwd(), train=True, download=True, transform=ToTensor())
    train_labels = train_data.targets.cpu().numpy().tolist()
    train_sampler = BatchBalanceClassSampler(train_labels, num_classes=10, num_samples=4)
    valid_data = MNIST(os.getcwd(), train=False, download=True, transform=ToTensor())

    loaders = {
        "train": DataLoader(train_data, batch_sampler=train_sampler),
        "valid": DataLoader(valid_data, batch_size=32),
    }
    loaders = {k: BatchPrefetchLoaderWrapper(v) for k, v in loaders.items()}

    model = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.02)
    runner = dl.SupervisedRunner()
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        loaders=loaders,
        num_epochs=1,
        logdir="./logs",
        valid_loader="valid",
        valid_metric="loss",
        minimize_valid_metric=True,
        verbose=True,
    )
def test_pruning_callback() -> None:
    """Test that QuantizationCallback quantizes the model and saves it to disk."""
    loaders = {
        "train": DataLoader(
            MNIST(os.getcwd(), train=False, download=True, transform=ToTensor()),
            batch_size=32,
        ),
        "valid": DataLoader(
            MNIST(os.getcwd(), train=False, download=True, transform=ToTensor()),
            batch_size=32,
        ),
    }
    model = nn.Sequential(Flatten(), nn.Linear(784, 512), nn.ReLU(), nn.Linear(512, 10))
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
    runner = dl.SupervisedRunner()
    runner.train(
        model=model,
        callbacks=[dl.QuantizationCallback(logdir="./logs")],
        loaders=loaders,
        criterion=criterion,
        optimizer=optimizer,
        num_epochs=1,
        logdir="./logs",
        check=True,
    )
    assert os.path.isfile("./logs/quantized.pth")
def train_experiment(device, engine=None):
    with TemporaryDirectory() as logdir:
        model = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.02)
        loaders = {
            "train": DataLoader(
                MNIST(os.getcwd(), train=False, download=True, transform=ToTensor()),
                batch_size=32,
            ),
            "valid": DataLoader(
                MNIST(os.getcwd(), train=False, download=True, transform=ToTensor()),
                batch_size=32,
            ),
        }

        runner = dl.SupervisedRunner(
            input_key="features", output_key="logits", target_key="targets", loss_key="loss"
        )
        callbacks = [
            dl.AccuracyCallback(input_key="logits", target_key="targets", topk_args=(1, 3, 5)),
            dl.PrecisionRecallF1SupportCallback(
                input_key="logits", target_key="targets", num_classes=10
            ),
        ]
        if SETTINGS.ml_required:
            callbacks.append(
                dl.ConfusionMatrixCallback(
                    input_key="logits", target_key="targets", num_classes=10
                )
            )
        # AUCCallback is skipped for the AMP-based engines
        if SETTINGS.amp_required and (
            engine is None
            or not isinstance(
                engine,
                (dl.AMPEngine, dl.DataParallelAMPEngine, dl.DistributedDataParallelAMPEngine),
            )
        ):
            callbacks.append(dl.AUCCallback(input_key="logits", target_key="targets"))
        if SETTINGS.onnx_required:
            callbacks.append(dl.OnnxCallback(logdir=logdir, input_key="features"))
        if SETTINGS.pruning_required:
            callbacks.append(dl.PruningCallback(pruning_fn="l1_unstructured", amount=0.5))
        if SETTINGS.quantization_required:
            callbacks.append(dl.QuantizationCallback(logdir=logdir))
        if engine is None or not isinstance(engine, dl.DistributedDataParallelEngine):
            callbacks.append(dl.TracingCallback(logdir=logdir, input_key="features"))

        # model training
        runner.train(
            engine=engine or dl.DeviceEngine(device),
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            loaders=loaders,
            num_epochs=1,
            callbacks=callbacks,
            logdir=logdir,
            valid_loader="valid",
            valid_metric="loss",
            minimize_valid_metric=True,
            verbose=False,
            load_best_on_end=True,
            timeit=False,
            check=False,
            overfit=False,
            fp16=False,
            ddp=False,
        )

        # model inference
        for prediction in runner.predict_loader(loader=loaders["valid"]):
            assert prediction["logits"].detach().cpu().numpy().shape[-1] == 10

        # model post-processing
        features_batch = next(iter(loaders["valid"]))[0]

        # model stochastic weight averaging
        model.load_state_dict(
            utils.get_averaged_weights_by_path_mask(logdir=logdir, path_mask="*.pth")
        )

        # model onnx export
        if SETTINGS.onnx_required:
            utils.onnx_export(
                model=runner.model,
                batch=runner.engine.sync_device(features_batch),
                file="./mnist.onnx",
                verbose=False,
            )

        # model quantization
        if SETTINGS.quantization_required:
            utils.quantize_model(model=runner.model)

        # model pruning
        if SETTINGS.pruning_required:
            utils.prune_model(model=runner.model, pruning_fn="l1_unstructured", amount=0.8)

        # model tracing
        utils.trace_model(model=runner.model, batch=features_batch)
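# A hedged sketch of driving the full experiment above; dl.DeviceEngine exists
# in Catalyst 21.x, but which engines the source actually exercised is an
# assumption:
if __name__ == "__main__":
    train_experiment("cpu")
    if torch.cuda.is_available():
        train_experiment("cuda:0", engine=dl.DeviceEngine("cuda:0"))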
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, num_workers=6, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, num_workers=6, shuffle=False)

loaders = OrderedDict()
loaders["train"] = train_loader
loaders["valid"] = val_loader

runner = dl.SupervisedRunner(
    device=tu.device, input_key="image", input_target_key="label", output_key="logits"
)
callbacks = [
    CriterionCallback(input_key="label", output_key="logits", prefix="loss"),
    AccuracyCallback(
        input_key="label", output_key="logits", prefix="acc", activation="Sigmoid"
    ),
    OptimizerCallback(accumulation_steps=2),
    # MixupCallback(alpha=0.3, input_key="label", output_key="logits", fields=("image",)),
]

if TRAINING:
    runner.train(model=model,
def main(args):
    wandb.init(project="teacher-pruning", config=vars(args))
    set_global_seed(42)

    # dataloader initialization
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
    train_dataset = Wrp(
        datasets.CIFAR10(
            root=os.getcwd(), train=True, transform=transform_train, download=True
        )
    )
    valid_dataset = Wrp(
        datasets.CIFAR10(root=os.getcwd(), train=False, transform=transform_test)
    )
    train_dataloader = DataLoader(
        dataset=train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=2
    )
    valid_dataloader = DataLoader(dataset=valid_dataset, batch_size=128, num_workers=2)
    loaders = {
        "train": train_dataloader,
        "valid": valid_dataloader,
    }

    # model initialization
    model = PreActResNet18()
    model.fc = nn.Linear(512, 10)
    if args.teacher_model is not None:
        is_kd = True
        teacher_model = NAME2MODEL[args.teacher_model]()
        load_model_from_path(model=teacher_model, path=args.teacher_path)
        model = {
            "student": model,
            "teacher": teacher_model,
        }
        output_hiddens = args.beta is None
        is_kd_on_hiddens = output_hiddens
        runner = KDRunner(device=args.device, output_hiddens=output_hiddens)
        parameters = model["student"].parameters()
    else:
        is_kd = False
        runner = dl.SupervisedRunner(device=args.device)
        parameters = model.parameters()

    # optimizer
    optimizer_cls = NAME2OPTIM[args.optimizer]
    optimizer_kwargs = {"params": parameters, "lr": args.lr}
    if args.optimizer == "sgd":
        optimizer_kwargs["momentum"] = args.momentum
    else:
        optimizer_kwargs["betas"] = (args.beta1, args.beta2)
    optimizer = optimizer_cls(**optimizer_kwargs)
    scheduler = MultiStepLR(optimizer, milestones=[80, 120], gamma=args.gamma)
    logdir = f"logs/{wandb.run.name}"

    # callbacks
    callbacks = [dl.AccuracyCallback(num_classes=10), WandbCallback()]
    if is_kd:
        metrics = {}
        callbacks.append(dl.CriterionCallback(output_key="cls_loss"))
        callbacks.append(DiffOutputCallback())
        coefs = get_loss_coefs(args.alpha, args.beta)
        metrics["cls_loss"] = coefs[0]
        metrics["diff_output_loss"] = coefs[1]
        if is_kd_on_hiddens:
            callbacks.append(DiffHiddenCallback())
            metrics["diff_hidden_loss"] = coefs[2]
        aggregator_callback = dl.MetricAggregationCallback(
            prefix="loss", metrics=metrics, mode="weighted_sum"
        )
        wrapped_agg_callback = dl.ControlFlowCallback(aggregator_callback, loaders=["train"])
        callbacks.append(wrapped_agg_callback)

    runner.train(
        model=model,
        optimizer=optimizer,
        scheduler=scheduler,
        criterion=nn.CrossEntropyLoss(),
        loaders=loaders,
        callbacks=callbacks,
        num_epochs=args.epoch,
        logdir=logdir,
        verbose=True,
    )
def run_ml_pipeline(sampler_inbatch: data.IInbatchTripletSampler) -> float:
    """
    Full metric learning pipeline, including train and val.

    This function is also used as the minimal example in README.md,
    section name: 'CV - MNIST with Metric Learning'.

    Args:
        sampler_inbatch: sampler for forming triplets

    Returns:
        best metric value
    """
    # 1. train and valid datasets
    dataset_root = "./data"
    transforms = t.Compose([t.ToTensor(), t.Normalize((0.1307,), (0.3081,))])
    dataset_train = datasets.MnistMLDataset(
        root=dataset_root, train=True, download=True, transform=transforms
    )
    sampler = data.BalanceBatchSampler(labels=dataset_train.get_labels(), p=5, k=10)
    train_loader = DataLoader(
        dataset=dataset_train, sampler=sampler, batch_size=sampler.batch_size
    )
    dataset_val = datasets.MnistQGDataset(
        root=dataset_root, transform=transforms, gallery_fraq=0.2
    )
    val_loader = DataLoader(dataset=dataset_val, batch_size=1024)

    # 2. model and optimizer
    model = models.SimpleConv(features_dim=16)
    optimizer = Adam(model.parameters(), lr=0.0005)

    # 3. criterion with triplets sampling
    criterion = nn.TripletMarginLossWithSampler(margin=0.5, sampler_inbatch=sampler_inbatch)

    # 4. training with catalyst Runner
    callbacks = [
        dl.ControlFlowCallback(dl.CriterionCallback(), loaders="train"),
        dl.ControlFlowCallback(dl.CMCScoreCallback(topk_args=[1]), loaders="valid"),
        dl.PeriodicLoaderCallback(valid=100),
    ]
    runner = dl.SupervisedRunner(device=utils.get_device())
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        callbacks=callbacks,
        loaders={"train": train_loader, "valid": val_loader},
        minimize_metric=False,
        verbose=True,
        valid_loader="valid",
        num_epochs=100,
        main_metric="cmc01",
    )
    return runner.best_valid_metrics["cmc01"]
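# A hedged example of calling the pipeline with one of Catalyst's in-batch
# triplet samplers; HardTripletsSampler exists in catalyst.data, though the
# source may have used a different sampler:
best_cmc = run_ml_pipeline(sampler_inbatch=data.HardTripletsSampler(norm_required=False))
print(f"best cmc01: {best_cmc}")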
item_num = len(train_dataset[0])
model = MultiDAE([200, 600, item_num], dropout=0.5)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
engine = dl.Engine()

callbacks = [
    dl.NDCGCallback("logits", "targets", [20, 50, 100]),
    dl.MAPCallback("logits", "targets", [20, 50, 100]),
    dl.MRRCallback("logits", "targets", [20, 50, 100]),
    dl.HitrateCallback("logits", "targets", [20, 50, 100]),
    dl.BackwardCallback("loss"),
    dl.OptimizerCallback("loss", accumulation_steps=1),
]

runner = dl.SupervisedRunner(
    input_key="inputs", output_key="logits", target_key="targets", loss_key="loss"
)
runner.train(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    engine=engine,
    loaders=loaders,
    num_epochs=100,
    verbose=True,
    timeit=False,
    callbacks=callbacks,
    logdir="./logs_multidae",
)
def test_load_best_on_stage_end():
    old_stdout = sys.stdout
    sys.stdout = str_stdout = StringIO()

    # experiment_setup
    logdir = "./logs/checkpoint_callback"
    checkpoint = logdir  # + "/checkpoints"
    logfile = checkpoint + "/_metrics.json"

    # data
    num_samples, num_features = int(1e4), int(1e1)
    X = torch.rand(num_samples, num_features)
    y = torch.randint(0, 5, size=[num_samples])
    dataset = TensorDataset(X, y)
    loader = DataLoader(dataset, batch_size=32, num_workers=1)
    loaders = {"train": loader, "valid": loader}

    # model, criterion, optimizer, scheduler
    model = torch.nn.Linear(num_features, 5)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())
    runner = dl.SupervisedRunner()
    n_epochs = 5

    # first stage
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        loaders=loaders,
        logdir=logdir,
        num_epochs=n_epochs,
        verbose=False,
        valid_loader="valid",
        valid_metric="loss",
        minimize_valid_metric=True,
        callbacks=[
            dl.CheckpointCallback(
                logdir=logdir,
                loader_key="valid",
                metric_key="loss",
                minimize=True,
                save_n_best=2,
                load_on_stage_end="best",
            ),
            dl.CheckRunCallback(num_epoch_steps=n_epochs),
        ],
    )

    sys.stdout = old_stdout
    exp_output = str_stdout.getvalue()

    # exactly one checkpoint load must happen: the best one at stage end
    assert len(re.findall(r"=> Loading", exp_output)) == 1
    assert len(re.findall(r"=> Loading .*best\.pth", exp_output)) == 1

    assert os.path.isfile(logfile)
    assert os.path.isfile(checkpoint + "/train.4.pth")
    assert os.path.isfile(checkpoint + "/train.4_full.pth")
    assert os.path.isfile(checkpoint + "/train.5.pth")
    assert os.path.isfile(checkpoint + "/train.5_full.pth")
    assert os.path.isfile(checkpoint + "/best.pth")
    assert os.path.isfile(checkpoint + "/best_full.pth")
    assert os.path.isfile(checkpoint + "/last.pth")
    assert os.path.isfile(checkpoint + "/last_full.pth")

    shutil.rmtree(logdir, ignore_errors=True)
callbacks = {
    "optimizer": dl.OptimizerCallback(
        metric_key="loss", accumulation_steps=1, grad_clip_params=None
    ),
    "metric": dl.MetricCallback(
        scope="batch",
        input_key="y",
        output_key="preds",
        prefix="F1_token",
        metric_fn=ner_token_f1,
    ),
}

runner = dl.SupervisedRunner(
    input_key="features",
    output_key="preds",
    input_target_key="y",
)
runner.train(
    model=model,
    criterion=loss,
    optimizer=optimizer,
    scheduler=scheduler,
    loaders=dataloaders,
    callbacks=callbacks,
    logdir="./checkpoints",
    num_epochs=100,
    main_metric="F1_token",
    minimize_metric=False,
    verbose=True,
)
def main():
    cifar_train = CIFAR10(
        ".",
        train=True,
        transform=transforms.Compose(
            [transforms.Resize((224, 224)), transforms.ToTensor()]
        ),
        download=True,
    )
    cifar_test = CIFAR10(
        ".",
        train=False,
        transform=transforms.Compose(
            [transforms.Resize((224, 224)), transforms.ToTensor()]
        ),
        download=True,
    )
    dl_train = DataLoader(cifar_train, batch_size=16)
    dl_test = DataLoader(cifar_test, batch_size=16)

    logdir = "./logdir/Adam"
    num_epochs = 10
    loaders = {"train": dl_train, "valid": dl_test}

    model = resnet34()
    for name, param in model.named_parameters():
        param.requires_grad = True
    model.train()

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())
    runner = dl.SupervisedRunner()
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        loaders=loaders,
        num_epochs=num_epochs,
        verbose=True,
        logdir=logdir,
        callbacks=[logger.TensorboardLogger(), AccuracyCallback(num_classes=10)],
    )

    logdir = "./logdir/AdamW"
    model.apply(init_weights)
    # like any torch optimizer, AdamW must be given the model parameters
    optimizer = AdamW(model.parameters())
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        loaders=loaders,
        num_epochs=num_epochs,
        verbose=True,
        logdir=logdir,
        callbacks=[logger.TensorboardLogger(), AccuracyCallback(num_classes=10)],
    )

    logdir = "./logdir/RAdam"
    model.apply(init_weights)
    optimizer = RAdam(model.parameters())
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        loaders=loaders,
        num_epochs=num_epochs,
        verbose=True,
        logdir=logdir,
        callbacks=[logger.TensorboardLogger(), AccuracyCallback(num_classes=10)],
    )
def train(dev_dir, logdir, device):
    if not config.joined_vocab:
        spm.SentencePieceTrainer.train(
            input=f"{dev_dir}/text",
            model_prefix=f"{dev_dir}/txt_bpe_ctx",
            model_type="bpe",
            vocab_size=config.src_vocab_size,
        )
        spm.SentencePieceTrainer.train(
            input=f"{dev_dir}/cmd",
            model_prefix=f"{dev_dir}/cmd_bpe_ctx",
            model_type="bpe",
            vocab_size=config.tgt_vocab_size,
        )
        text_tokenizer = spm.SentencePieceProcessor(f"{dev_dir}/txt_bpe_ctx.model")
        cmd_tokenizer = spm.SentencePieceProcessor(f"{dev_dir}/cmd_bpe_ctx.model")
    else:
        spm.SentencePieceTrainer.train(
            input=f"{dev_dir}/all",
            model_prefix=f"{dev_dir}/all_bpe_ctx",
            model_type="bpe",
            vocab_size=config.src_vocab_size,
        )
        text_tokenizer = spm.SentencePieceProcessor(f"{dev_dir}/all_bpe_ctx.model")
        cmd_tokenizer = text_tokenizer

    train = pd.read_csv(f"{dev_dir}/train.csv", index_col=0)
    train = train.dropna()
    train["cmd_cleaned"] = train["cmd_cleaned"].apply(lambda cmd: cmd.replace("|", " |"))
    train["util"] = train.cmd_cleaned.apply(lambda x: x.strip(" $()").split()[0])
    train = train[train.util != "]"]
    train = train.reset_index(drop=True)

    mandf = pd.read_csv(f"{dev_dir}/man.csv", index_col=0)
    mandf["ctx"] = mandf.apply(make_ctx, axis=1)
    mandf = mandf.drop_duplicates(subset="cmd")
    mandf = mandf.set_index("cmd")

    train["ctx"] = train["util"].map(mandf.ctx)
    train.text_cleaned = train.text_cleaned + " " + train.ctx.fillna("")
    train["text_enc"] = train.text_cleaned.progress_apply(text_tokenizer.encode)
    train["cmd_enc"] = train.cmd_cleaned.progress_apply(cmd_tokenizer.encode)

    tdf = train[train.origin == "original"]
    tdf2 = train[train.origin != "original"]
    train, valid = train_test_split(tdf, test_size=500, random_state=SEED)
    train = pd.concat([train, tdf2]).reset_index(drop=True)

    train_ds = MtDataset(train.text_enc, train.cmd_enc, config, bos_id, eos_id, pad_id)
    valid_ds = MtDataset(valid.text_enc, valid.cmd_enc, config, bos_id, eos_id, pad_id)

    model = Transformer(config, pad_id)
    print("# params", sum(p.numel() for p in model.parameters() if p.requires_grad))

    loaders = {
        "train": data.DataLoader(train_ds, batch_size=config.batch_size, shuffle=True),
        "valid": data.DataLoader(valid_ds, batch_size=config.batch_size),
    }
    criterion = nn.CrossEntropyLoss(ignore_index=pad_id)
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=config.optimizer_lr,
        weight_decay=config.weight_decay,
        amsgrad=True,
    )
    callbacks = [
        dl.CheckpointCallback(config.num_epochs),
        dl.SchedulerCallback(mode="epoch"),
    ]
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        factor=config.plateau_factor,
        patience=3,
        cooldown=2,
        threshold=1e-3,
        min_lr=1e-6,
    )

    shutil.rmtree(logdir, ignore_errors=True)
    os.makedirs(logdir, exist_ok=True)

    runner = dl.SupervisedRunner(device=device)
    runner.train(
        model=model,
        loaders=loaders,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler if config.schedule else None,
        num_epochs=config.num_epochs,
        verbose=True,
        logdir=logdir,
        callbacks=callbacks,
        # check=True
    )
def train_experiment(device, engine=None):
    with TemporaryDirectory() as logdir:
        # sample data
        num_users, num_features, num_items = int(1e4), int(1e1), 10
        X = torch.rand(num_users, num_features)
        y = (torch.rand(num_users, num_items) > 0.5).to(torch.float32)

        # pytorch loaders
        dataset = TensorDataset(X, y)
        loader = DataLoader(dataset, batch_size=32, num_workers=1)
        loaders = {"train": loader, "valid": loader}

        # model, criterion, optimizer, scheduler
        model = torch.nn.Linear(num_features, num_items)
        criterion = torch.nn.BCEWithLogitsLoss()
        optimizer = torch.optim.Adam(model.parameters())
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [2])

        callbacks = [
            dl.BatchTransformCallback(
                input_key="logits",
                output_key="scores",
                transform=torch.sigmoid,
                scope="on_batch_end",
            ),
            dl.CriterionCallback(input_key="logits", target_key="targets", metric_key="loss"),
            dl.AUCCallback(input_key="scores", target_key="targets"),
            dl.HitrateCallback(input_key="scores", target_key="targets", topk_args=(1, 3, 5)),
            dl.MRRCallback(input_key="scores", target_key="targets", topk_args=(1, 3, 5)),
            dl.MAPCallback(input_key="scores", target_key="targets", topk_args=(1, 3, 5)),
            dl.NDCGCallback(input_key="scores", target_key="targets", topk_args=(1, 3, 5)),
            dl.OptimizerCallback(metric_key="loss"),
            dl.SchedulerCallback(),
            dl.CheckpointCallback(
                logdir=logdir, loader_key="valid", metric_key="map01", minimize=False
            ),
        ]
        # an extra logits-based AUCCallback, skipped for the AMP-based engines
        if SETTINGS.amp_required and (
            engine is None
            or not isinstance(
                engine,
                (dl.AMPEngine, dl.DataParallelAMPEngine, dl.DistributedDataParallelAMPEngine),
            )
        ):
            callbacks.append(dl.AUCCallback(input_key="logits", target_key="targets"))

        # model training
        runner = dl.SupervisedRunner(
            input_key="features", output_key="logits", target_key="targets", loss_key="loss"
        )
        runner.train(
            engine=engine or dl.DeviceEngine(device),
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            loaders=loaders,
            num_epochs=1,
            verbose=False,
            callbacks=callbacks,
        )
def train():
    load_to_mem_train = True
    features_stats = {"mean": 86, "std": 22}
    batch_size = 512
    train_batch_size = batch_size
    validation_batch_size = train_batch_size
    path_to_train_data = "./data/external/train/"
    path_to_targets_train = "./data/external/train.csv"
    path_to_val_data = "./data/external/val/"
    path_to_targets_val = "./data/external/val.csv"
    path_to_save_model = "./models/model.pt"
    n_epochs = 2
    es_rounds = 35
    lr = 0.001
    backbone_output_dim = 1024

    backbone = VGGNet()
    model = Supervised1dModel(
        backbone=backbone, backbone_output_dim=backbone_output_dim, num_classes=3
    )
    n_workers = os.cpu_count()

    # define train and val datasets
    train_dataset = CustomDataset(
        path_to_data=path_to_train_data,
        files_to_use=None,
        load_data_to_mem=load_to_mem_train,
        path_to_targets=path_to_targets_train,
        features_stats=features_stats,
        mode="train",
    )
    val_dataset = CustomDataset(
        path_to_data=path_to_val_data,
        files_to_use=None,
        load_data_to_mem=load_to_mem_train,
        path_to_targets=path_to_targets_val,
        features_stats=features_stats,
        mode="val",
    )

    # define train and val loaders
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=train_batch_size,
        collate_fn=collate_fn,
        num_workers=n_workers,
        shuffle=True,
    )
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=validation_batch_size,
        collate_fn=collate_fn,
        num_workers=n_workers,
        shuffle=False,
    )

    # train
    runner = dl.SupervisedRunner()
    criterion = torch.nn.CrossEntropyLoss()
    callbacks = [
        dl.F1ScoreCallback(),
        dl.EarlyStoppingCallback(patience=es_rounds, minimize=True),
    ]

    print("\n\n")
    print("Main training")
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=25, verbose=True
    )
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders={"train": train_loader, "valid": val_loader},
        num_epochs=n_epochs,
        callbacks=callbacks,
        logdir="./logdir/",
        load_best_on_end=True,
        main_metric="f1_score",
        minimize_metric=False,
        fp16=True,
        verbose=True,
    )

    # save the trained model
    torch.save(model, path_to_save_model)
# sample data
num_samples, num_features, num_classes = int(1e4), int(1e1), 4
X = torch.rand(num_samples, num_features)
y = (torch.rand(num_samples) * num_classes).to(torch.int64)

# pytorch loaders
dataset = TensorDataset(X, y)
loader = DataLoader(dataset, batch_size=32, num_workers=1)
loaders = {"train": loader, "valid": loader}

# model, criterion, optimizer, scheduler
model = torch.nn.Linear(num_features, num_classes)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [2])

# model training
runner = dl.SupervisedRunner()
runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    loaders=loaders,
    logdir="./logdir",
    num_epochs=3,
    check=True,
    callbacks=[dl.AccuracyCallback(num_classes=num_classes)],
)
def train(dev_dir, logdir, device):
    train = pd.read_csv(f"{dev_dir}/train.csv", index_col=0)
    train["all_utils"] = train["cmd_cleaned"].apply(select_utils)
    train = train.loc[train.all_utils.apply(str.strip).apply(len) > 0]
    train["util"] = train["all_utils"].apply(lambda x: x.split()[0])
    train = train.dropna().reset_index(drop=True)

    spm.SentencePieceTrainer.train(
        input=f"{dev_dir}/text",
        model_prefix=f"{dev_dir}/txt_bpe_clf",
        model_type="bpe",
        vocab_size=config.src_vocab_size,
    )
    text_tokenizer = spm.SentencePieceProcessor(f"{dev_dir}/txt_bpe_clf.model")
    cmd_le = LabelEncoder()

    train["text_enc"] = train.text_cleaned.progress_apply(text_tokenizer.encode)
    train["y"] = cmd_le.fit_transform(train["util"].values)

    tdf = train[train.origin == "original"]
    tdf2 = train[train.origin != "original"]
    train, valid = train_test_split(tdf, test_size=500, random_state=SEED)
    train = pd.concat([train, tdf2]).reset_index(drop=True)

    train_ds = UtilDataset(train.text_enc, train.y, config, bos_id, eos_id, pad_id)
    valid_ds = UtilDataset(valid.text_enc, valid.y, config, bos_id, eos_id, pad_id)

    model = BertClassifier(config, pad_id, len(cmd_le.classes_))
    print("# params", sum(p.numel() for p in model.parameters() if p.requires_grad))

    loaders = {
        "train": data.DataLoader(train_ds, batch_size=config.batch_size, shuffle=True),
        "valid": data.DataLoader(valid_ds, batch_size=config.batch_size),
    }
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=config.optimizer_lr,
        weight_decay=config.weight_decay,
        amsgrad=True,
    )
    callbacks = [
        dl.CheckpointCallback(config.num_epochs),
        dl.AccuracyCallback(num_classes=len(cmd_le.classes_), topk_args=[1, 5]),
    ]
    if config.schedule == "OneCycleLR":
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=config.optimizer_lr,
            epochs=config.num_epochs,
            steps_per_epoch=len(loaders["train"]),
        )
        callbacks.append(dl.SchedulerCallback(mode="batch"))
    elif config.schedule == "ReduceLROnPlateau":
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            factor=config.plateau_factor,
            patience=5,
            cooldown=3,
            threshold=1e-3,
            min_lr=1e-6,
        )
        callbacks.append(dl.SchedulerCallback(mode="epoch"))

    shutil.rmtree(logdir, ignore_errors=True)
    os.makedirs(logdir, exist_ok=True)

    runner = dl.SupervisedRunner(device=device)
    runner.train(
        model=model,
        loaders=loaders,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler if config.schedule else None,
        num_epochs=config.num_epochs,
        verbose=True,
        logdir=logdir,
        callbacks=callbacks,
    )
    joblib.dump(cmd_le, f"{dev_dir}/cmd_le")