def get_callbacks(self, stage: str):
    return {
        "criterion": dl.CriterionCallback(
            input_key="logits", target_key="labels", metric_key="loss"
        ),
        "optimizer": dl.OptimizerCallback(metric_key="loss"),
        "scheduler": dl.SchedulerCallback(loader_key="valid", metric_key="loss", mode="batch"),
        "accuracy": dl.AccuracyCallback(input_key="logits", target_key="labels", topk_args=(1,)),
        "checkpoint": dl.CheckpointCallback(
            self._logdir,
            loader_key="valid",
            metric_key="accuracy",
            minimize=False,
            save_n_best=1,
        ),
        # "tqdm": dl.TqdmCallback(),
    }
def get_callbacks(self, stage: str):
    return {
        "optimizer": dl.OptimizerCallback(metric_key="loss"),
        "checkpoint": dl.CheckpointCallback(
            self._logdir,
            loader_key="valid",
            metric_key="loss",
            minimize=True,
            save_n_best=3,
        ),
    }
def load_weights(self, callbacks_list):
    """Load model weights and append a CheckpointCallback when doing stateful loading.

    The CheckpointCallback is not appended for 'model_only' loading because
    SupervisedRunner adds one by default.
    """
    ckpoint_params = self.cb_params["checkpoint_params"]
    # A checkpoint_path of None means "do not add a checkpoint callback".
    if ckpoint_params["checkpoint_path"] is not None:
        mode = ckpoint_params["mode"].lower()
        if mode == "full":
            print("Stateful loading...")
            ckpoint_p = Path(ckpoint_params["checkpoint_path"])
            fname = ckpoint_p.name
            # everything in the path besides the base file name
            resume_dir = str(ckpoint_p.parents[0])
            print(f"Loading {fname} from {resume_dir}.\n"
                  f"Checkpoints will also be saved in {resume_dir}.")
            # add the checkpoint callback that resumes from the given file
            ckpoint = [callbacks.CheckpointCallback(resume=fname, resume_dir=resume_dir)]
            callbacks_list = callbacks_list + ckpoint
        elif mode == "model_only":
            print("Loading weights into model...")
            self.model = load_weights_train(ckpoint_params["checkpoint_path"], self.model)
    return callbacks_list
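# A hedged sketch of the `checkpoint_params` config that load_weights() above expects.
# The keys and modes come from the function body; the example values and the
# `example_cb_params` name are purely illustrative, not taken from the original code.
example_cb_params = {
    "checkpoint_params": {
        # path to a saved checkpoint, or None to skip checkpoint loading entirely
        "checkpoint_path": "logs/checkpoints/best_full.pth",
        # "full" resumes training state via CheckpointCallback;
        # "model_only" loads just the model weights
        "mode": "full",
    }
}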
def train_experiment(device, engine=None):
    with TemporaryDirectory() as logdir:
        teacher = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))
        student = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))
        model = {"teacher": teacher, "student": student}
        criterion = {"cls": nn.CrossEntropyLoss(), "kl": nn.KLDivLoss(reduction="batchmean")}
        optimizer = optim.Adam(student.parameters(), lr=0.02)
        loaders = {
            "train": DataLoader(
                MNIST(os.getcwd(), train=True, download=True, transform=ToTensor()),
                batch_size=32,
            ),
            "valid": DataLoader(
                MNIST(os.getcwd(), train=False, download=True, transform=ToTensor()),
                batch_size=32,
            ),
        }

        runner = DistilRunner()
        # model training
        runner.train(
            engine=engine or dl.DeviceEngine(device),
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            loaders=loaders,
            num_epochs=1,
            logdir=logdir,
            verbose=False,
            callbacks=[
                dl.AccuracyCallback(
                    input_key="t_logits", target_key="targets", num_classes=2, prefix="teacher_"
                ),
                dl.AccuracyCallback(
                    input_key="s_logits", target_key="targets", num_classes=2, prefix="student_"
                ),
                dl.CriterionCallback(
                    input_key="s_logits",
                    target_key="targets",
                    metric_key="cls_loss",
                    criterion_key="cls",
                ),
                dl.CriterionCallback(
                    input_key="s_logprobs",
                    target_key="t_probs",
                    metric_key="kl_div_loss",
                    criterion_key="kl",
                ),
                dl.MetricAggregationCallback(
                    metric_key="loss", metrics=["kl_div_loss", "cls_loss"], mode="mean"
                ),
                dl.OptimizerCallback(metric_key="loss", model_key="student"),
                dl.CheckpointCallback(
                    logdir=logdir,
                    loader_key="valid",
                    metric_key="loss",
                    minimize=True,
                    save_n_best=3,
                ),
            ],
        )
def get_callbacks(self, stage: str):
    return {
        "criterion": dl.CriterionCallback(
            metric_key="loss", input_key="logits", target_key="targets"
        ),
        "optimizer": dl.OptimizerCallback(metric_key="loss"),
        # "scheduler": dl.SchedulerCallback(loader_key="valid", metric_key="loss"),
        "accuracy": dl.AccuracyCallback(
            input_key="logits", target_key="targets", topk_args=(1, 3, 5)
        ),
        "classification": dl.PrecisionRecallF1SupportCallback(
            input_key="logits", target_key="targets", num_classes=10
        ),
        "confusion_matrix": dl.ConfusionMatrixCallback(
            input_key="logits", target_key="targets", num_classes=10
        ),
        "checkpoint": dl.CheckpointCallback(
            self._logdir,
            loader_key="valid",
            metric_key="loss",
            minimize=True,
            save_n_best=3,
        ),
    }
def get_callbacks(self):
    return {
        "criterion": dl.CriterionCallback(
            metric_key="loss", input_key="logits", target_key="targets"
        ),
        "backward": dl.BackwardCallback(metric_key="loss"),
        "optimizer": dl.OptimizerCallback(metric_key="loss"),
        "scheduler": dl.SchedulerCallback(loader_key="valid", metric_key="loss"),
        "accuracy": dl.AccuracyCallback(
            input_key="logits", target_key="targets", topk=(1, 3, 5)
        ),
        "checkpoint": dl.CheckpointCallback(
            self._logdir,
            loader_key="valid",
            metric_key="accuracy01",
            minimize=False,
            topk=1,
        ),
        "tqdm": dl.TqdmCallback(),
    }
def get_callbacks(self, stage: str):
    callbacks = {
        "criterion": dl.CriterionCallback(
            metric_key="loss", input_key="logits", target_key="targets"
        ),
        "optimizer": dl.OptimizerCallback(
            metric_key="loss",
            grad_clip_fn=nn.utils.clip_grad_norm_,
            grad_clip_params={"max_norm": 1.0},
        ),
        # "scheduler": dl.SchedulerCallback(loader_key="valid", metric_key="loss"),
        "accuracy": dl.AccuracyCallback(
            input_key="logits", target_key="targets", topk_args=(1, 3, 5)
        ),
        "classification": dl.PrecisionRecallF1SupportCallback(
            input_key="logits", target_key="targets", num_classes=10
        ),
        "checkpoint": dl.CheckpointCallback(
            self._logdir,
            loader_key="valid",
            metric_key="loss",
            minimize=True,
            save_n_best=3,
        ),
    }
    if SETTINGS.ml_required:
        callbacks["confusion_matrix"] = dl.ConfusionMatrixCallback(
            input_key="logits", target_key="targets", num_classes=10
        )
    return callbacks
def test_resume_with_missing_file():
    old_stdout = sys.stdout
    sys.stdout = str_stdout = StringIO()

    # experiment setup
    logdir = "./logs/checkpoint_callback"
    checkpoint = logdir + "/checkpoints"
    logfile = checkpoint + "/_metrics.json"
    num_epochs = 5

    # data
    num_samples, num_features = int(1e4), int(1e1)
    X = torch.rand(num_samples, num_features)
    y = torch.randint(0, 5, size=[num_samples])
    dataset = TensorDataset(X, y)
    loader = DataLoader(dataset, batch_size=32, num_workers=1)
    loaders = {"train": loader, "valid": loader}

    # model, criterion, optimizer, scheduler
    model = torch.nn.Linear(num_features, 5)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())
    runner = dl.SupervisedRunner()

    with pytest.raises(FileNotFoundError):
        runner.train(
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            loaders=loaders,
            logdir=logdir,
            num_epochs=num_epochs,
            verbose=False,
            valid_loader="valid",
            valid_metric="loss",
            minimize_valid_metric=True,
            callbacks=[
                dl.CheckpointCallback(
                    logdir=logdir,
                    loader_key="valid",
                    metric_key="loss",
                    minimize=True,
                    save_n_best=2,
                    load_on_stage_end={"model": "best", "criterion": "best", "optimizer": "last"},
                    resume="not_existing_file.pth",
                ),
                dl.CheckRunCallback(num_epoch_steps=num_epochs),
            ],
        )

    sys.stdout = old_stdout
    exp_output = str_stdout.getvalue()

    shutil.rmtree(logdir, ignore_errors=True)
def test_load_best_on_stage_end():
    old_stdout = sys.stdout
    sys.stdout = str_stdout = StringIO()

    # experiment setup
    logdir = "./logs/checkpoint_callback"
    checkpoint = logdir + "/checkpoints"
    logfile = checkpoint + "/_metrics.json"

    # data
    num_samples, num_features = int(1e4), int(1e1)
    X = torch.rand(num_samples, num_features)
    y = torch.randint(0, 5, size=[num_samples])
    dataset = TensorDataset(X, y)
    loader = DataLoader(dataset, batch_size=32, num_workers=1)
    loaders = {"train": loader, "valid": loader}

    # model, criterion, optimizer, scheduler
    model = torch.nn.Linear(num_features, 5)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())
    runner = dl.SupervisedRunner()
    n_epochs = 5

    # first stage
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        loaders=loaders,
        logdir=logdir,
        num_epochs=n_epochs,
        verbose=False,
        callbacks=[
            dl.CheckpointCallback(save_n_best=2, load_on_stage_end="best"),
            dl.CheckRunCallback(num_epoch_steps=n_epochs),
        ],
    )

    sys.stdout = old_stdout
    exp_output = str_stdout.getvalue()

    assert len(re.findall(r"=> Loading", exp_output)) == 1
    assert len(re.findall(r"=> Loading .*best\.pth", exp_output)) == 1

    assert os.path.isfile(logfile)
    assert os.path.isfile(checkpoint + "/train.4.pth")
    assert os.path.isfile(checkpoint + "/train.4_full.pth")
    assert os.path.isfile(checkpoint + "/train.5.pth")
    assert os.path.isfile(checkpoint + "/train.5_full.pth")
    assert os.path.isfile(checkpoint + "/best.pth")
    assert os.path.isfile(checkpoint + "/best_full.pth")
    assert os.path.isfile(checkpoint + "/last.pth")
    assert os.path.isfile(checkpoint + "/last_full.pth")

    shutil.rmtree(logdir, ignore_errors=True)
def get_callbacks(self, stage: str):
    callbacks = {
        "scores": dl.BatchTransformCallback(
            input_key="logits",
            output_key="scores",
            transform=partial(torch.softmax, dim=1),
            scope="on_batch_end",
        ),
        "labels": dl.BatchTransformCallback(
            input_key="scores",
            output_key="labels",
            transform=partial(torch.argmax, dim=1),
            scope="on_batch_end",
        ),
        "criterion": dl.CriterionCallback(
            metric_key="loss", input_key="logits", target_key="targets"
        ),
        "optimizer": dl.OptimizerCallback(
            metric_key="loss",
            grad_clip_fn=nn.utils.clip_grad_norm_,
            grad_clip_params={"max_norm": 1.0},
        ),
        # "scheduler": dl.SchedulerCallback(loader_key="valid", metric_key="loss"),
        "accuracy": dl.AccuracyCallback(
            input_key="logits", target_key="targets", topk_args=(1, 3, 5)
        ),
        "classification": dl.PrecisionRecallF1SupportCallback(
            input_key="logits", target_key="targets", num_classes=10
        ),
        "checkpoint": dl.CheckpointCallback(
            self._logdir,
            loader_key="valid",
            metric_key="loss",
            minimize=True,
            save_n_best=3,
        ),
    }
    if SETTINGS.ml_required:
        callbacks["confusion_matrix"] = dl.ConfusionMatrixCallback(
            input_key="logits", target_key="targets", num_classes=10
        )
        callbacks["f1_score"] = dl.SklearnBatchCallback(
            keys={"y_pred": "labels", "y_true": "targets"},
            metric_fn="f1_score",
            metric_key="sk_f1",
            average="macro",
            zero_division=1,
        )
    return callbacks
def train_experiment(device):
    with TemporaryDirectory() as logdir:
        # sample data
        num_users, num_features, num_items = int(1e4), int(1e1), 10
        X = torch.rand(num_users, num_features)
        y = (torch.rand(num_users, num_items) > 0.5).to(torch.float32)

        # pytorch loaders
        dataset = TensorDataset(X, y)
        loader = DataLoader(dataset, batch_size=32, num_workers=1)
        loaders = {"train": loader, "valid": loader}

        # model, criterion, optimizer, scheduler
        model = torch.nn.Linear(num_features, num_items)
        criterion = torch.nn.BCEWithLogitsLoss()
        optimizer = torch.optim.Adam(model.parameters())
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [2])

        class CustomRunner(dl.Runner):
            def handle_batch(self, batch):
                x, y = batch
                logits = self.model(x)
                self.batch = {
                    "features": x,
                    "logits": logits,
                    "scores": torch.sigmoid(logits),
                    "targets": y,
                }

        # model training
        runner = CustomRunner()
        runner.train(
            engine=dl.DeviceEngine(device),
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            loaders=loaders,
            num_epochs=1,
            verbose=False,
            callbacks=[
                dl.CriterionCallback(input_key="logits", target_key="targets", metric_key="loss"),
                dl.AUCCallback(input_key="scores", target_key="targets"),
                dl.HitrateCallback(input_key="scores", target_key="targets", topk_args=(1, 3, 5)),
                dl.MRRCallback(input_key="scores", target_key="targets", topk_args=(1, 3, 5)),
                dl.MAPCallback(input_key="scores", target_key="targets", topk_args=(1, 3, 5)),
                dl.NDCGCallback(input_key="scores", target_key="targets", topk_args=(1, 3, 5)),
                dl.OptimizerCallback(metric_key="loss"),
                dl.SchedulerCallback(),
                dl.CheckpointCallback(
                    logdir=logdir, loader_key="valid", metric_key="map01", minimize=False
                ),
            ],
        )
def get_callbacks(self, stage: str) -> Dict[str, dl.Callback]:
    return {
        "criterion": dl.CriterionCallback(
            metric_key="loss", input_key="logits", target_key="targets"
        ),
        "optimizer": dl.OptimizerCallback(metric_key="loss"),
        # "scheduler": dl.SchedulerCallback(loader_key="valid", metric_key="loss"),
        "checkpoint": dl.CheckpointCallback(
            self._logdir,
            loader_key="valid",
            metric_key="loss",
            minimize=True,
            save_n_best=3,
        ),
        "check_freezed": CheckRequiresGrad("layer1", "train_freezed", False),
        "check_unfreezed": CheckRequiresGrad("layer1", "train_unfreezed", True),
    }
def train_experiment(device, engine=None):
    with TemporaryDirectory() as logdir:
        # sample data
        num_users, num_features, num_items = int(1e4), int(1e1), 10
        X = torch.rand(num_users, num_features)
        y = (torch.rand(num_users, num_items) > 0.5).to(torch.float32)

        # pytorch loaders
        dataset = TensorDataset(X, y)
        loader = DataLoader(dataset, batch_size=32, num_workers=1)
        loaders = {"train": loader, "valid": loader}

        # model, criterion, optimizer, scheduler
        model = torch.nn.Linear(num_features, num_items)
        criterion = torch.nn.BCEWithLogitsLoss()
        optimizer = torch.optim.Adam(model.parameters())
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [2])

        callbacks = [
            dl.CriterionCallback(input_key="logits", target_key="targets", metric_key="loss"),
            dl.AUCCallback(input_key="scores", target_key="targets"),
            dl.HitrateCallback(input_key="scores", target_key="targets", topk_args=(1, 3, 5)),
            dl.MRRCallback(input_key="scores", target_key="targets", topk_args=(1, 3, 5)),
            dl.MAPCallback(input_key="scores", target_key="targets", topk_args=(1, 3, 5)),
            dl.NDCGCallback(input_key="scores", target_key="targets", topk_args=(1, 3, 5)),
            dl.OptimizerCallback(metric_key="loss"),
            dl.SchedulerCallback(),
            dl.CheckpointCallback(
                logdir=logdir, loader_key="valid", metric_key="map01", minimize=False
            ),
        ]
        if engine is None or not isinstance(
            engine,
            (dl.AMPEngine, dl.DataParallelAMPEngine, dl.DistributedDataParallelAMPEngine),
        ):
            callbacks.append(dl.AUCCallback(input_key="logits", target_key="targets"))

        # model training
        runner = CustomRunner()
        runner.train(
            engine=engine or dl.DeviceEngine(device),
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            loaders=loaders,
            num_epochs=1,
            verbose=False,
            callbacks=callbacks,
        )
def get_callbacks(self):
    return {
        "backward": dl.BackwardCallback(metric_key="loss"),
        "optimizer": dl.OptimizerCallback(metric_key="loss"),
        "checkpoint": dl.CheckpointCallback(
            self._logdir,
            loader_key="valid",
            metric_key="loss",
            minimize=True,
            topk=3,
        ),
    }
def get_callbacks(self, stage: str):
    return {
        "criterion": dl.CriterionCallback(
            metric_key="loss", input_key="logits", target_key="targets"
        ),
        "optimizer": dl.OptimizerCallback(metric_key="loss"),
        "checkpoint": dl.CheckpointCallback(
            self._logdir,
            loader_key="valid",
            metric_key="loss",
            minimize=True,
            save_n_best=3,
            load_on_stage_start="best",
        ),
        "test_model_load": CheckModelStateLoadAfterStages("second", self._logdir, "best.pth"),
    }
def test_files_existence(tmpdir):
    logfile = tmpdir + "/model.storage.json"
    n_epochs = 5
    callbacks = [
        dl.CheckpointCallback(
            logdir=tmpdir,
            loader_key="valid",
            metric_key="loss",
            minimize=True,
            topk=2,
        ),
        dl.CheckRunCallback(num_epoch_steps=n_epochs),
    ]
    train_runner(tmpdir, n_epochs, callbacks)

    assert os.path.isfile(logfile)
    assert os.path.isfile(tmpdir + "/model.0004.pth")
    # assert os.path.isfile(tmpdir + "/train.4_full.pth")
    assert os.path.isfile(tmpdir + "/model.0005.pth")
    # assert os.path.isfile(tmpdir + "/train.5_full.pth")
    assert os.path.isfile(tmpdir + "/model.best.pth")
    # assert os.path.isfile(tmpdir + "/best_full.pth")
    assert os.path.isfile(tmpdir + "/model.last.pth")
def train_experiment(engine=None):
    with TemporaryDirectory() as logdir:
        # sample data
        num_users, num_features, num_items = int(1e4), int(1e1), 10
        X = torch.rand(num_users, num_features)
        y = (torch.rand(num_users, num_items) > 0.5).to(torch.float32)

        # pytorch loaders
        dataset = TensorDataset(X, y)
        loader = DataLoader(dataset, batch_size=32, num_workers=1)
        loaders = {"train": loader, "valid": loader}

        # model, criterion, optimizer, scheduler
        model = torch.nn.Linear(num_features, num_items)
        criterion = torch.nn.BCEWithLogitsLoss()
        optimizer = torch.optim.Adam(model.parameters())
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [2])

        callbacks = [
            dl.BatchTransformCallback(
                input_key="logits",
                output_key="scores",
                transform=torch.sigmoid,
                scope="on_batch_end",
            ),
            dl.CriterionCallback(input_key="logits", target_key="targets", metric_key="loss"),
            dl.HitrateCallback(input_key="scores", target_key="targets", topk=(1, 3, 5)),
            dl.MRRCallback(input_key="scores", target_key="targets", topk=(1, 3, 5)),
            dl.MAPCallback(input_key="scores", target_key="targets", topk=(1, 3, 5)),
            dl.NDCGCallback(input_key="scores", target_key="targets", topk=(1, 3)),
            dl.BackwardCallback(metric_key="loss"),
            dl.OptimizerCallback(metric_key="loss"),
            dl.SchedulerCallback(),
            dl.CheckpointCallback(
                logdir=logdir, loader_key="valid", metric_key="map01", minimize=False
            ),
        ]
        if isinstance(engine, dl.CPUEngine):
            callbacks.append(dl.AUCCallback(input_key="logits", target_key="targets"))

        # model training
        runner = dl.SupervisedRunner(
            input_key="features", output_key="logits", target_key="targets", loss_key="loss"
        )
        runner.train(
            engine=engine,
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            loaders=loaders,
            num_epochs=1,
            verbose=False,
            callbacks=callbacks,
        )
def train(dev_dir, logdir, device):
    train = pd.read_csv(f'{dev_dir}/train.csv', index_col=0)
    train['all_utils'] = train['cmd_cleaned'].apply(select_utils)
    train = train.loc[train.all_utils.apply(str.strip).apply(len) > 0]
    train['util'] = train['all_utils'].apply(lambda x: x.split()[0])
    train = train.dropna().reset_index(drop=True)

    spm.SentencePieceTrainer.train(
        input=f'{dev_dir}/text',
        model_prefix=f'{dev_dir}/txt_bpe_clf',
        model_type='bpe',
        vocab_size=config.src_vocab_size,
    )
    text_tokenizer = spm.SentencePieceProcessor(f'{dev_dir}/txt_bpe_clf.model')
    cmd_le = LabelEncoder()

    train['text_enc'] = train.text_cleaned.progress_apply(text_tokenizer.encode)
    train['y'] = cmd_le.fit_transform(train['util'].values)

    tdf = train[train.origin == 'original']
    tdf2 = train[train.origin != 'original']
    train, valid = train_test_split(tdf, test_size=500, random_state=SEED)
    train = pd.concat([train, tdf2]).reset_index(drop=True)

    train_ds = UtilDataset(train.text_enc, train.y, config, bos_id, eos_id, pad_id)
    valid_ds = UtilDataset(valid.text_enc, valid.y, config, bos_id, eos_id, pad_id)

    model = BertClassifier(config, pad_id, len(cmd_le.classes_))
    print('# params', sum(p.numel() for p in model.parameters() if p.requires_grad))

    loaders = {
        'train': data.DataLoader(train_ds, batch_size=config.batch_size, shuffle=True),
        'valid': data.DataLoader(valid_ds, batch_size=config.batch_size),
    }

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=config.optimizer_lr,
        weight_decay=config.weight_decay,
        amsgrad=True,
    )

    callbacks = [
        dl.CheckpointCallback(config.num_epochs),
        dl.AccuracyCallback(num_classes=len(cmd_le.classes_), topk_args=[1, 5]),
    ]

    if config.schedule == 'OneCycleLR':
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=config.optimizer_lr,
            epochs=config.num_epochs,
            steps_per_epoch=len(loaders['train']),
        )
        callbacks.append(dl.SchedulerCallback(mode="batch"))
    elif config.schedule == 'ReduceLROnPlateau':
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            factor=config.plateau_factor,
            patience=5,
            cooldown=3,
            threshold=1e-3,
            min_lr=1e-6,
        )
        callbacks.append(dl.SchedulerCallback(mode="epoch"))

    shutil.rmtree(logdir, ignore_errors=True)
    os.makedirs(logdir, exist_ok=True)

    runner = dl.SupervisedRunner(device=device)
    runner.train(
        model=model,
        loaders=loaders,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler if config.schedule else None,
        num_epochs=config.num_epochs,
        verbose=True,
        logdir=logdir,
        callbacks=callbacks,
    )
    joblib.dump(cmd_le, f'{dev_dir}/cmd_le')
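# A hedged sketch of the `config` object the classifier training script above assumes.
# The field names are exactly those referenced in train(); the concrete values and the
# `example_config` name are illustrative placeholders, not the original settings.
from types import SimpleNamespace

example_config = SimpleNamespace(
    src_vocab_size=8000,            # BPE vocabulary size for the text tokenizer
    batch_size=64,                  # DataLoader batch size
    optimizer_lr=1e-3,              # Adam learning rate
    weight_decay=1e-5,              # Adam weight decay
    num_epochs=20,                  # also passed positionally to dl.CheckpointCallback above
    schedule='ReduceLROnPlateau',   # 'OneCycleLR', 'ReduceLROnPlateau', or None for no scheduler
    plateau_factor=0.5,             # LR decay factor used by ReduceLROnPlateau
)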
def train_experiment(device):
    with TemporaryDirectory() as logdir:
        # sample data
        num_samples, num_features, num_classes1, num_classes2 = int(1e4), int(1e1), 4, 10
        X = torch.rand(num_samples, num_features)
        y1 = (torch.rand(num_samples) * num_classes1).to(torch.int64)
        y2 = (torch.rand(num_samples) * num_classes2).to(torch.int64)

        # pytorch loaders
        dataset = TensorDataset(X, y1, y2)
        loader = DataLoader(dataset, batch_size=32, num_workers=1)
        loaders = {"train": loader, "valid": loader}

        class CustomModule(nn.Module):
            def __init__(self, in_features: int, out_features1: int, out_features2: int):
                super().__init__()
                self.shared = nn.Linear(in_features, 128)
                self.head1 = nn.Linear(128, out_features1)
                self.head2 = nn.Linear(128, out_features2)

            def forward(self, x):
                x = self.shared(x)
                y1 = self.head1(x)
                y2 = self.head2(x)
                return y1, y2

        # model, criterion, optimizer, scheduler
        model = CustomModule(num_features, num_classes1, num_classes2)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters())
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer, [2])

        class CustomRunner(dl.Runner):
            def handle_batch(self, batch):
                x, y1, y2 = batch
                y1_hat, y2_hat = self.model(x)
                self.batch = {
                    "features": x,
                    "logits1": y1_hat,
                    "logits2": y2_hat,
                    "targets1": y1,
                    "targets2": y2,
                }

        # model training
        runner = CustomRunner()
        runner.train(
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            loaders=loaders,
            num_epochs=1,
            verbose=False,
            callbacks=[
                dl.CriterionCallback(metric_key="loss1", input_key="logits1", target_key="targets1"),
                dl.CriterionCallback(metric_key="loss2", input_key="logits2", target_key="targets2"),
                dl.MetricAggregationCallback(prefix="loss", metrics=["loss1", "loss2"], mode="mean"),
                dl.OptimizerCallback(metric_key="loss"),
                dl.SchedulerCallback(),
                dl.AccuracyCallback(
                    input_key="logits1",
                    target_key="targets1",
                    num_classes=num_classes1,
                    prefix="one_",
                ),
                dl.AccuracyCallback(
                    input_key="logits2",
                    target_key="targets2",
                    num_classes=num_classes2,
                    prefix="two_",
                ),
                # catalyst[ml] required
                dl.ConfusionMatrixCallback(
                    input_key="logits1",
                    target_key="targets1",
                    num_classes=num_classes1,
                    prefix="one_cm",
                ),
                # catalyst[ml] required
                dl.ConfusionMatrixCallback(
                    input_key="logits2",
                    target_key="targets2",
                    num_classes=num_classes2,
                    prefix="two_cm",
                ),
                dl.CheckpointCallback(
                    "./logs/one",
                    loader_key="valid",
                    metric_key="one_accuracy",
                    minimize=False,
                    save_n_best=1,
                ),
                dl.CheckpointCallback(
                    "./logs/two",
                    loader_key="valid",
                    metric_key="two_accuracy03",
                    minimize=False,
                    save_n_best=3,
                ),
            ],
            loggers={
                "console": dl.ConsoleLogger(),
                "tb": dl.TensorboardLogger("./logs/tb"),
            },
        )
def train_experiment(engine=None):
    with TemporaryDirectory() as logdir:
        # sample data
        num_samples, num_features, num_classes1, num_classes2 = int(1e4), int(1e1), 4, 10
        X = torch.rand(num_samples, num_features)
        y1 = (torch.rand(num_samples) * num_classes1).to(torch.int64)
        y2 = (torch.rand(num_samples) * num_classes2).to(torch.int64)

        # pytorch loaders
        dataset = TensorDataset(X, y1, y2)
        loader = DataLoader(dataset, batch_size=32, num_workers=1)
        loaders = {"train": loader, "valid": loader}

        # model, criterion, optimizer, scheduler
        model = CustomModule(num_features, num_classes1, num_classes2)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters())
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer, [2])

        callbacks = [
            dl.CriterionCallback(metric_key="loss1", input_key="logits1", target_key="targets1"),
            dl.CriterionCallback(metric_key="loss2", input_key="logits2", target_key="targets2"),
            dl.MetricAggregationCallback(metric_key="loss", metrics=["loss1", "loss2"], mode="mean"),
            dl.BackwardCallback(metric_key="loss"),
            dl.OptimizerCallback(metric_key="loss"),
            dl.SchedulerCallback(),
            dl.AccuracyCallback(
                input_key="logits1",
                target_key="targets1",
                num_classes=num_classes1,
                prefix="one_",
            ),
            dl.AccuracyCallback(
                input_key="logits2",
                target_key="targets2",
                num_classes=num_classes2,
                prefix="two_",
            ),
            dl.CheckpointCallback(
                "./logs/one",
                loader_key="valid",
                metric_key="one_accuracy01",
                minimize=False,
                topk=1,
            ),
            dl.CheckpointCallback(
                "./logs/two",
                loader_key="valid",
                metric_key="two_accuracy03",
                minimize=False,
                topk=3,
            ),
        ]
        if SETTINGS.ml_required:
            # catalyst[ml] required
            callbacks.append(
                dl.ConfusionMatrixCallback(
                    input_key="logits1",
                    target_key="targets1",
                    num_classes=num_classes1,
                    prefix="one_cm",
                )
            )
            # catalyst[ml] required
            callbacks.append(
                dl.ConfusionMatrixCallback(
                    input_key="logits2",
                    target_key="targets2",
                    num_classes=num_classes2,
                    prefix="two_cm",
                )
            )

        # model training
        runner = CustomRunner()
        runner.train(
            engine=engine,
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            loaders=loaders,
            num_epochs=1,
            verbose=False,
            callbacks=callbacks,
            loggers={
                "console": dl.ConsoleLogger(),
                "tb": dl.TensorboardLogger("./logs/tb"),
            },
        )
def train_experiment(device):
    with TemporaryDirectory() as logdir:
        teacher = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))
        student = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))
        criterion = {"cls": nn.CrossEntropyLoss(), "kl": nn.KLDivLoss(reduction="batchmean")}
        optimizer = optim.Adam(student.parameters(), lr=0.02)
        loaders = {
            "train": DataLoader(
                MNIST(os.getcwd(), train=True, download=True, transform=ToTensor()),
                batch_size=32,
            ),
            "valid": DataLoader(
                MNIST(os.getcwd(), train=False, download=True, transform=ToTensor()),
                batch_size=32,
            ),
        }

        class DistilRunner(dl.Runner):
            def handle_batch(self, batch):
                x, y = batch

                teacher.eval()  # let's manually set the teacher model to eval mode
                with torch.no_grad():
                    t_logits = self.model["teacher"](x)

                s_logits = self.model["student"](x)
                self.batch = {
                    "t_logits": t_logits,
                    "s_logits": s_logits,
                    "targets": y,
                    "s_logprobs": F.log_softmax(s_logits, dim=-1),
                    "t_probs": F.softmax(t_logits, dim=-1),
                }

        # model training
        runner = DistilRunner()
        runner.train(
            engine=dl.DeviceEngine(device),
            model={"teacher": teacher, "student": student},
            criterion=criterion,
            optimizer=optimizer,
            loaders=loaders,
            num_epochs=1,
            logdir=logdir,
            verbose=True,
            callbacks=[
                dl.AccuracyCallback(
                    input_key="t_logits", target_key="targets", num_classes=2, prefix="teacher_"
                ),
                dl.AccuracyCallback(
                    input_key="s_logits", target_key="targets", num_classes=2, prefix="student_"
                ),
                dl.CriterionCallback(
                    input_key="s_logits",
                    target_key="targets",
                    metric_key="cls_loss",
                    criterion_key="cls",
                ),
                dl.CriterionCallback(
                    input_key="s_logprobs",
                    target_key="t_probs",
                    metric_key="kl_div_loss",
                    criterion_key="kl",
                ),
                dl.MetricAggregationCallback(
                    prefix="loss", metrics=["kl_div_loss", "cls_loss"], mode="mean"
                ),
                dl.OptimizerCallback(metric_key="loss", model_key="student"),
                dl.CheckpointCallback(
                    logdir=logdir,
                    loader_key="valid",
                    metric_key="loss",
                    minimize=True,
                    save_n_best=3,
                ),
            ],
        )
def train(dev_dir, logdir, device):
    if not config.joined_vocab:
        spm.SentencePieceTrainer.train(
            input=f'{dev_dir}/text',
            model_prefix=f'{dev_dir}/txt_bpe_ctx',
            model_type='bpe',
            vocab_size=config.src_vocab_size,
        )
        spm.SentencePieceTrainer.train(
            input=f'{dev_dir}/cmd',
            model_prefix=f'{dev_dir}/cmd_bpe_ctx',
            model_type='bpe',
            vocab_size=config.tgt_vocab_size,
        )
        text_tokenizer = spm.SentencePieceProcessor(f'{dev_dir}/txt_bpe_ctx.model')
        cmd_tokenizer = spm.SentencePieceProcessor(f'{dev_dir}/cmd_bpe_ctx.model')
    else:
        spm.SentencePieceTrainer.train(
            input=f'{dev_dir}/all',
            model_prefix=f'{dev_dir}/all_bpe_ctx',
            model_type='bpe',
            vocab_size=config.src_vocab_size,
        )
        text_tokenizer = spm.SentencePieceProcessor(f'{dev_dir}/all_bpe_ctx.model')
        cmd_tokenizer = text_tokenizer

    train = pd.read_csv(f'{dev_dir}/train.csv', index_col=0)
    train = train.dropna()
    train['cmd_cleaned'] = train['cmd_cleaned'].apply(lambda cmd: cmd.replace('|', ' |'))
    train['util'] = train.cmd_cleaned.apply(lambda x: x.strip(' $()').split()[0])
    train = train[train.util != ']']
    train = train.reset_index(drop=True)

    mandf = pd.read_csv(f'{dev_dir}/man.csv', index_col=0)
    mandf['ctx'] = mandf.apply(make_ctx, axis=1)
    mandf = mandf.drop_duplicates(subset=('cmd'))
    mandf = mandf.set_index('cmd')

    train['ctx'] = train['util'].map(mandf.ctx)
    train.text_cleaned = train.text_cleaned + ' ' + train.ctx.fillna('')

    train['text_enc'] = train.text_cleaned.progress_apply(text_tokenizer.encode)
    train['cmd_enc'] = train.cmd_cleaned.progress_apply(cmd_tokenizer.encode)

    tdf = train[train.origin == 'original']
    tdf2 = train[train.origin != 'original']
    train, valid = train_test_split(tdf, test_size=500, random_state=SEED)
    train = pd.concat([train, tdf2]).reset_index(drop=True)

    train_ds = MtDataset(train.text_enc, train.cmd_enc, config, bos_id, eos_id, pad_id)
    valid_ds = MtDataset(valid.text_enc, valid.cmd_enc, config, bos_id, eos_id, pad_id)

    model = Transformer(config, pad_id)
    print('# params', sum(p.numel() for p in model.parameters() if p.requires_grad))

    loaders = {
        'train': data.DataLoader(train_ds, batch_size=config.batch_size, shuffle=True),
        'valid': data.DataLoader(valid_ds, batch_size=config.batch_size),
    }

    criterion = nn.CrossEntropyLoss(ignore_index=pad_id)
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=config.optimizer_lr,
        weight_decay=config.weight_decay,
        amsgrad=True,
    )

    callbacks = [
        dl.CheckpointCallback(config.num_epochs),
        dl.SchedulerCallback(mode="epoch"),
    ]
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        factor=config.plateau_factor,
        patience=3,
        cooldown=2,
        threshold=1e-3,
        min_lr=1e-6,
    )

    shutil.rmtree(logdir, ignore_errors=True)
    os.makedirs(logdir, exist_ok=True)

    runner = dl.SupervisedRunner(device=device)
    runner.train(
        model=model,
        loaders=loaders,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler if config.schedule else None,
        num_epochs=config.num_epochs,
        verbose=True,
        logdir=logdir,
        callbacks=callbacks,
        # check=True
    )