def train_tune(hparams, rdm):
    """Ray Tune trainable: fit a model on the given datamodule and report validation metrics.

    Parameters
    ----------
    hparams : dict
        Hyperparameters; must contain 'n_epochs'.
    rdm :
        Lightning datamodule passed to trainer.fit.
    """
    model = get_model(hparams)
    # Log into the per-trial directory so TensorBoard groups runs by trial.
    logger = TensorBoardLogger(save_dir=tune.get_trial_dir(),
                               name="",
                               version=".",
                               default_hp_metric=False)
    # Seed the hparams/metrics view with zeros so the metrics appear in
    # TensorBoard even before the first validation pass.
    logger.log_hyperparams(
        hparams, {
            'train_acc': 0,
            'train_f1': 0,
            'train_loss': 0,
            'valid_acc': 0,
            'valid_f1': 0,
            'valid_loss': 0,
        })
    trainer = pl.Trainer(max_epochs=hparams['n_epochs'],
                         gpus=1,
                         logger=logger,
                         progress_bar_refresh_rate=0,
                         callbacks=[
                             TuneReportCallback(
                                 ['valid_acc', 'valid_f1', 'valid_loss'],
                                 on="validation_end")
                         ])
    trainer.fit(model, rdm)
def train_model(config=None):
    """Ray Tune trainable: build a MoE-based MotionGenerationModel and fit it.

    Relies on module-level globals: MAX_EPOCHS, model_name, phase_dim,
    pose_dim, cost_dim, train_set, val_set, test_set.
    """
    trainer = pl.Trainer(
        default_root_dir="/home/nuoc/Documents/MEX/src/version_0.2/checkpoints",
        gpus=1,
        precision=16,
        min_epochs=20,
        max_epochs=MAX_EPOCHS,
        # Reports avg_val_loss back to Ray Tune as "loss" after each validation.
        callbacks=[TuneReportCallback({"loss": "avg_val_loss", }, on="validation_end")],
        logger=TensorBoardLogger(save_dir="logs/", name=model_name, version="0.0"),
        stochastic_weight_avg=True
    )
    # Generator input = latent k + cost embedding; output = phase + k + cost.
    feature_dims2 = {
        "phase_dim": phase_dim,
        "pose_dim": pose_dim,
        "cost_dim": cost_dim,
        "g_input_dim": config["k"] + config["cost_hidden_dim"],
        "g_output_dim": phase_dim + config["k"] + cost_dim
    }
    in_slice = [phase_dim, pose_dim, cost_dim]
    out_slice = [phase_dim, config["k"], cost_dim]
    pose_encoder = MLP(config=config, dimensions=[pose_dim])
    model = MotionGenerationModel(config=config,
                                  Model=MoE,
                                  pose_autoencoder=pose_encoder,
                                  feature_dims=feature_dims2,
                                  input_slicers=in_slice,
                                  output_slicers=out_slice,
                                  train_set=train_set,
                                  val_set=val_set,
                                  test_set=test_set,
                                  name=model_name
                                  )
    trainer.fit(model)
def train_model(config, gpus, w2v, num_epochs=10):
    """Train a Wav2VecKWS keyword-spotting model, reporting accuracy to Ray Tune.

    Parameters
    ----------
    config : dict
        Hyperparameters; converted to an argparse.Namespace for the model.
    gpus : int or list
        Passed straight through to pl.Trainer(gpus=...).
    w2v :
        Pretrained wav2vec backbone handed to Wav2VecKWS.
    num_epochs : int
        Maximum number of training epochs.
    """
    # BUG FIX: 'val_Accuracy' is a metric to maximize. The original used
    # mode="min" for both early stopping and checkpointing, which stops as
    # soon as accuracy *improves* and keeps the worst checkpoint.
    early_stop_callback = EarlyStopping(monitor="val_Accuracy",
                                        min_delta=0.0,
                                        patience=5,
                                        verbose=True,
                                        mode="max")
    checkpoint_callback = ModelCheckpoint(
        "models/wav2vec_kws/",
        save_top_k=1,
        verbose=True,
        monitor='val_Accuracy',
        mode='max',
    )
    tune_callback = TuneReportCallback({"acc": "val_Accuracy"}, on="validation_end")
    logger = TensorBoardLogger("tb_logs", name="wav2vec_kws_tune")
    mlf_logger = MLFlowLogger(experiment_name="wav2vec_kws",
                              tracking_uri="http://192.168.0.32")
    # Auto-log params/metrics/artifacts to MLflow alongside TensorBoard.
    mlflow.pytorch.autolog()
    trainer = pl.Trainer(
        gpus=gpus,
        callbacks=[checkpoint_callback, early_stop_callback, tune_callback],
        logger=[logger, mlf_logger],
        accumulate_grad_batches=4,
        amp_level="O0",
        max_epochs=num_epochs,
        progress_bar_refresh_rate=1,
        log_every_n_steps=1,
        flush_logs_every_n_steps=1)
    config = argparse.Namespace(**config)
    model = Wav2VecKWS(config, w2v)
    trainer.fit(model)
def myTrain(config, num_epochs, sym01, sym02, period):
    """Train an LSTMRegressor with Tune-supplied hyperparameters.

    Fits on MyDataModule(sym01, sym02, period), tests, saves a checkpoint
    named '<sym01>-lstm.ckpt', and returns (model, test val_loss).
    """
    p = dict(
        seq_len=config['seq_len'],
        batch_size=config['batch_size'],
        criterion=nn.MSELoss(),
        max_epochs=num_epochs,
        n_features=3,
        hidden_size=config['hidden_size'],
        num_layers=config['num_layers'],
        dropout=config['dropout'],
        learning_rate=config['lr']
    )
    print("myTrain parameters:", sym01, sym02, period)
    seed_everything(1)
    # BUG FIX: a trailing comma made csv_logger a 1-tuple, so pl.Trainer
    # received (CSVLogger,) instead of the logger itself.
    csv_logger = CSVLogger('./', name='lstm', version='0')
    metrics = {"loss": "ptl/val_loss"}
    trainer = Trainer(
        max_epochs=p['max_epochs'],
        logger=csv_logger,
        callbacks=[TuneReportCallback(metrics, on="validation_end")]
    )
    model = LSTMRegressor(
        n_features=p['n_features'],
        hidden_size=p['hidden_size'],
        seq_len=p['seq_len'],
        batch_size=p['batch_size'],
        criterion=p['criterion'],
        num_layers=p['num_layers'],
        dropout=p['dropout'],
        learning_rate=p['learning_rate']
    )
    dm = MyDataModule(sym01=sym01, sym02=sym02, period=period,
                      seq_len=p['seq_len'], batch_size=p['batch_size'])
    dm.reset(sym01=sym01, sym02=sym02, period=period,
             seq_len=p['seq_len'], batch_size=p['batch_size'])
    dm.setup('test')
    trainer.fit(model, dm)
    testresult = trainer.test(model, datamodule=dm)
    trainer.save_checkpoint(sym01 + "-lstm.ckpt")
    # FIX: the original printed testresult twice; once is enough.
    print(testresult)
    testresult = testresult[0]
    print(testresult['val_loss'])
    return model, testresult['val_loss']
def tune_main(hparams, num_epochs=15, num_gpus=0):
    """Ray Tune trainable: train a LightningModel on CIFAR-10, reporting val loss/accuracy."""
    print(hparams)
    mean, std, traindir, valdir, num_classes = choose_dataset('cifar10')
    # NOTE(review): the directories returned by choose_dataset are immediately
    # overridden with hard-coded paths — confirm this is intentional.
    traindir = '/home/jovyan/work/cv_data/cifar10/train'
    valdir = '/home/jovyan/work/cv_data/cifar10/test'
    hparams['num_classes'] = num_classes
    train_logger.info('Training Directory: {0}'.format(traindir))
    model = LightningModel(hparams)
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=num_gpus,
        #distributed_backend=hparams.distributed_backend,
        precision=32,
        #early_stop_callback=early_stop_callback,
        # Log into the per-trial directory so each trial keeps its own events.
        logger=TensorBoardLogger(
            save_dir=tune.get_trial_dir(), name="", version="."),
        progress_bar_refresh_rate=0,
        callbacks=[
            TuneReportCallback(
                {
                    "loss": "val_loss_epoch",
                    "accuracy": "val_acc_epoch"
                },
                on="validation_end"
            )
        ])
    normal_pipe = BasicPipe(hparams, traindir, valdir, mean, std)
    trainer.fit(model, normal_pipe)
def tunerun(config):
    """Ray Tune trainable: build (or load cached) project data, train ProjModel, report avg_acc."""
    RATIO = config['ratio']
    DAT_RAT, DAT_TOT = config['thresholds']
    TRAIN_BATCH_SIZE = config['train_batch_size']
    VAL_BATCH_SIZE = config['val_batch_size']
    TEST_BATCH_SIZE = config['test_batch_size']
    LEARNING_RATE = config['lr']
    MAX_EPOCHS = 30
    WEIGHT_DECAY = config['decay']
    #DATA_PATH = "/bigtemp/rm5tx/nlp_project/data_cache/"
    DATA_PATH = os.path.expanduser("~/data_cache/")
    MAX_LEN = config['max_len']
    ADJACENT = True
    data = ProjData(max_len=MAX_LEN, ratio=RATIO, adjacent=ADJACENT,
                    adjrat=DAT_RAT, adjtot=DAT_TOT)
    if ADJACENT:
        MODEL_NAME = 'nlp_proj_adjacent' + str(MAX_LEN)
    else:
        MODEL_NAME = 'nlp_proj_norm' + str(MAX_LEN)
    # Prefer a cached, preprocessed dataset; rebuild and cache it otherwise.
    try:
        data.load(DATA_PATH)
        print("Loaded Saved Data")
    except Exception as e:
        print(e)
        data.setup()
        data.save(DATA_PATH)
    model = ProjModel(learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    # BUG FIX: the callback was bound to a local named `tune`, shadowing the
    # `ray.tune` module for the rest of the function body.
    tune_callback = TuneReportCallback({"avg_acc": "avg_acc"}, on="validation_end")
    trainer = pl.Trainer(
        progress_bar_refresh_rate=100,
        accelerator='ddp',  # jupyter can't use ddp, use dp instead
        # effective batch size is batch_size * num_gpus * num_nodes
        gpus=1,
        gradient_clip_val=1.0,
        max_epochs=MAX_EPOCHS,
        fast_dev_run=False,
        callbacks=[tune_callback])
    trainer.fit(model,
                data.train_dataloader(batch_size=TRAIN_BATCH_SIZE),
                data.val_dataloader(batch_size=VAL_BATCH_SIZE))
def train(config):
    """Smoke-test trainable: fit a mock module for one epoch and report avg_val_loss to Tune."""
    module = _MockModule(10, 20)
    trainer = pl.Trainer(
        max_epochs=1,
        callbacks=[
            TuneReportCallback(
                {"tune_loss": "avg_val_loss"}, on="validation_end")
        ])
    trainer.fit(module)
def train_mnist_tune(config, data_dir=None, num_epochs=10, num_gpus=0):
    """Tune trainable for MNIST: fit LightningMNISTClassifier, reporting loss/accuracy."""
    model = LightningMNISTClassifier(config, data_dir)
    dm = MNISTDataModule(
        data_dir=data_dir, num_workers=1, batch_size=config["batch_size"])
    metrics = {"loss": "ptl/val_loss", "acc": "ptl/val_accuracy"}
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=num_gpus,
        progress_bar_refresh_rate=0,
        callbacks=[TuneReportCallback(metrics, on="validation_end")])
    trainer.fit(model, dm)
def train_TFT_tune(config, num_epochs=num_epochs):
    """Tune trainable for a Temporal Fusion Transformer built from `config`.

    Uses module-level globals `df`, `freq` and default `num_epochs`.
    """
    m = TFT(**config, epochs=num_epochs)
    train_loader, val_loader, model = m._hyperparameter_optimization(df, freq)
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        progress_bar_refresh_rate=0,
        num_sanity_val_steps=0,
        # FIX: `callbacks` expects a list of callbacks; the original passed a
        # bare callback, which breaks on PL versions that validate the argument.
        callbacks=[TuneReportCallback({"loss": "val_loss"}, on="validation_end")],
        checkpoint_callback=False,
        logger=False,
    )
    trainer.fit(model, train_dataloader=train_loader, val_dataloaders=val_loader)
def train_model(config=None):
    """Ray Tune trainable: build a VAE over pose features and fit it.

    Relies on module-level globals: MAX_EPOCHS, model_name, pose_dim,
    train_set, val_set, test_set.
    """
    trainer = pl.Trainer(
        default_root_dir="/home/nuoc/Documents/MEX/src/version_0.2/checkpoints",
        gpus=1,
        precision=16,
        min_epochs=20,
        max_epochs=MAX_EPOCHS,
        # Reports avg_val_loss back to Ray Tune as "loss" after each validation.
        callbacks=[TuneReportCallback({"loss": "avg_val_loss", }, on="validation_end")],
        logger=TensorBoardLogger(save_dir="logs/", name=model_name, version="0.0"),
        stochastic_weight_avg=True
    )
    model = VAE(config=config,
                input_dims=[pose_dim],
                name=model_name,
                train_set=train_set,
                val_set=val_set,
                test_set=test_set)
    trainer.fit(model)
def train_mnist_tune(config, num_epochs=10, num_gpus=0):
    """Tune trainable for MNIST; serializes dataset setup across concurrent trials."""
    data_dir = os.path.abspath("./data")
    model = LightningMNISTClassifier(config, data_dir)
    # FileLock: several trials may try to download MNIST into the same
    # directory at once; serialize datamodule construction.
    with FileLock(os.path.expanduser("~/.data.lock")):
        dm = MNISTDataModule(
            data_dir=data_dir, num_workers=1, batch_size=config["batch_size"])
    metrics = {"loss": "ptl/val_loss", "acc": "ptl/val_accuracy"}
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        # If fractional GPUs passed in, convert to int.
        gpus=math.ceil(num_gpus),
        progress_bar_refresh_rate=0,
        callbacks=[TuneReportCallback(metrics, on="validation_end")])
    trainer.fit(model, dm)
def train_mnist_tune(config, data_dir=None, num_epochs=10, num_gpus=0):
    """Tune trainable for MNIST: log into the trial dir and report loss/accuracy."""
    model = LightningMNISTClassifier(config, data_dir)
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=num_gpus,
        # Per-trial TensorBoard directory.
        logger=TensorBoardLogger(
            save_dir=tune.get_trial_dir(), name="", version="."),
        progress_bar_refresh_rate=0,
        callbacks=[
            TuneReportCallback(
                {
                    "loss": "ptl/val_loss",
                    "mean_accuracy": "ptl/val_accuracy"
                },
                on="validation_end")
        ])
    trainer.fit(model)
def fit(model=None, model_name="model", num_epochs=300, num_gpus=1):
    """Fit an already-constructed Lightning model, reporting avg_val_loss to Tune.

    Early-stops on avg_val_loss and logs under logs/<model_name>.
    """
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=num_gpus,
        logger=TensorBoardLogger(save_dir="logs/", name=model_name, version="0.0"),
        progress_bar_refresh_rate=20,
        callbacks=[
            TuneReportCallback({
                "loss": "avg_val_loss",
            }, on="validation_end"),
            EarlyStopping(monitor="avg_val_loss")
        ],
        precision=16,
    )
    trainer.fit(model)
def tuning(config=None, MODEL=None, pose_autoencoder=None, cost_dim=None,
           phase_dim=None, input_slices=None, output_slices=None,
           train_set=None, val_set=None, num_epochs=300, model_name="model"):
    """Tune trainable: instantiate the given MODEL class with the supplied
    components and fit it, reporting avg_val_loss with early stopping."""
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=1,
        logger=TensorBoardLogger(save_dir="logs/", name=model_name, version="0.0"),
        progress_bar_refresh_rate=5,
        callbacks=[
            TuneReportCallback({"loss": "avg_val_loss", }, on="validation_end"),
            EarlyStopping(monitor="avg_val_loss")
        ],
    )
    model = MODEL(config=config,
                  pose_autoencoder=pose_autoencoder,
                  cost_input_dimension=cost_dim,
                  phase_dim=phase_dim,
                  input_slicers=input_slices,
                  output_slicers=output_slices,
                  train_set=train_set,
                  val_set=val_set,
                  name=model_name)
    trainer.fit(model)
def train_model_tune(config, data_dir=None, num_epochs=10, num_gpus=1):
    """Tune trainable: train a CNN on MyImageModule, reporting val loss/accuracy."""
    model = CNN(config)
    # NOTE(review): data_dir is accepted but unused — the dataset comes from
    # MyImageModule with hard-coded size/batch; confirm this is intentional.
    image_module = MyImageModule(dataset_size=100, batch_size=32)
    image_module.setup()
    logger = TensorBoardLogger('tb_logs', name='Model_prueba_tuning')
    trainer = pl.Trainer(max_epochs=num_epochs,
                         gpus=num_gpus,
                         logger=logger,
                         deterministic=True,
                         callbacks=[
                             TuneReportCallback(
                                 {
                                     "loss": "ptl/val_loss",
                                     "mean_accuracy": "ptl/val_accuracy"
                                 },
                                 on="validation_end")
                         ])
    trainer.fit(model, datamodule=image_module)
def train_mnist_tune(config, num_epochs=10, num_gpus=0, data_dir="~/data"):
    """Tune trainable for MNIST (newer PL API: enable_progress_bar)."""
    data_dir = os.path.expanduser(data_dir)
    model = LightningMNISTClassifier(config, data_dir)
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        # If fractional GPUs passed in, convert to int.
        gpus=math.ceil(num_gpus),
        logger=TensorBoardLogger(save_dir=os.getcwd(), name="", version="."),
        enable_progress_bar=False,
        callbacks=[
            TuneReportCallback(
                {
                    "loss": "ptl/val_loss",
                    "mean_accuracy": "ptl/val_accuracy"
                },
                on="validation_end")
        ])
    trainer.fit(model)
def _default_trainable(config, checkpoint_dir=None):
    """A default trainable function used by `tune.run`

    It performs the most straight forward training loop with the provided `config`:
    - Create the pipeline (optionally with a provided vocab)
    - Set up a TuneMetrics logger that reports all metrics back to ray tune after each epoch
    - Execute the training
    """
    if config["silence"]:
        logging.getLogger("biome.text").setLevel(logging.ERROR)
    pipeline = Pipeline.from_config(config["pipeline_config"])
    trainer_config = TrainerConfiguration(**config["trainer_config"])
    vocab_config = config["vocab_config"]
    if vocab_config:
        vocab_config = VocabularyConfiguration(**vocab_config)
    # Ensure exactly one TuneReportCallback ends up on the trainer config.
    callbacks = trainer_config.callbacks
    if not isinstance(callbacks, list):
        # NOTE(review): if callbacks is None this becomes [None]; the any()
        # below then finds no TuneReportCallback, which is harmless here.
        callbacks = [callbacks]
    if not any(
        [isinstance(callback, TuneReportCallback) for callback in callbacks]
    ):
        tune_callback = TuneReportCallback(metrics=config["metrics"])
        if trainer_config.callbacks is None:
            # NOTE(review): assigns a single callback (not a list), unlike the
            # branch below — confirm TrainerConfiguration accepts both forms.
            trainer_config.callbacks = tune_callback
        else:
            trainer_config.callbacks = callbacks + [tune_callback]
    train_ds = Dataset.load_from_disk(config["train_dataset_path"])
    valid_ds = Dataset.load_from_disk(config["valid_dataset_path"])
    train_instances = train_ds.to_instances(pipeline=pipeline, disable_tqdm=True)
    valid_instances = valid_ds.to_instances(pipeline=pipeline, disable_tqdm=True)
    trainer = Trainer(
        pipeline=pipeline,
        train_dataset=train_instances,
        valid_dataset=valid_instances,
        trainer_config=trainer_config,
        vocab_config=vocab_config,
    )
    trainer.fit()
def train_mnist_tune(tuning_config, data_dir=None, num_epochs=10, num_gpus=0):
    """Tune trainable for MNIST (training only) with checkpointing and early stopping."""
    model = LightningMNISTClassifier(tuning_config, data_dir)
    # ===============================================================================
    # Callback
    # ===============================================================================
    from pytorch_lightning.callbacks import ModelCheckpoint
    from pytorch_lightning.callbacks import EarlyStopping
    early_stop_cb = EarlyStopping(monitor='ptl/val_loss', patience=5, verbose=True, mode='min')
    # Checkpoints go into the Ray trial directory so each trial keeps its own.
    ckpt_cb = ModelCheckpoint(tune.get_trial_dir() + '/checkpoints',
                              save_top_k=5,
                              verbose=True,
                              monitor='ptl/val_loss',
                              mode='min',
                              save_last=True,
                              filename='model_{epoch:03d}-{step}')
    tune_rp_cb = TuneReportCallback(
        {
            "val_loss": "ptl/val_loss",
            "val_accuracy": "ptl/val_accuracy"
        }, on="validation_end")
    # ===============================================================================
    # Trainer
    # Note: Must set logger as default with
    # ===============================================================================
    trainer = pl.Trainer(
        progress_bar_refresh_rate=0,  # 0 means no print progress
        max_epochs=num_epochs,
        # If fractional GPUs passed in, convert to int.
        gpus=math.ceil(num_gpus),
        callbacks=[ckpt_cb, tune_rp_cb, early_stop_cb])
    trainer.logger._default_hp_metric = False  # hp_metric must be False
    trainer.fit(model)
def tune_train(tune_config: dict, base_arg): """ Wrapper for ray.tune to adjust necessary params """ # note tune config arg = deepcopy(base_arg) arg.learning_rate = tune_config['learning_rate'] arg.weight_decay = tune_config['weight_decay'] arg.batch_size = tune_config['batch_size'] arg.latent_dim = tune_config['latent_dim'] arg.does_ray_tuning = True cfg = Config(arg) main(cfg, callbacks=[ TuneReportCallback(metrics=[ "val_lossR", ], on="validation_end") ])
def train_lesion(config, data_dir=None, num_epochs=10, num_gpus=1):
    """Tune trainable for lesion segmentation: report val IoU and checkpoint on it."""
    model = LesionModel(config)
    data_module = LesionDataModule(config)
    logger = TensorBoardLogger('tb_logs', name='my_model')
    metrics = {"iou": "ptl/val_iou"}
    checkpoint_callback = ModelCheckpoint(
        verbose=True,
        monitor='ptl/val_iou',
        mode='max',  ## for val_loss this should be min
        dirpath='check_point_path/',
        filename='lesion-{epoch:02d}-{val_iou:.2f}')
    trainer = pl.Trainer(max_epochs=num_epochs,
                         gpus=num_gpus,
                         progress_bar_refresh_rate=50,
                         logger=logger,
                         automatic_optimization=True,
                         callbacks=[
                             TuneReportCallback(metrics, on="validation_end"),
                             checkpoint_callback
                         ])
    trainer.fit(model, data_module)
def tune_mnist(data_dir, num_samples=10, num_epochs=10, num_hosts=1, num_slots=4, use_gpu=False): config = { "layer_1": tune.choice([32, 64, 128]), "layer_2": tune.choice([64, 128, 256]), "lr": tune.loguniform(1e-4, 1e-1), "batch_size": tune.choice([32, 64, 128]), } # Add Tune callback. metrics = {"loss": "ptl/val_loss", "acc": "ptl/val_accuracy"} callbacks = [TuneReportCallback(metrics, on="validation_end")] trainable = tune.with_parameters(train_mnist, data_dir=data_dir, num_epochs=num_epochs, num_hosts=num_hosts, num_slots=num_slots, use_gpu=use_gpu, callbacks=callbacks) analysis = tune.run( trainable, metric="loss", mode="min", config=config, num_samples=num_samples, resources_per_trial={ "cpu": 1, # Assume 1 cpu per slot. "extra_cpu": num_hosts * num_slots, # Assume 1 gpu per slot. "extra_gpu": num_hosts * num_slots * int(use_gpu) }, name="tune_mnist") print("Best hyperparameters found were: ", analysis.best_config)
def train_tune(config, dim1, dim2, dim3, dim4, train_set=None, val_set=None,
               test_set=None, num_epochs=300, num_cpus=24, num_gpus=1,
               model_name="model"):
    """Tune trainable: build a MIX model with the given dims and fit it with
    early stopping on avg_val_loss."""
    model = MIX(config=config,
                dim1=dim1,
                dim2=dim2,
                dim3=dim3,
                dim4=dim4,
                train_set=train_set,
                val_set=val_set,
                test_set=test_set,
                name=model_name)
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=num_gpus,
        logger=TensorBoardLogger(save_dir="logs/", name=model_name, version="0.0"),
        progress_bar_refresh_rate=20,
        callbacks=[
            TuneReportCallback({
                "loss": "avg_val_loss",
            }, on="validation_end"),
            EarlyStopping(monitor="avg_val_loss")
        ],
        precision=16,
    )
    trainer.fit(model)
def fn_tune(config, config_init, num_epochs=10, num_gpus=1):
    """Tune trainable for RaceModule: merge tuned keys into config_init and fit."""
    gc.collect()
    # NOTE(review): config_init is mutated in place and is shared across
    # trials if the same dict object is reused — confirm callers pass copies.
    config_init["batch_size"] = config["batch_size"]
    #config_init["learning_rate"] = config["learning_rate"]
    config_init["hidden_size"] = config["hidden_size"]
    config_init["bidirectional"] = config["bidirectional"]
    config_init["embed_dim"] = config["embed_dim"]
    config_init["num_layers"] = config["num_layers"]
    args = parser.parse_args(serialize_config(config_init))
    fx_dm = RaceDataModule(args, collate_fn)
    fx_dm.setup()
    fx_model = RaceModule(args, batch_fn)
    metrics = {"loss": "val_loss"}
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        limit_train_batches=0.25,
        precision=16,
        gpus=num_gpus,
        progress_bar_refresh_rate=100,
        callbacks=[TuneReportCallback(metrics, on="validation_end")])
    trainer.fit(fx_model, fx_dm)
def tune_model(config, ptl_model, dset, train_inds, n_workers, n_val=None,
               val_inds=None, tune_metrics=None, mode='tune', **trainer_kwargs):
    '''
    A generic function to hp-tuning and model training with ray and pytorch-lightning

    When val_inds is None, train_inds is shuffled and split: the first n_val
    indices become the validation set, the rest the training set.
    Returns the fitted trainer.
    '''
    model = ptl_model(config=config)
    if val_inds is None:
        # NOTE(review): when val_inds is None, n_val must also be set —
        # otherwise both slices below degenerate to the full index list.
        shuffle(train_inds)
    train_dl = DataLoader(
        torch.utils.data.Subset(dset, train_inds[n_val:] if val_inds is None else train_inds),
        batch_size=config['batch_size'],
        num_workers=n_workers,
        drop_last=True,
        shuffle=True
    )
    val_dl = DataLoader(
        torch.utils.data.Subset(dset, train_inds[:n_val] if val_inds is None else val_inds),
        num_workers=n_workers,
        batch_size=config['batch_size'],
        drop_last=True,
        shuffle=False
    )
    # BUG FIX: `callbacks = model.callbacks` followed by `callbacks += [...]`
    # mutated model.callbacks in place (list aliasing), stacking an extra
    # TuneReportCallback onto the model on every call. Copy first.
    callbacks = list(model.callbacks)
    if mode == 'tune':
        callbacks.append(TuneReportCallback(tune_metrics, on='validation_end'))
    trainer = PLTrainer(callbacks=callbacks, **trainer_kwargs)
    trainer.fit(model, train_dl, val_dl)
    return trainer
from ray.tune.integration.pytorch_lightning import TuneReportCallback from runners.STL_runner import STLRunner, create_checkpoint_callback from runners.MTL_runner import MTLRunner from argparse import ArgumentParser from functools import partial parser = ArgumentParser(description="A multitask learner") parser.add_argument("model_type", choices=["mtl", "stl"], help="") tune_config = { "hidden_layers": tune.choice([[1, 2], [32, 64]]), "activations": tune.choice([["relu", "relu", "relu"], ["sigmoid", "sigmoid", "sigmoid"]]) } callback = TuneReportCallback({"loss": "loss_validate"}, on="validation_end") def run(self, learner, max_epochs=10, callbacks=None): trainer = pl.Trainer( max_epochs=max_epochs, logger=learner.logger, checkpoint_callback=create_checkpoint_callback( learner.checkpoints_prefix), callbacks=callbacks, ) trainer.fit(learner.model, learner.data_module) trainer.test(learner.model, datamodule=learner.data_module) def main():
) # CREATE CHECKPOINTS DIR checkpoint_dir = f'checkpoints/{experiment_name}' os.makedirs(checkpoint_dir) # RUN TRAINER trainer = pl.Trainer( logger=logger, log_every_n_steps=1, max_epochs=10, val_check_interval=0.05, # for dev progress_bar_refresh_rate=1, callbacks=[ TuneReportCallback(metrics={ "loss": "val_loss", }, on="validation_end"), TuneReportCheckpointCallback( metrics={"loss": "val_loss"}, filename= f"{checkpoint_dir}/latest_checkpoint.ckpt", # TODO edit callback so that it saves history of checkpoints and make PR to ray[tune] on="validation_end"), SampleReconstructionCallback(loader=val_loader) ]) trainer.fit(model, train_dataloader=train_loader, val_dataloaders=val_loader) test_result = Trainer.test(model=model, test_dataloaders=test_loader, verbose=True)
# the following values are the results of fine-tuning "lr": tune.choice([1.57513e-05]), "batch_size": tune.choice([32]), "max_seq_length": tune.choice([48]), "hidden_dropout_prob": tune.choice([.1]), "hidden_act": tune.choice(["gelu"]), "architecture": { "tokenizer": BertTokenizer, "model": BertForMultipleChoice, "pretrained_model_name": "bert-base-uncased" } } ray_tune_callback = TuneReportCallback( { "loss": "val_epoch_loss", "acc": "val_epoch_acc" }, on="validation_end") logger = TensorBoardLogger('tb_logs/', name='csqa') # tr = train_tune(config, logger, epochs=1, gpus=1) analysis = tune.run( partial( train_tune, logger=logger, epochs=3, gpus=config["use_gpu"], ), config=rt_config, num_samples=1, resources_per_trial={ "cpu": 10, "gpu": config["use_gpu"],
from functools import partial from argparse import ArgumentParser import torch import pytorch_lightning as lit from ray.tune.integration.pytorch_lightning import TuneReportCallback from ray import tune from .lightningreapp import LightningReapp parser = ArgumentParser() callback_tuner = TuneReportCallback( { "loss": "val_loss", # "mean_accuracy": "val_accuracy" }, on="validation_end", ) default_tune_config = { "lr": tune.loguniform(1e-4, 1e-1), # loguniform samples by magnitude "hidden_layer_size": tune.quniform(10, 50, 1) } ### TUNING HYPERPARAMETERS def train_tune(config, **tuner_kwargs): model = LightningReapp(config) max_epochs = tuner_kwargs.get('max_epochs', 10)
def trainable(config, train_loader, val_loader, test_loader):
    """Ray Tune trainable: train a ConvAutoencoder, reporting and checkpointing on val_loss."""
    input_size = 28
    ae_arch = architecture.get_ae_architecture(input_size=input_size, latent_dim=128)
    model = ConvAutoencoder(
        **{
            **ae_arch,
            'optimizer_name': config['optimizer_name'],
            'lr': config['lr']
        })
    model.logdir = 'ConvAutoencoder'
    model.set_latent(input_size)
    # Encode the full architecture + hyperparameters into the experiment name.
    config_str = json.dumps({
        **config,
        'channels': ae_arch['encoder_channels'],
        'stride': ae_arch['encoder_stride'],
        'kernel_size': ae_arch['encoder_kernel_size'],
        'latent_dim': model.latent_size
    })
    # SET UP LOGGER
    section_name = 'ConvAutoencoder'
    save_dir = f'{os.path.expanduser("~")}/ai-core/Embedder/runs/{section_name}/'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    experiment_name = f'ConvAutoencoder-{config_str}-{time()}'
    model.experiment_name = experiment_name
    logger = pl.loggers.TensorBoardLogger(
        save_dir=save_dir,
        name=experiment_name,
        default_hp_metric=False,
    )
    # CREATE CHECKPOINTS DIR
    checkpoint_dir = f'checkpoints/{experiment_name}'
    os.makedirs(checkpoint_dir)
    # RUN TRAINER
    trainer = pl.Trainer(
        logger=logger,
        log_every_n_steps=1,
        max_epochs=10,
        val_check_interval=0.05,  # for dev
        progress_bar_refresh_rate=0,
        callbacks=[
            TuneReportCallback(metrics={
                "loss": "val_loss",
            }, on="validation_end"),
            TuneReportCheckpointCallback(
                metrics={"loss": "val_loss"},
                filename=f"{checkpoint_dir}/latest_checkpoint.ckpt",  # TODO edit callback so that it saves history of checkpoints and make PR to ray[tune]
                on="validation_end"),
            SampleReconstructionCallback(loader=val_loader)
        ])
    trainer.fit(model, train_dataloader=train_loader, val_dataloaders=val_loader)
    # BUG FIX: `Trainer.test(...)` was called on the *class*; test must run on
    # the fitted trainer instance.
    test_result = trainer.test(model=model, test_dataloaders=test_loader, verbose=True)
def get_trainer_from_cfg(cfg: DictConfig, lightning_module, stopper, profiler: str = None) -> pl.Trainer:
    """Gets a PyTorch Lightning Trainer from a configuration

    Supports:
        automatic batch sizing
        Automatic learning rate finding (experimental)
        Callback instantiation
        Logging, both to disk and with TensorBoard

    Parameters
    ----------
    cfg : DictConfig
        configuration
    lightning_module : pl.LightningModule
        Lightning model to train
    stopper : callable
        Method to stop training. Must be passed so that figuring out batch size does not "count" towards stopping
    profiler : str, optional
        https://pytorch-lightning.readthedocs.io/en/latest/advanced/profiler.html, by default None

    Returns
    -------
    pl.Trainer
        https://pytorch-lightning.readthedocs.io/en/latest/common/trainer.html
    """
    steps_per_epoch = cfg.train.steps_per_epoch
    for split in ['train', 'val', 'test']:
        steps_per_epoch[split] = steps_per_epoch[split] if steps_per_epoch[
            split] is not None else 1.0
    # reload_dataloaders_every_epoch = True: a bit slower, but enables validation dataloader to get the new, automatic
    # learning rate schedule.
    if cfg.compute.batch_size == 'auto' or cfg.train.lr == 'auto':
        # Throwaway trainer used only for batch-size / LR auto-tuning.
        trainer = pl.Trainer(gpus=[cfg.compute.gpu_id],
                             precision=16 if cfg.compute.fp16 else 32,
                             limit_train_batches=1.0,
                             limit_val_batches=1.0,
                             limit_test_batches=1.0,
                             num_sanity_val_steps=0)
        # callbacks=[ExampleImagesCallback()])
        # Stash module state that the tuning pass temporarily overrides.
        tmp_metrics = lightning_module.metrics
        tmp_workers = lightning_module.hparams.compute.num_workers
        # visualize_examples = lightning_module.visualize_examples
        if lightning_module.model_type != 'sequence':
            # there is a somewhat common error that VRAM will be maximized by the gpu-auto-tuner.
            # However, during training, we probabilistically sample colorspace transforms; in an "unlucky"
            # batch, perhaps all of the training samples are converted to HSV, hue and saturation changed, then changed
            # back. This is rare enough to not be encountered in "auto-tuning," so we'll get a train-time error. BAD!
            # so, we crank up the colorspace augmentation probability, then pick batch size, then change it back
            original_gpu_transforms = deepcopy(lightning_module.gpu_transforms)
            log.debug('orig: {}'.format(lightning_module.gpu_transforms))
            original_augs = cfg.augs
            new_augs = deepcopy(cfg.augs)
            new_augs.color_p = 1.0
            arch = lightning_module.hparams[lightning_module.model_type].arch
            mode = '2d'
            gpu_transforms = get_gpu_transforms(
                new_augs, '3d' if '3d' in arch.lower() else '2d')
            lightning_module.gpu_transforms = gpu_transforms
            log.debug('new: {}'.format(lightning_module.gpu_transforms))
        tuner = pl.tuner.tuning.Tuner(trainer)
        # hack for lightning to find the batch size
        cfg.batch_size = 2  # to start
        empty_metrics = EmptyMetrics()  # don't store metrics when batch size finding
        lightning_module.metrics = empty_metrics
        # don't visualize our model inputs when batch size finding
        # lightning_module.visualize_examples = False
        should_viz = cfg.train.viz_examples
        lightning_module.hparams.train.viz_examples = 0
        # dramatically reduces RAM usage by this process
        lightning_module.hparams.compute.num_workers = min(tmp_workers, 1)
        if cfg.compute.batch_size == 'auto':
            max_trials = int(math.log2(cfg.compute.max_batch_size)) - int(
                math.log2(cfg.compute.min_batch_size))
            log.info('max trials: {}'.format(max_trials))
            new_batch_size = trainer.tuner.scale_batch_size(
                lightning_module,
                mode='power',
                steps_per_trial=30,
                init_val=cfg.compute.min_batch_size,
                max_trials=max_trials)
            cfg.compute.batch_size = new_batch_size
            log.info('auto-tuned batch size: {}'.format(new_batch_size))
        if cfg.train.lr == 'auto':
            lr_finder = trainer.tuner.lr_find(lightning_module,
                                              early_stop_threshold=None,
                                              min_lr=1e-6,
                                              max_lr=10.0)
            # log.info(lr_finder.results)
            plt.style.use('seaborn')
            fig = lr_finder.plot(suggest=True, show=False)
            viz.save_figure(fig, 'auto_lr_finder', False, 0, overwrite=False)
            plt.close(fig)
            new_lr = lr_finder.suggestion()
            log.info('auto-tuned learning rate: {}'.format(new_lr))
            cfg.train.lr = new_lr
            lightning_module.lr = new_lr
            lightning_module.hparams.lr = new_lr
        del trainer, tuner
        # restore lightning module to original state
        lightning_module.hparams.train.viz_examples = should_viz
        lightning_module.metrics = tmp_metrics
        lightning_module.hparams.compute.num_workers = tmp_workers
        if lightning_module.model_type != 'sequence':
            lightning_module.gpu_transforms = original_gpu_transforms
            log.debug('reverted: {}'.format(lightning_module.gpu_transforms))
    # Checkpoint on the module's key metric; minimize losses, maximize others.
    key_metric = lightning_module.metrics.key_metric
    mode = 'min' if 'loss' in key_metric else 'max'
    monitor = f'val/{key_metric}'
    dirpath = os.path.join(cfg.run.dir, 'lightning_checkpoints')
    callback_list = [
        FPSCallback(),
        MetricsCallback(),
        ExampleImagesCallback(),
        CheckpointCallback(),
        StopperCallback(stopper),
        pl.callbacks.ModelCheckpoint(dirpath=dirpath,
                                     save_top_k=1,
                                     save_last=True,
                                     mode=mode,
                                     monitor=monitor,
                                     save_weights_only=True)
    ]
    if 'tune' in cfg and cfg.tune.use and ray:
        callback_list.append(
            TuneReportCallback(OmegaConf.to_container(cfg.tune.metrics),
                               on='validation_end'))
        # https://docs.ray.io/en/master/tune/tutorials/tune-pytorch-lightning.html
        tensorboard_logger = pl.loggers.tensorboard.TensorBoardLogger(
            save_dir=get_trial_dir(), name="", version=".", default_hp_metric=False)
        refresh_rate = 0
    else:
        tensorboard_logger = pl.loggers.tensorboard.TensorBoardLogger(
            os.getcwd())
        refresh_rate = 1
    # tuning messes with the callbacks
    trainer = pl.Trainer(gpus=[cfg.compute.gpu_id],
                         precision=16 if cfg.compute.fp16 else 32,
                         limit_train_batches=steps_per_epoch['train'],
                         limit_val_batches=steps_per_epoch['val'],
                         limit_test_batches=steps_per_epoch['test'],
                         logger=tensorboard_logger,
                         max_epochs=cfg.train.num_epochs,
                         num_sanity_val_steps=0,
                         callbacks=callback_list,
                         reload_dataloaders_every_epoch=True,
                         progress_bar_refresh_rate=refresh_rate,
                         profiler=profiler)
    torch.cuda.empty_cache()
    # gc.collect()
    # import signal
    # signal.signal(signal.SIGTERM, signal.SIG_DFL)
    # log.info('trainer is_slurm_managing_tasks: {}'.format(trainer.is_slurm_managing_tasks))
    return trainer