def distribute_run(self):
    strategy = tf.distribute.MirroredStrategy()
    # Scale the per-replica batch sizes up to global batch sizes.
    train_global_batch = self.args.train_batch * strategy.num_replicas_in_sync
    val_global_batch = self.args.val_batch * strategy.num_replicas_in_sync
    train_data, train_batch_num, val_data, val_batch_num = get_datasets(
        name=self.args.dataset,
        train_batch=train_global_batch,
        val_batch=val_global_batch)
    # Model and optimizer must be created under the strategy scope.
    with strategy.scope():
        model = get_net(arch=self.args.arch,
                        num_layers=self.args.num_layers,
                        num_experts=self.args.num_experts,
                        num_classes=self.args.num_classes)
        model.build(input_shape=(None, 32, 32, 3))
        model.summary()
        optimizer = tf.keras.optimizers.SGD(learning_rate=self.args.lr,
                                            momentum=0.9,
                                            decay=0.0001,
                                            nesterov=True)
    dis_trainer = DisTrainer(strategy=strategy,
                             model=model,
                             optimizer=optimizer,
                             epochs=self.args.epochs,
                             val_data=val_data,
                             train_batch=self.args.train_batch,
                             val_batch=self.args.val_batch,
                             train_data=train_data,
                             log_dir=self.log_dir,
                             model_save_path=self.model_save_path,
                             train_batch_num=train_batch_num,
                             val_batch_num=val_batch_num)
    dis_trainer(resume=self.args.resume, val=self.args.val)
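# A minimal sketch of the distributed train step that DisTrainer presumably
# runs under the hood; the function and variable names here are illustrative
# assumptions, not the original implementation. The key MirroredStrategy
# requirement it shows is averaging the loss over the *global* batch size.
@tf.function
def train_step(dist_inputs):
    def step_fn(inputs):
        images, labels = inputs
        with tf.GradientTape() as tape:
            logits = model(images, training=True)
            per_example_loss = tf.keras.losses.sparse_categorical_crossentropy(
                labels, logits, from_logits=True)
            # Average over the global batch, not the per-replica batch.
            loss = tf.nn.compute_average_loss(
                per_example_loss, global_batch_size=train_global_batch)
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        return loss
    per_replica_losses = strategy.run(step_fn, args=(dist_inputs,))
    return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)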
def train_batch(epoch, net, opt, crit, batch_size):
    train_set = get_datasets('/home/fernand/math/data', 'train')
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=6,
                                               collate_fn=collate_data,
                                               drop_last=True)
    pbar = tqdm(iter(train_loader))
    moving_loss = 0.0
    for questions, questions_len, answers, answers_len, answer_mappings in pbar:
        questions, questions_len = questions.to(DEVICE), questions_len.to(DEVICE)
        answers, answers_len = answers.to(DEVICE), answers_len.to(DEVICE)
        answer_mappings = answer_mappings.to(DEVICE)
        loss = net.train_batch(questions, questions_len, answers, answers_len,
                               answer_mappings, opt, crit)
        # Exponential moving average of the loss for a smoother progress bar.
        if moving_loss == 0.0:
            moving_loss = loss
        else:
            moving_loss = 0.95 * moving_loss + 0.05 * loss
        # NB: this runs after net.train_batch has already stepped the
        # optimizer, so it only clips whatever gradients are left over.
        nn.utils.clip_grad_value_(net.parameters(), 0.1)
        pbar.set_description('Epoch: {}; Loss: {:.5f}'.format(epoch + 1, moving_loss))
    for d in train_set.datasets:
        d.close()
def run(self):
    train_data, train_batch_num, val_data, val_batch_num = get_datasets(
        name=self.args.dataset,
        train_batch=self.args.train_batch,
        val_batch=self.args.val_batch)
    model = get_net(arch=self.args.arch,
                    num_layers=self.args.num_layers,
                    num_experts=self.args.num_experts,
                    num_classes=self.args.num_classes)
    model.build(input_shape=(None, 32, 32, 3))
    model.summary()
    optimizer = tf.keras.optimizers.SGD(learning_rate=self.args.lr,
                                        momentum=0.9,
                                        decay=0.0001,
                                        nesterov=True)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      epochs=self.args.epochs,
                      val_data=val_data,
                      train_batch=self.args.train_batch,
                      val_batch=self.args.val_batch,
                      train_data=train_data,
                      log_dir=self.log_dir,
                      model_save_path=self.model_save_path,
                      train_batch_num=train_batch_num,
                      val_batch_num=val_batch_num)
    trainer(resume=self.args.resume, val=self.args.val)
def main(arch: str, image_folder: str, from_scratch: bool = False,
         batch_size: Optional[int] = None, from_model: Optional[str] = None,
         grad_accu: int = 1, num_gpus: int = 1, epochs: int = 100,
         lr: float = 4e-4):
    if arch.startswith("BiT"):
        base_model = BIT_MODELS[arch](head_size=-1)
        if not from_scratch and not from_model:
            print("Loading pretrained model...")
            base_model.load_from(np.load(f"cache/pretrained/{arch}.npz"))
        net_final_size = base_model.width_factor * 2048
    else:
        raise ValueError(f"arch '{arch}' not supported")
    train_ds, valid_ds = get_datasets(image_folder, val_ratio=0.05)
    model = SelfSupervisedLearner(base_model, train_ds, valid_ds, epochs, lr,
                                  num_gpus=num_gpus,
                                  batch_size=batch_size if batch_size else 4,
                                  image_size=IMAGE_SIZE,
                                  projection_size=256,
                                  projection_hidden_size=4096,
                                  net_final_size=net_final_size,
                                  moving_average_decay=0.99)
    trainer = pl.Trainer(
        accelerator='ddp' if num_gpus > 1 else None,
        amp_backend="apex",
        amp_level='O2',
        precision=16,
        gpus=num_gpus,
        val_check_interval=0.5,
        # gradient_clip_val=10,
        max_epochs=epochs,
    )
    trainer.fit(model)
def main():
    logging.basicConfig(level=logging.INFO)
    in_features = 28 * 28
    hidden_dim = 1024
    out_features = 10
    batch_size = 128
    num_epochs = 10
    mlp = MLP.new(in_features, hidden_dim, out_features)
    # mlp = Linear.new(in_features, hidden_dim)
    rng = mlp.initialize()
    # Differentiate the loss w.r.t. the model parameters and JIT-compile.
    dloss_fn = jax.value_and_grad(loss_fn, has_aux=True)
    dloss_fn = jax.jit(dloss_fn)
    train_ds, val_ds = get_datasets()
    train_dl = NumpyLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_dl = NumpyLoader(val_ds, batch_size=batch_size)
    for epoch in range(num_epochs):
        for x, y in train_dl:
            x, y = np.array(x), np.array(y)
            (loss, (mlp, acc)), grads = dloss_fn(mlp, x, y)
            print('train loss:', loss)
            print('train acc:', acc)
            # Apply one SGD step to every leaf of the parameter pytree
            # (jax.tree_map in newer JAX versions).
            mlp = jax.tree_multimap(sgd, mlp, grads)
        for x, y in val_dl:
            x, y = np.array(x), np.array(y)
            loss, (mlp, acc) = loss_fn(mlp, x, y)
            print('val loss:', loss)
            print('val acc:', acc)
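# A minimal sketch of the per-leaf update that jax.tree_multimap(sgd, mlp, grads)
# assumes: sgd takes one parameter leaf and its gradient and returns the updated
# leaf. The learning rate value is an illustrative assumption.
def sgd(param, grad, lr=1e-3):
    return param - lr * grad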
def train_one(epoch, net, opt, crit):
    train_set = get_datasets('/home/fernand/math/data', 'train')
    train_loader = torch.utils.data.DataLoader(train_set, shuffle=True,
                                               num_workers=6)
    pbar = tqdm(iter(train_loader))
    moving_loss = 0.0
    for question, answer in pbar:
        question, answer = question.to(DEVICE), answer.to(DEVICE)
        loss = net.train(question, answer, opt, crit)
        # Very slow exponential moving average of the loss.
        if moving_loss == 0.0:
            moving_loss = loss
        else:
            moving_loss = 0.9999 * moving_loss + 0.0001 * loss
        nn.utils.clip_grad_value_(net.parameters(), 0.1)
        pbar.set_description('Epoch: {}; Loss: {:.5f}'.format(epoch + 1, moving_loss))
    for d in train_set.datasets:
        d.close()
def main():
    datasets = get_datasets()
    min_points = 5
    eps = [20, 17, 11, 4]  # one eps per dataset, read off the k-dist plots
    for i, dataset in enumerate(datasets):
        # Plot the k-dist curve to determine the eps parameter.
        kdist_data = get_kdist_data(dataset, min_points)
        plot_data(kdist_data)
        # Cluster with DBSCAN.
        dbscan = DBSCAN(min_points, eps[i])
        labels = dbscan.fit(dataset)
        print_labels(labels)
        plot_labeled_data(dataset, labels)
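# A minimal sketch of what get_kdist_data presumably computes: for every point,
# the distance to its k-th nearest neighbour, sorted in descending order. The
# "knee" of this curve is the usual heuristic for picking eps. Assumes the
# dataset is an (n, d) NumPy array; the brute-force distance matrix is an
# illustrative choice.
import numpy as np

def get_kdist_data(dataset, k):
    dists = np.linalg.norm(dataset[:, None, :] - dataset[None, :, :], axis=-1)
    kth = np.sort(dists, axis=1)[:, k]  # column 0 is each point's self-distance
    return np.sort(kth)[::-1]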
def main():
    with tf.Graph().as_default():
        test_sets = dataset.get_datasets(main_path, EPIWidth, disp_precision, 'test')
        images_placeholder_v = tf.placeholder(tf.float32, shape=(None, 9, EPIWidth, 1))
        images_placeholder_u = tf.placeholder(tf.float32, shape=(None, 9, EPIWidth, 1))
        prop_placeholder = tf.placeholder('float')
        phase_train = tf.placeholder(tf.bool, name='phase_train')
        logits = network.inference_ds(images_placeholder_u, images_placeholder_v,
                                      prop_placeholder, phase_train, EPIWidth,
                                      disp_precision)
        eval_op = network.evaluation(logits)
        saver = tf.train.Saver(tf.global_variables())
        gpu_option = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_option))
        sess.run(tf.global_variables_initializer())
        ckpt = tf.train.get_checkpoint_state(checkpoint_path)
        if ckpt:
            # saver.restore(sess, checkpoint_path + '/model.ckpt')  # weights trained on another platform
            saver.restore(sess, ckpt.model_checkpoint_path)  # locally trained weights
            print("restore from checkpoint!")
        else:
            print("no checkpoint found!")
        print('Training Data Eval:')
        do_eval_true(sess, eval_op, logits, images_placeholder_u,
                     images_placeholder_v, prop_placeholder, phase_train,
                     test_sets)
def __init__(self, args):
    # Training configurations
    self.method = args.method
    self.dataset = args.dataset
    self.dim = args.dim
    self.lr_init = args.lr_init
    self.gamma_m = args.gamma_m
    self.gamma_s = args.gamma_s
    self.batch_size = args.batch_size
    self.val_batch_size = self.batch_size // 2
    self.iteration = args.iteration
    self.evaluation = args.evaluation
    self.show_iter = 1000
    self.update_epoch = args.update_epoch
    self.balanced = args.balanced
    self.instances = args.instances
    self.inter_test = args.intertest
    self.cm = args.cm
    self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    self.n_class = args.batch_size // args.instances
    self.classes = args.classes
    self.pretrained = args.pretrained
    self.model_save_interval = args.model_save_interval
    self.file_name = '{}_{}_{}'.format(self.method, self.dataset, self.iteration)
    print('========================================')
    print(json.dumps(vars(args), indent=2))
    print(self.file_name)

    # Paths
    self.root_dir = os.path.join('/', 'data')
    self.data_dir = os.path.join(self.root_dir, self.dataset)
    self.model_dir = self._get_path('./trained_model')
    self.plot_dir = self._get_path('./plot_model')
    self.code_dir = self._get_path(os.path.join('codes', self.dataset))
    self.fig_dir = self._get_path(os.path.join('fig', self.dataset, self.file_name))

    # Preparing data
    self.transforms = get_transform()
    self.datasets = get_datasets(dataset=self.dataset,
                                 data_dir=self.data_dir,
                                 transforms=self.transforms)
    self.data_loaders = get_data_loaders(
        datasets=self.datasets,
        batch_size=self.batch_size,
        val_batch_size=self.val_batch_size,
        n_instance=self.instances,
        balanced=self.balanced,
        # cm=self.cm_sampler if self.cm else None
    )
    self.dataset_sizes = {x: len(self.datasets[x]) for x in ['train', 'test']}

    # Class-pair priors: means of 1.5 off-diagonal and 0.5 on the diagonal.
    self.mean = (torch.zeros((self.classes, self.classes)).add(1.5)
                 - 1.0 * torch.eye(self.classes)).to(self.device)
    self.std = torch.zeros((self.classes, self.classes)).add(0.15).to(self.device)
    self.last_delta_mean = torch.zeros((self.classes, self.classes)).to(self.device)
    self.last_delta_std = torch.zeros((self.classes, self.classes)).to(self.device)

    self.ndmodel = nd.NDfdml(n_class=self.n_class,
                             batch_size=self.batch_size,
                             instances=self.instances,
                             pretrained=self.pretrained).to(self.device)
    optimizer_c = optim.SGD(
        [
            {'params': self.ndmodel.googlelayer.parameters()},
            {'params': self.ndmodel.embedding_layer.parameters(),
             'lr': self.lr_init * 10, 'momentum': 0.9}
        ],
        lr=self.lr_init,
        momentum=0.9)
    self.scheduler = lr_scheduler.StepLR(optimizer_c, step_size=4000, gamma=0.9)
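# A minimal sketch of the _get_path helper assumed above (it also appears in
# the later trainer __init__): presumably it creates the directory if needed
# and returns the path unchanged. This implementation is an assumption.
def _get_path(self, path):
    os.makedirs(path, exist_ok=True)
    return path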
def main(args):
    checks()
    macn = BatchMACN(
        image_shape=[FLAGS.im_h, FLAGS.im_w, FLAGS.ch_i],
        vin_config=VINConfig(k=FLAGS.k, ch_h=FLAGS.ch_h, ch_q=FLAGS.ch_q),
        access_config={
            "memory_size": FLAGS.memory_size,
            "word_size": FLAGS.word_size,
            "num_reads": FLAGS.num_read_heads,
            "num_writes": FLAGS.num_write_heads
        },
        controller_config={"hidden_size": FLAGS.hidden_size},
        batch_size=FLAGS.batch_size,
        seq_length=FLAGS.seq_length)

    # y = [batch, labels]; labels are actions {0, 1, 2, 3}
    y = tf.placeholder(tf.int64, shape=[None, None], name='y')

    # Training
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y, logits=macn.logits, name='cross_entropy')
    loss = tf.reduce_sum(cross_entropy, name='cross_entropy_mean')
    train_step = tf.train.RMSPropOptimizer(
        FLAGS.learning_rate, epsilon=1e-6, centered=True).minimize(loss)

    # Reporting
    y_ = tf.argmax(macn.prob_actions, axis=-1)  # predicted action
    nb_errors = tf.reduce_sum(tf.to_float(tf.not_equal(y_, y)))  # number of wrongly selected actions

    def train_on_episode_batch(batch_images, batch_labels):
        _, _loss, _nb_err = sess.run([train_step, loss, nb_errors],
                                     feed_dict={macn.X: batch_images, y: batch_labels})
        return _loss, _nb_err

    def test_on_episode_batch(batch_images, batch_labels):
        return sess.run([loss, nb_errors],
                        feed_dict={macn.X: batch_images, y: batch_labels})

    trainset, testset = get_datasets(FLAGS.dataset, test_percent=0.1)

    # Start training
    saver = tf.train.Saver()
    with tf.Session() as sess:
        if loadfile_exists(FLAGS.load):
            saver.restore(sess, FLAGS.load)
            print("Weights reloaded")
        else:
            sess.run(tf.global_variables_initializer())
        print("Start training...")
        for epoch in range(1, FLAGS.epochs + 1):
            start_time = time.time()
            mean_loss, mean_accuracy = compute_on_dataset(sess, trainset, train_on_episode_batch)
            print('Epoch: {:3d} ({:.1f} s):'.format(epoch, time.time() - start_time))
            print('\t Train Loss: {:.5f} \t Train accuracy: {:.2f}%'.format(
                mean_loss, 100 * mean_accuracy))
            saver.save(sess, FLAGS.save)
        print('Training finished.')
        print('Testing...')
        mean_loss, mean_accuracy = compute_on_dataset(sess, testset, test_on_episode_batch)
        print('Test Accuracy: {:.2f}%'.format(100 * mean_accuracy))
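# A hedged sketch of the compute_on_dataset helper used above: it presumably
# iterates over episode batches, applies the given step function, and averages
# loss and accuracy over all labels. The episode_batches API is an assumption;
# sess is unused here because the step functions close over it.
def compute_on_dataset(sess, dataset, step_fn):
    total_loss, total_errors, total_labels = 0.0, 0.0, 0
    for batch_images, batch_labels in dataset.episode_batches(FLAGS.batch_size):
        loss_val, nb_err = step_fn(batch_images, batch_labels)
        total_loss += loss_val
        total_errors += nb_err
        total_labels += batch_labels.size
    mean_loss = total_loss / total_labels
    mean_accuracy = 1.0 - total_errors / total_labels
    return mean_loss, mean_accuracy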
def main(arch: str, image_folder: str, from_scratch: bool = False,
         batch_size: Optional[int] = None, from_model: Optional[str] = None,
         grad_accu: int = 1, num_gpus: int = 1, epochs: int = 100,
         lr: float = 4e-4):
    pl.seed_everything(int(os.environ.get("SEED", 738)))
    if arch.startswith("BiT"):
        base_model = BIT_MODELS[arch](head_size=-1)
        if not from_scratch and not from_model:
            print("Loading pretrained model...")
            base_model.load_from(np.load(f"cache/pretrained/{arch}.npz"))
        net_final_size = base_model.width_factor * 2048
    else:
        raise ValueError(f"arch '{arch}' not supported")
    train_ds, valid_ds = get_datasets(image_folder, val_ratio=0.05)
    model = SelfSupervisedLearner(
        base_model, train_ds, valid_ds, epochs, lr,
        num_gpus=num_gpus,
        batch_size=batch_size if batch_size else 4,
        image_size=IMAGE_SIZE,
        projection_size=256,
        projection_hidden_size=4096,
        net_final_size=net_final_size,
        moving_average_decay=0.995,
        use_momentum=True)
    if from_model:
        print("loading weights...")
        # Load pretrained weights into the online and target networks.
        weights = torch.load(from_model)
        model.learner.online_encoder.projector.load_state_dict(weights["online_encoder_proj"])
        model.learner.online_encoder.net.load_state_dict(weights["online_encoder_net"])
        model.learner.online_predictor.load_state_dict(weights["online_predictor"])
        model.learner.target_encoder.net.load_state_dict(weights["target_encoder_net"])
        model.learner.target_encoder.projector.load_state_dict(weights["target_encoder_proj"])
        del weights
    trainer = pl.Trainer(
        accelerator='ddp' if num_gpus > 1 else None,
        amp_backend="apex",
        amp_level='O2',
        precision=16,
        gpus=num_gpus,
        val_check_interval=0.5,
        gradient_clip_val=10,
        max_epochs=epochs,
        callbacks=[
            LearningRateMonitor(logging_interval='step'),
            ModelCheckpoint(monitor='val_loss',
                            filename='byol-{step:06d}-{val_loss:.4f}',
                            save_top_k=2)
        ],
        accumulate_grad_batches=grad_accu,
        auto_scale_batch_size='power' if batch_size is None else None)
    if batch_size is None:
        trainer.tune(model)
    trainer.fit(model)
    # Only rank 0 saves the checkpoint under DDP.
    if num_gpus == 1 or torch.distributed.get_rank() == 0:
        torch.save({
            "online_encoder_proj": model.learner.online_encoder.projector.state_dict(),
            "online_encoder_net": model.learner.online_encoder.net.state_dict(),
            "online_predictor": model.learner.online_predictor.state_dict(),
            "target_encoder_net": model.learner.target_encoder.net.state_dict(),
            "target_encoder_proj": model.learner.target_encoder.projector.state_dict(),
            "config": {"arch": arch}
        }, f"cache/byol_{arch}.pth")
        print("Model saved")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=42, help='Random seed')
    parser.add_argument('-dd', '--data-dir', type=str, default='data', help='Data directory')
    parser.add_argument('-l', '--loss', type=str, default='label_smooth_cross_entropy')
    parser.add_argument('-t1', '--temper1', type=float, default=0.2)
    parser.add_argument('-t2', '--temper2', type=float, default=4.0)
    parser.add_argument('-optim', '--optimizer', type=str, default='adam')
    parser.add_argument('-prep', '--prep_function', type=str, default='none')
    parser.add_argument('--train_on_different_datasets', action='store_true')
    parser.add_argument('--use-current', action='store_true')
    parser.add_argument('--use-extra', action='store_true')
    parser.add_argument('--use-unlabeled', action='store_true')
    parser.add_argument('--fast', action='store_true')
    parser.add_argument('--mixup', action='store_true')
    parser.add_argument('--balance', action='store_true')
    parser.add_argument('--balance-datasets', action='store_true')
    parser.add_argument('--show', action='store_true')
    parser.add_argument('-v', '--verbose', action='store_true')
    parser.add_argument('-m', '--model', type=str, default='efficientnet-b4')
    parser.add_argument('-b', '--batch-size', type=int, default=8,
                        help='Batch size during training, e.g. -b 64')
    parser.add_argument('-e', '--epochs', type=int, default=100, help='Epochs to run')
    parser.add_argument('-s', '--sizes', default=380, type=int,
                        help='Image size for training & inference')
    parser.add_argument('-f', '--fold', type=int, default=None)
    parser.add_argument('-t', '--transfer', default=None, type=str)
    parser.add_argument('-lr', '--learning_rate', type=float, default=1e-4,
                        help='Initial learning rate')
    parser.add_argument('-a', '--augmentations', default='medium', type=str)
    parser.add_argument('-accum', '--accum-step', type=int, default=1)
    parser.add_argument('-metric', '--metric', type=str, default='accuracy01')
    args = parser.parse_args()

    diff_dataset_train = args.train_on_different_datasets
    data_dir = args.data_dir
    epochs = args.epochs
    batch_size = args.batch_size
    seed = args.seed
    loss_name = args.loss
    optim_name = args.optimizer
    prep_function = args.prep_function
    model_name = args.model
    size = args.sizes
    image_size = (size, size)
    print(image_size)
    fast = args.fast
    fold = args.fold
    mixup = args.mixup
    balance = args.balance
    balance_datasets = args.balance_datasets
    show_batches = args.show
    verbose = args.verbose
    use_current = args.use_current
    use_extra = args.use_extra
    use_unlabeled = args.use_unlabeled
    learning_rate = args.learning_rate
    augmentations = args.augmentations
    transfer = args.transfer
    accum_step = args.accum_step
    main_metric = args.metric  # e.g. 'cosine_loss' or 'accuracy01'
    print(data_dir)
    num_classes = 5
    assert use_current or use_extra
    print(fold)

    current_time = datetime.now().strftime('%b%d_%H_%M')
    random_name = get_random_name()

    torch.cuda.empty_cache()
    checkpoint_prefix = f'{model_name}_{size}_{augmentations}'
    if transfer is not None:
        checkpoint_prefix += '_pretrain_from_' + str(transfer)
    else:
        if use_current:
            checkpoint_prefix += '_current'
        if use_extra:
            checkpoint_prefix += '_extra'
        if use_unlabeled:
            checkpoint_prefix += '_unlabeled'
        if fold is not None:
            checkpoint_prefix += f'_fold{fold}'
    directory_prefix = f'{current_time}_{checkpoint_prefix}'
    log_dir = os.path.join('runs', directory_prefix)
    os.makedirs(log_dir, exist_ok=False)
    set_manual_seed(seed)

    model = get_model(model_name)
    if transfer is not None:
        print("Transferring weights from model checkpoint")
        model.load_state_dict(torch.load(transfer)['model_state_dict'])
    model = model.cuda()

    if diff_dataset_train:
        train_on = ['current_train', 'extra_train']
        valid_on = ['unlabeled']
        train_ds, valid_ds, train_sizes = get_datasets_universal(
            train_on=train_on,
            valid_on=valid_on,
            image_size=image_size,
            augmentation=augmentations,
            target_dtype=int,
            prep_function=prep_function)
    else:
        train_ds, valid_ds, train_sizes = get_datasets(
            data_dir=data_dir,
            use_current=use_current,
            use_extra=use_extra,
            image_size=image_size,
            prep_function=prep_function,
            augmentation=augmentations,
            target_dtype=int,
            fold=fold,
            folds=5)

    train_loader, valid_loader = get_dataloaders(train_ds, valid_ds,
                                                 batch_size=batch_size,
                                                 train_sizes=train_sizes,
                                                 num_workers=6,
                                                 balance=balance,
                                                 balance_datasets=balance_datasets,
                                                 balance_unlabeled=False)
    loaders = collections.OrderedDict()
    loaders["train"] = train_loader
    loaders["valid"] = valid_loader

    runner = SupervisedRunner(input_key='image')
    criterions = get_loss(loss_name)
    optimizer = get_optim(optim_name, model, learning_rate)
    # One cosine annealing cycle spans the number of optimizer steps per epoch.
    Q = math.floor(len(train_ds) / batch_size)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=Q)

    if main_metric != 'accuracy01':
        callbacks = [
            AccuracyCallback(num_classes=num_classes),
            CosineLossCallback(),
            OptimizerCallback(accumulation_steps=accum_step),
            CheckpointCallback(save_n_best=epochs)
        ]
    else:
        callbacks = [
            AccuracyCallback(num_classes=num_classes),
            OptimizerCallback(accumulation_steps=accum_step),
            CheckpointCallback(save_n_best=epochs)
        ]

    runner.train(
        fp16=True,
        model=model,
        criterion=criterions,
        optimizer=optimizer,
        scheduler=scheduler,
        callbacks=callbacks,
        loaders=loaders,
        logdir=log_dir,
        num_epochs=epochs,
        verbose=verbose,
        main_metric=main_metric,
        minimize_metric=False,
    )
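# A minimal sketch of the get_optim dispatcher assumed above: a name-to-optimizer
# mapping over the model's parameters. The set of supported names is an assumption.
def get_optim(name, model, lr):
    if name == 'adam':
        return torch.optim.Adam(model.parameters(), lr=lr)
    if name == 'sgd':
        return torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    raise ValueError(f"optimizer '{name}' not supported")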
from dataset import get_datasets

loss_func = get_loss_func(args)
metric_func = get_metric_func(args)
device = torch.device("cuda" if args.gpu and torch.cuda.is_available() else "cpu")
kwargs = {'num_workers': 1, 'pin_memory': True} if args.gpu else {}
print('Device:', device)
print('Setting:', args.setting)
if args.spdz:
    print('Using SPDZ for FedAvg')
print('Local epochs:', args.local_epochs)

# Each party's datasets, plus the combined training set across parties.
trn_party_datasets, val_party_datasets, tst_party_datasets = get_datasets(args)
assert len(trn_party_datasets) == len(val_party_datasets) and \
    len(val_party_datasets) == len(tst_party_datasets)
trn_combined_dataset = ConcatDataset(trn_party_datasets)
num_parties = len(trn_party_datasets)
assert num_parties > 1

if args.dp:
    # With DP, sample with replacement (per party and for the combined set).
    party_samplers = [RandomSampler(trnset, replacement=True)
                      for trnset in trn_party_datasets]
    combined_sampler = RandomSampler(trn_combined_dataset, replacement=True)
    trn_party_loaders = [DataLoader(trnset, sampler=sampler, batch_size=args.batch_size,
                                    shuffle=False, **kwargs)
                         for trnset, sampler in zip(trn_party_datasets, party_samplers)]
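    # A hedged sketch of how this fragment presumably continues; the combined
    # loader and the non-DP branch below are assumptions, not original code.
    trn_combined_loader = DataLoader(trn_combined_dataset, sampler=combined_sampler,
                                     batch_size=args.batch_size, shuffle=False, **kwargs)
else:
    trn_party_loaders = [DataLoader(trnset, batch_size=args.batch_size, shuffle=True, **kwargs)
                         for trnset in trn_party_datasets]
    trn_combined_loader = DataLoader(trn_combined_dataset, batch_size=args.batch_size,
                                     shuffle=True, **kwargs)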
def hp_search(seed, data, model_type, mode, device, batch_size, embedding_dim,
              hidden_dim, num_layers, bidirectional, dropout, batch_first,
              epochs, lr, clip_grad, max_norm, early_stopping_patience,
              train_frac, val_frac, test_frac, subset_size, log_interval,
              no_tb, w_loss, w_sampling):
    # Set seed for reproducibility, on CPU or GPU based on availability.
    torch.manual_seed(seed) if device == 'cpu' else torch.cuda.manual_seed(seed)

    # Set starting time of the full training pipeline.
    start_time = datetime.now()

    # Set device.
    device = torch.device(device)
    print(f"Device: {device}")

    if data == 'bnc':
        data_path = 'data/bnc/bnc_subset_19_29_vs_50_plus_nfiles_0.csv'
    elif data == 'bnc_rb':
        data_path = 'data/bnc/bnc_subset_19_29_vs_50_plus_nfiles_0_rand_balanced.csv'
    else:
        data_path = 'data/blogs_kaggle/blogtext.csv'

    print("Starting data preprocessing ... ")
    data_prep_start = datetime.now()

    # Load data and create dataset instances.
    train_dataset, val_dataset, test_dataset = get_datasets(subset_size=subset_size,
                                                            file_path=data_path,
                                                            train_frac=train_frac,
                                                            val_frac=val_frac,
                                                            test_frac=test_frac,
                                                            seed=seed,
                                                            data=data)
    print('-' * 91)
    print('BASELINES // VALUE COUNTS')
    print('Train')
    print(train_dataset.df['age_cat'].value_counts(normalize=True))
    print('Validation')
    print(val_dataset.df['age_cat'].value_counts(normalize=True))
    print('-' * 91)

    print('-' * 91)
    print(f'Data preprocessing finished. Data prep took {datetime.now() - data_prep_start}.')
    print('######### DATA STATS ###############')
    print(f'Number of classes: {train_dataset.num_classes}')
    print(f'Vocabulary size: {train_dataset.vocab_size}')
    print(f'Training set size: {len(train_dataset)}')
    print(f'Validation set size: {len(val_dataset)}')
    print(f'Test set size: {len(test_dataset)}')
    print('-' * 91)

    # Hyperparameter grid.
    lrs = [1e-5, 1e-4, 1e-3]
    embedding_dims = [64, 256, 512]
    hidden_dims = [128, 512, 1024]
    nums_layers = [1, 2]
    bidirectionals = [False, True]

    # Holders for the best performance metrics and corresponding hyperparameters.
    best_metrics = {'loss': float("inf"), 'acc': float('-inf')}
    best_hps = {'lr': None, 'embedding_dim': None, 'hidden_dim': None,
                'num_layers': None, 'bidirectional': None}
    best_model = None  # TODO: what's the appropriate type for this?
    # TODO: add tqdm's and print statements to these loops for progress monitoring
    best_file_name = None
    best_epoch = None

    # For keeping track of metrics across all configs.
    keys = ['lr', 'emb_dim', 'hid_dim', 'n_layers', 'bd', 'val_acc', 'val_loss']
    df = pd.DataFrame(columns=keys)
    best_model_updates = -1

    for lr_ in lrs:
        for emb_dim in embedding_dims:
            for hid_dim in hidden_dims:
                # Skip configs whose hidden size is not larger than the embedding dim.
                if not hid_dim > emb_dim:
                    continue
                for n_layers in nums_layers:
                    for bd in bidirectionals:
                        print('-' * 91)
                        print(f"| Current config: lr: {lr_} | emb: {emb_dim} | hid_dim: {hid_dim} "
                              f"| n_layers: {n_layers} | bd: {bd} | ")
                        print('-' * 91)

                        # Create a detailed experiment tag for the tensorboard summary writer.
                        cur_datetime = datetime.now().strftime('%d_%b_%Y_%H_%M_%S')
                        file_name = f'lstm_emb_{emb_dim}_hid_{hid_dim}_l_{n_layers}_' \
                                    f'bd_{bd}_drop_{dropout}_bs_{batch_size}_epochs_{epochs}_' \
                                    f'lr_{lr_}_subset_{subset_size}_train_{train_frac}_val_{val_frac}_' \
                                    f'test_{test_frac}_clip_{clip_grad}_maxnorm_{max_norm}' \
                                    f'es_{early_stopping_patience}_seed_{seed}_device_{device}_dt_{cur_datetime}'

                        if not no_tb:
                            # Create a summary writer instance for logging.
                            log_dir = f'runs/hp_search/{data}/'
                            log_path = log_dir + file_name
                            writer = SummaryWriter(log_path)
                        else:
                            writer = None

                        # Train the model (in val mode).
                        loss, acc, model, epoch, optimizer = train(
                            mode=mode, data=data, model_type=model_type, seed=seed,
                            device=device, batch_size=batch_size,
                            embedding_dim=emb_dim, hidden_dim=hid_dim,
                            num_layers=n_layers, bidirectional=bd,
                            dropout=dropout, batch_first=batch_first,
                            epochs=epochs, lr=lr_, clip_grad=clip_grad,
                            max_norm=max_norm,
                            early_stopping_patience=early_stopping_patience,
                            train_frac=train_frac, val_frac=val_frac,
                            test_frac=test_frac, subset_size=subset_size,
                            log_interval=log_interval, writer=writer,
                            train_dataset=train_dataset, val_dataset=val_dataset,
                            test_dataset=test_dataset, no_tb=no_tb,
                            w_loss=w_loss, w_sampling=w_sampling)

                        if not no_tb:
                            # Close the tensorboard summary writer.
                            writer.close()

                        # Update the metric logging dataframe and persist it to csv.
                        df.loc[0 if pd.isnull(df.index.max()) else df.index.max() + 1] = \
                            [lr_, emb_dim, hid_dim, n_layers, bd, acc, loss.item()]
                        df.to_csv(f'output/{data}_lstm_hp_search_metrics.csv', index=False)

                        # Update the best ...
                        if acc > best_metrics['acc']:
                            best_model_updates += 1
                            # ... metrics
                            best_metrics['acc'] = acc
                            best_metrics['loss'] = loss
                            best_epoch = epoch
                            # ... hyperparameters
                            best_hps['lr'] = lr_
                            best_hps['embedding_dim'] = emb_dim
                            best_hps['hidden_dim'] = hid_dim
                            best_hps['num_layers'] = n_layers
                            best_hps['bidirectional'] = bd
                            # ... model
                            best_model = deepcopy(model)
                            # ... optimizer
                            best_optimizer = deepcopy(optimizer)
                            # ... filename
                            best_file_name = file_name

                            # Delete the previous current-best model checkpoint file.
                            for filename in glob.glob(f"models/{data}/lstm/cur_best_*"):
                                os.remove(filename)

                            # Save the current best model checkpoint.
                            model_dir = f'models/{data}/lstm/'
                            Path(model_dir).mkdir(parents=True, exist_ok=True)
                            model_path = model_dir + 'cur_best_' + best_file_name + '.pt'
                            torch.save({
                                'epoch': best_epoch,
                                'model_state_dict': best_model.state_dict(),
                                'optimizer_state_dict': best_optimizer.state_dict(),
                                'loss': best_metrics['loss'],
                                'acc': best_metrics['acc']
                            }, model_path)

                            print("New current best model found.")
                            print(f'Current best hyperparameters: {best_hps}')
                            print(f'Current best model: {best_model}')
                            print(f'Current best metrics: {best_metrics}')

    # Save the best model checkpoint.
    model_dir = f'models/{data}/lstm/'
    Path(model_dir).mkdir(parents=True, exist_ok=True)
    model_path = model_dir + 'best_' + best_file_name + '.pt'
    torch.save({
        'epoch': best_epoch,
        'model_state_dict': best_model.state_dict(),
        'optimizer_state_dict': best_optimizer.state_dict(),
        'loss': best_metrics['loss'],
        'acc': best_metrics['acc']
    }, model_path)

    print("Finished hyperparameter search.")
    print(f'Best hyperparameters: {best_hps}')
    print(f'Best model: {best_model}')
    print(f'Best metrics: {best_metrics}')

    # Delete the now-redundant cur_best checkpoint.
    for filename in glob.glob(f"models/{data}/lstm/cur_best_*"):
        os.remove(filename)
    print(f"Best model updates: {best_model_updates}")
def train(seed, data, model_type, mode, device, batch_size, embedding_dim,
          hidden_dim, num_layers, bidirectional, dropout, batch_first, epochs,
          lr, clip_grad, max_norm, early_stopping_patience, train_frac,
          val_frac, test_frac, subset_size, log_interval, no_tb, w_loss,
          w_sampling, writer=None, train_dataset=None, val_dataset=None,
          test_dataset=None):
    if mode == 'train' or mode == 'test':
        # Set seed for reproducibility, on CPU or GPU based on availability.
        torch.manual_seed(seed) if device == 'cpu' else torch.cuda.manual_seed(seed)

        if data == 'bnc':
            data_path = 'data/bnc/bnc_subset_19_29_vs_50_plus_nfiles_0.csv'
        elif data == 'bnc_rb':
            data_path = 'data/bnc/bnc_subset_19_29_vs_50_plus_nfiles_0_rand_balanced.csv'
        else:
            data_path = 'data/blogs_kaggle/blogtext.csv'

        # Set starting time of the full training pipeline.
        start_time = datetime.now()

        # Set device.
        device = torch.device(device)
        print(f"Device: {device}")

        print("Starting data preprocessing ... ")
        data_prep_start = datetime.now()

        # Load data and create dataset instances.
        train_dataset, val_dataset, test_dataset = get_datasets(subset_size=subset_size,
                                                                file_path=data_path,
                                                                train_frac=train_frac,
                                                                val_frac=val_frac,
                                                                test_frac=test_frac,
                                                                seed=seed,
                                                                data=data,
                                                                model_type=model_type)
        print('-' * 91)
        print(f'Data preprocessing finished. Data prep took {datetime.now() - data_prep_start}.')
        print(31 * '-' + ' DATASET STATS AND BASELINES ' + '-' * 31)
        print(f'Number of classes: {train_dataset.num_classes}')
        print(f'Vocabulary size: {train_dataset.vocab_size}')
        print(f'Training set size: {len(train_dataset)}')
        print(f'Validation set size: {len(val_dataset)}')
        print(f'Test set size: {len(test_dataset)}')
        print(91 * '-')
        print('Baselines')
        print('Train')
        print(train_dataset.df['age_cat'].value_counts(normalize=True))
        print('Validation')
        print(val_dataset.df['age_cat'].value_counts(normalize=True))
        print('Test')
        print(test_dataset.df['age_cat'].value_counts(normalize=True))
        print('-' * 91)

    if w_sampling:
        # Apply weighted sampling for class imbalance.
        # Inspired by: https://towardsdatascience.com/address-class-imbalance-easily-with-pytorch-e2d4fa208627
        # TODO: isn't this a bit redundant? Doesn't torch.tensor(train_dataset.df['age_cat'], dtype=torch.long) do the same?
        all_label_ids = torch.tensor([label for label in train_dataset.df['age_cat']],
                                     dtype=torch.long)
        # Class weighting: weight each class by the inverse of its frequency.
        labels_unique, counts = np.unique(train_dataset.df['age_cat'], return_counts=True)
        print(f'Unique labels: {labels_unique}')
        class_weights = [sum(counts) / c for c in counts]  # [#class_0, #class_1, etc.]
        # Assign a weight to each input sample based on its class.
        sampler_weights = [class_weights[label] for label in train_dataset.df['age_cat']]
        sampler = WeightedRandomSampler(weights=sampler_weights,
                                        num_samples=len(train_dataset.df['age_cat']),
                                        replacement=True)
        # The sampler option is mutually exclusive with shuffle, so shuffle is off here.
        train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size,
                                  shuffle=False, collate_fn=PadSequence(),
                                  sampler=sampler)
    else:
        train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size,
                                  shuffle=True, collate_fn=PadSequence())

    val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size,
                            shuffle=False, collate_fn=PadSequence())
    test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size,
                             shuffle=False, collate_fn=PadSequence())

    if mode == 'train' or mode == 'val':
        if model_type == 'lstm':
            # Initialize the model.
            print("Initializing model ...")
            model = TextClassificationLSTM(batch_size=batch_size,
                                           vocab_size=train_dataset.vocab_size,
                                           embedding_dim=embedding_dim,
                                           hidden_dim=hidden_dim,
                                           num_classes=train_dataset.num_classes,
                                           num_layers=num_layers,
                                           bidirectional=bidirectional,
                                           dropout=dropout,
                                           device=device,
                                           batch_first=batch_first)
        elif model_type == 'bert':
            model = TextClassificationBERT()
    elif mode == 'test':
        if model_type == 'lstm':
            model, _, _, _, _ = load_saved_model(model_class=TextClassificationLSTM,
                                                 optimizer_class=optim.Adam,
                                                 lr=lr,
                                                 device=device,
                                                 batch_size=batch_size,
                                                 vocab_size=train_dataset.vocab_size,
                                                 embedding_dim=embedding_dim,
                                                 hidden_dim=hidden_dim,
                                                 num_classes=train_dataset.num_classes,
                                                 num_layers=num_layers,
                                                 bidirectional=bidirectional,
                                                 dropout=dropout,
                                                 batch_first=batch_first)
        elif model_type == 'bert':
            model = TextClassificationBERT()

    # Model to device.
    model.to(device)

    # Print model architecture.
    print('-' * 91)
    print("MODEL ARCHITECTURE:")
    print(model)
    print('-' * 91)

    if w_loss:
        # Apply frequency-based loss weighting for highly imbalanced data.
        n_samples = [train_dataset.df['age_cat'].value_counts()[label]
                     for label in range(train_dataset.num_classes)]
        weights = [1 - (x / sum(n_samples)) for x in n_samples]
        weights = torch.FloatTensor(weights).to(device)
        criterion = torch.nn.CrossEntropyLoss(weight=weights)  # combines LogSoftmax and NLLLoss
    else:
        criterion = torch.nn.CrossEntropyLoss()  # combines LogSoftmax and NLLLoss

    if mode == 'train' or mode == 'val':
        # Count trainable parameters.
        trainable_params = sum(param.numel() for param in model.parameters()
                               if param.requires_grad)
        print(f'The model has {trainable_params} trainable parameters.')

        # Set up the optimizer.
        if model_type == 'lstm':
            optimizer = optim.Adam(params=model.parameters(), lr=lr)
        elif model_type == 'bert':
            optimizer = optim.Adam(model.parameters(), lr=2e-5)  # TODO: change this back
        # Initialize the iteration counter.
        iterations = 0

        # Values for model selection.
        best_val_loss = torch.tensor(np.inf, device=device)
        best_val_accuracy = torch.tensor(-np.inf, device=device)
        best_epoch = None
        best_model = None

        # Initialize patience for early stopping.
        patience = 0

        # Metric histories.
        train_losses = []
        train_accs = []

        # Disable tqdm progress bars in train_one_epoch when in validation mode.
        disable_bars = mode == 'val'

        for epoch in range(epochs):
            epoch_start_time = datetime.now()
            try:
                # Set the model to training mode at the beginning of each epoch.
                model.train()
                iterations, train_losses, train_accs = train_one_epoch(
                    model=model, model_type=model_type, data_loader=train_loader,
                    criterion=criterion, optimizer=optimizer, device=device,
                    start_iteration=iterations, clip_grad=clip_grad,
                    max_norm=max_norm, log_interval=log_interval,
                    losses=train_losses, accs=train_accs, writer=writer,
                    disable_bars=disable_bars, epoch=epoch)
            except KeyboardInterrupt:
                print("Manually stopped current epoch")
                __import__('pdb').set_trace()

            val_loss, val_accuracy = evaluate_performance(model=model,
                                                          data_loader=val_loader,
                                                          device=device,
                                                          criterion=criterion,
                                                          writer=writer,
                                                          global_iteration=iterations,
                                                          print_metrics=False,
                                                          data=data)
            print('-' * 91)
            print('| end of epoch {:3d} | time: {} | '
                  'valid loss {:8.5f} | valid accuracy {:8.3f} '.format(
                      epoch + 1, (datetime.now() - epoch_start_time),
                      val_loss, val_accuracy))
            print('-' * 91)

            # Update the best performance, keyed on validation accuracy.
            if val_accuracy > best_val_accuracy:
                best_val_loss = val_loss
                best_val_accuracy = val_accuracy
                best_model = deepcopy(model)
                best_optimizer = deepcopy(optimizer)
                best_epoch = epoch + 1
                patience = 0
            else:
                patience += 1
                if patience >= early_stopping_patience:
                    print("EARLY STOPPING")
                    break

        # TODO: see this tutorial's prettier logging --
        # https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html
        print('-' * 91)
        print(f"Done training and validating. Best model from epoch {best_epoch}:")
        print(best_model)
        print('-' * 91)

    if mode == 'val':
        return best_val_loss, best_val_accuracy, best_model, best_epoch, best_optimizer
    elif mode == 'train':
        print("Starting testing...")
        _, _ = evaluate_performance(model=best_model, data_loader=test_loader,
                                    device=device, criterion=criterion,
                                    set='test', data=data, plot_cm=True)
    elif mode == 'test':
        print("Starting testing...")
        _, _ = evaluate_performance(model=model, data_loader=test_loader,
                                    device=device, criterion=criterion,
                                    set='test', data=data, plot_cm=True)
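# A minimal sketch of the PadSequence collate class assumed by the loaders
# above: pad a batch of variable-length token sequences and return the padded
# batch, the original lengths, and the labels. The (sequence, label) item
# layout is an assumption.
from torch.nn.utils.rnn import pad_sequence

class PadSequence:
    def __call__(self, batch):
        # Sort by descending length so packed-sequence RNNs can consume it directly.
        batch = sorted(batch, key=lambda item: len(item[0]), reverse=True)
        seqs, labels = zip(*batch)
        lengths = torch.tensor([len(s) for s in seqs])
        padded = pad_sequence(seqs, batch_first=True)
        return padded, lengths, torch.tensor(labels)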
parser.add_argument('--interval', type=int, default=1)
args = parser.parse_args()
os.environ['CUDA_VISIBLE_DEVICES'] = args.gpus
create_exp_dir(args)
mp.set_start_method("spawn")
population = mp.Queue(maxsize=args.B)
finish_tasks = mp.Queue(maxsize=args.B)
test_outputs = mp.Queue()
epoch = mp.Value('i', 0)
lock = mp.Lock()
resources = []
print('Using resources:')
# GPU 0 is reserved for the Optimizer process; workers share the remaining GPUs.
for i in range(1, len(args.gpus.split(','))):
    for j in range(args.num_per_gpu):
        resources.append(f'cuda:{i}')
        print(f'cuda:{i}')
datasets = get_datasets()
Processes = [Samples(datasets, epoch, lock, population, finish_tasks, resources[i], args)
             for i in range(len(resources))]
Processes.append(Optimizer(datasets, epoch, lock, population, finish_tasks, 'cuda:0', args))
[p.start() for p in Processes]
[p.join() for p in Processes]
def __init__(self, args):
    # Training configurations
    self.method = args.method
    self.dataset = args.dataset
    self.dim = args.dim
    self.lr = args.lr
    self.batch_size = args.batch_size
    self.val_batch_size = self.batch_size // 2
    self.iteration = args.iteration
    self.evaluation = args.evaluation
    self.show_iter = 1000
    self.update_epoch = 10
    self.balanced = args.balanced
    self.instances = args.instances
    self.cm = args.cm
    self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    self.file_name = '{}_{}_{}'.format(self.method, self.dataset, self.lr)
    print('========================================')
    print(json.dumps(vars(args), indent=2))
    print(self.file_name)

    # Paths
    self.root_dir = os.path.join('/', 'home', 'lyz')
    self.data_dir = os.path.join(self.root_dir, 'datasets', self.dataset)
    self.model_dir = self._get_path('./trained_model')
    self.code_dir = self._get_path(os.path.join('codes', self.dataset))
    self.fig_dir = self._get_path(os.path.join('fig', self.dataset, self.file_name))

    # Preparing data
    self.transforms = get_transform()
    self.datasets = get_datasets(dataset=self.dataset,
                                 data_dir=self.data_dir,
                                 transforms=self.transforms)
    self.cm_sampler = ClassMiningSampler(self.datasets['train'],
                                         batch_size=self.batch_size,
                                         n_instance=self.instances,
                                         balanced=self.balanced)
    self.data_loaders = get_data_loaders(
        datasets=self.datasets,
        batch_size=self.batch_size,
        val_batch_size=self.val_batch_size,
        n_instance=self.instances,
        balanced=self.balanced,
        cm=self.cm_sampler if self.cm else None)
    self.dataset_sizes = {x: len(self.datasets[x]) for x in ['train', 'test']}

    # Set up the model; the embedding head gets a 10x learning rate.
    self.model = get_model(self.device, self.dim)
    self.optimizer = optim.SGD(
        [{'params': self.model.google_net.parameters()},
         {'params': self.model.linear.parameters(),
          'lr': self.lr * 10, 'momentum': 0.9}],
        lr=self.lr,
        momentum=0.9)
    self.scheduler = lr_scheduler.StepLR(self.optimizer, step_size=2000, gamma=0.5)
def main(): print "initial model generator" with tf.Graph().as_default(): train_sets = dataset.get_datasets(main_path, EPIWidth, disp_precision, 'train') test_sets = dataset.get_datasets(main_path, EPIWidth, disp_precision, 'test') global_step = tf.Variable(0, trainable=False) images_placeholder_v = tf.placeholder(tf.float32, shape=(None, 9, EPIWidth, 1)) images_placeholder_u = tf.placeholder(tf.float32, shape=(None, 9, EPIWidth, 1)) labels_placeholder = tf.placeholder(tf.int32, shape=None) prop_placeholder = tf.placeholder('float') phase_train = tf.placeholder(tf.bool, name='phase_train') logits = network.inference_ds(images_placeholder_u, images_placeholder_v, prop_placeholder, phase_train, disp_precision) logits_softmax = network.softmax(logits) loss = network.loss(logits_softmax, labels_placeholder) train_op = network.training(loss, 1e-4, global_step) eval = network.evaluation(logits_softmax) summary = tf.summary.merge_all() saver = tf.train.Saver(tf.global_variables()) gpu_option = tf.GPUOptions(per_process_gpu_memory_fraction=0.5) sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_option)) summary_writer = tf.summary.FileWriter(summary_path, sess.graph) sess.run(tf.global_variables_initializer()) ckpt = tf.train.get_checkpoint_state(checkpoint_path) if ckpt: # saver.restore(sess,checkpoint_path+'/model.ckpt')#利用不同平台的训练结果 # saver.restore(sess, ckpt.model_checkpoint_path) # 本地训练的结果 print("restore from checkpoint!") else: print("no checkpoint found!") start_time = time.time() step = 0 while not train_sets.complete: feed_dict = fill_feed_dict(train_sets, images_placeholder_u, images_placeholder_v, labels_placeholder, prop_placeholder, phase_train, 'train') _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict) duration = time.time() - start_time if step % 1000 == 0: print('Step:%d: loss = %.2f (%.3f sec)' % (step, loss_value, duration)) summary_str = sess.run(summary, feed_dict=feed_dict) summary_writer.add_summary(summary_str, step) summary_writer.flush() if step % 25000 == 24999: print('test Data Eval:') do_eval_true(sess, eval, logits_softmax, images_placeholder_u, images_placeholder_v, prop_placeholder, phase_train, test_sets) if step % 50000 == 49999: saver.save(sess, checkpoint_path + '/model.ckpt', global_step=step)
import dataset

ds = dataset.get_datasets('/home/fernand/math/data', 'test')
batch = [ds[0], ds[1]]
print(dataset.collate_data(batch))
def main(arch: str, image_folder: str, batch_size: Optional[int] = None,
         from_model: Optional[str] = None, grad_accu: int = 1,
         steps: Optional[int] = None, num_gpus: int = 1, epochs: int = 1,
         lr: float = 4e-4):
    pl.seed_everything(int(os.environ.get("SEED", 738)))
    if arch.startswith("BiT"):
        base_model = BIT_MODELS[arch](head_size=-1)
        print("Loading pretrained model...")
        base_model.load_from(np.load(f"cache/pretrained/{arch}.npz"))
        net_final_size = base_model.width_factor * 2048
    else:
        raise ValueError(f"arch '{arch}' not supported")
    train_ds, valid_ds = get_datasets(image_folder, val_ratio=0.05)
    # Freeze the backbone for the first-stage warm-up.
    set_trainable(base_model, False)
    model = FirstStageLearner(base_model, train_ds, valid_ds, epochs, lr,
                              num_gpus=num_gpus,
                              batch_size=batch_size if batch_size else 4,
                              image_size=IMAGE_SIZE,
                              projection_size=256,
                              projection_hidden_size=4096,
                              net_final_size=net_final_size,
                              use_momentum=False)
    # The two Trainer configurations differ only in the stopping criterion.
    trainer_kwargs = dict(
        accelerator='ddp' if num_gpus > 1 else None,
        amp_backend="apex",
        amp_level='O2',
        precision=16,
        gpus=num_gpus,
        val_check_interval=0.5,
        gradient_clip_val=10,
        accumulate_grad_batches=grad_accu,
        auto_scale_batch_size='power' if batch_size is None else None)
    if steps:
        trainer = pl.Trainer(max_steps=steps, **trainer_kwargs)
    else:
        trainer = pl.Trainer(max_epochs=epochs, **trainer_kwargs)
    if batch_size is None:
        trainer.tune(model)
    trainer.fit(model)
    # Only rank 0 saves the checkpoint under DDP.
    if num_gpus == 1 or torch.distributed.get_rank() == 0:
        torch.save({
            "online_encoder_proj": model.learner.online_encoder.projector.state_dict(),
            "online_encoder_net": model.learner.online_encoder.net.state_dict(),
            "online_predictor": model.learner.online_predictor.state_dict(),
            "target_encoder_net": model.learner.target_encoder.net.state_dict(),
            "target_encoder_proj": model.learner.target_encoder.projector.state_dict(),
            "config": {"arch": arch}
        }, f"cache/byol_{arch}_warmed_up.pth")
        print("Model saved")
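# A minimal sketch of the set_trainable helper used above: freeze or unfreeze
# every parameter of a module. This implementation is an assumption, but the
# call site only requires toggling requires_grad.
def set_trainable(module, trainable):
    for p in module.parameters():
        p.requires_grad = trainable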