def main(): """ Main method. """ args = PARSER.parse_known_args()[0] # sets up the backend for distributed training (optional) device, local_rank = setup(distributed=args.distributed) # retrieve the dataloaders for the chosen dataset dataloaders, args = get_dataloaders(args) # make dirs for current experiment logs, summaries etc args = experiment_config(args) # initialise the model model = resnet.resnet20(args) # place model onto GPU(s) if args.distributed: torch.cuda.set_device(device) torch.set_num_threads(5) # n cpu threads / n processes per node model = DistributedDataParallel(model.cuda(), device_ids=[local_rank], output_device=local_rank) # only print stuff from process (rank) 0 args.print_progress = True if int( os.environ.get('RANK')) == 0 else False else: if args.half_precision: model.half() # convert to half precision for layer in model.modules(): # keep batchnorm in 32 for convergence reasons if isinstance(layer, nn.BatchNorm2d): layer.float() if torch.cuda.device_count() > 1: model = nn.DataParallel(model) print('\nUsing', torch.cuda.device_count(), 'GPU(s).\n') model.to(device) args.print_progress = True if args.print_progress: print_network(model, args) # prints out the network architecture etc logging.info('\ntrain: {} - valid: {} - test: {}'.format( len(dataloaders['train'].dataset), len(dataloaders['valid'].dataset), len(dataloaders['test'].dataset))) # launch model training or inference if not args.inference: train(model, dataloaders, args) if args.distributed: # cleanup torch.distributed.destroy_process_group() else: model.load_state_dict(torch.load(args.load_checkpoint_dir)) test_loss, test_acc = evaluate(model, args, dataloaders['test']) print('[Test] loss {:.4f} - acc {:.4f} - acc_topk {:.4f}'.format( test_loss, test_acc[0], test_acc[1]))
def train(args):
    Arguments.save_args(args, args.args_path)
    train_loader, val_loader, _ = get_dataloaders(args)
    model = UNetVgg16(n_classes=args.n_classes).to(args.device)
    optimizer = get_optimizer(args.optimizer, model)
    lr_scheduler = LRScheduler(args.lr_scheduler, optimizer)
    criterion = get_loss_fn(args.loss_type, args.ignore_index).to(args.device)
    model_saver = ModelSaver(args.model_path)
    recorder = Recorder(['train_miou', 'train_acc', 'train_loss',
                         'val_miou', 'val_acc', 'val_loss'])

    for epoch in range(args.n_epochs):
        print(f"{args.experim_name} Epoch {epoch + 1}:")
        train_loss, train_acc, train_miou, train_ious = train_epoch(
            model=model,
            dataloader=train_loader,
            n_classes=args.n_classes,
            optimizer=optimizer,
            lr_scheduler=lr_scheduler,
            criterion=criterion,
            device=args.device,
        )
        print(f"train | mIoU: {train_miou:.3f} | accuracy: {train_acc:.3f} | loss: {train_loss:.3f}")

        val_loss, val_scores = eval_epoch(
            model=model,
            dataloader=val_loader,
            n_classes=args.n_classes,
            criterion=criterion,
            device=args.device,
        )
        val_miou, val_ious, val_acc = val_scores['mIoU'], val_scores['IoUs'], val_scores['accuracy']
        print(f"valid | mIoU: {val_miou:.3f} | accuracy: {val_acc:.3f} | loss: {val_loss:.3f}")

        recorder.update([train_miou, train_acc, train_loss,
                         val_miou, val_acc, val_loss])
        recorder.save(args.record_path)

        # track either a per-class IoU (e.g. "IoU_1") or the mean IoU
        if args.metric.startswith("IoU"):
            metric = val_ious[int(args.metric.split('_')[1])]
        else:
            metric = val_miou
        model_saver.save_models(metric, epoch + 1, model,
                                ious={'train': train_ious, 'val': val_ious})

    print(f"best model at epoch {model_saver.best_epoch} with miou {model_saver.best_score:.5f}")
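# `ModelSaver` and `Recorder` are project helpers not shown here. A minimal
# sketch of the checkpointing behaviour the training loop relies on; the class
# body is hypothetical, only the method and attribute names come from the call
# sites above, and the saved dict layout matches what evaluate() below loads
# via checkpoint['model_state_dict'].
import torch

class ModelSaver:
    def __init__(self, model_path):
        self.model_path = model_path
        self.best_score = float('-inf')
        self.best_epoch = 0

    def save_models(self, metric, epoch, model, ious=None):
        # keep only the checkpoint with the best validation metric so far
        if metric > self.best_score:
            self.best_score = metric
            self.best_epoch = epoch
            torch.save({'model_state_dict': model.state_dict(),
                        'epoch': epoch,
                        'ious': ious}, self.model_path)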
def evaluate(args, mode, save_pred=False):
    _, val_loader, test_loader = get_dataloaders(args)
    if mode == 'val':
        dataloader = val_loader
    elif mode == 'test':
        dataloader = test_loader
    else:
        raise ValueError(f"{mode} not supported. Choose from 'val' or 'test'")

    model = UNetVgg16(n_classes=args.n_classes).to(args.device)
    model.load_state_dict(torch.load(args.model_path)['model_state_dict'],
                          strict=False)
    criterion = get_loss_fn(args.loss_type, args.ignore_index).to(args.device)

    eval_loss, scores = eval_epoch(model=model,
                                   dataloader=dataloader,
                                   n_classes=args.n_classes,
                                   criterion=criterion,
                                   device=args.device,
                                   pred_dir=save_pred and args.pred_dir)
    miou, acc = scores['mIoU'], scores['accuracy']
    print(f"{mode} | mIoU: {miou:.3f} | accuracy: {acc:.3f} | loss: {eval_loss:.3f}")
    return scores
import sys
import argparse

import torch
from torch import nn

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

if __name__ == '__main__':
    parser = argparse.ArgumentParser('nn models for inverse design')
    parser.add_argument('--model', type=str, default='inn')
    args = parser.parse_args()

    train_loader, val_loader, test_loader = get_dataloaders(args.model)
    configs = get_configs(args.model)

    if args.model in ['forward_model', 'inverse_model']:
        model = MLP(configs['input_dim'], configs['output_dim']).to(DEVICE)
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=configs['learning_rate'],
                                     weight_decay=configs['weight_decay'])
    elif args.model in ['tandem_net']:
        forward_model = MLP(4, 3).to(DEVICE)
        forward_model.load_state_dict(
            torch.load('./models/forward_model.pth')['model_state_dict'])
        inverse_model = MLP(3, 4).to(DEVICE)
        inverse_model.load_state_dict(
            torch.load('./models/inverse_model.pth')['model_state_dict'])
        model = TandemNet(forward_model, inverse_model)
        optimizer = torch.optim.Adam(model.inverse_model.parameters(),
                                     lr=configs['learning_rate'],
                                     weight_decay=configs['weight_decay'])
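# `TandemNet` is not defined in this excerpt. A sketch of the usual tandem
# architecture for inverse design, consistent with the optimizer above (which
# updates only model.inverse_model.parameters()): the frozen forward model
# re-predicts the response from the proposed design, so training can minimise
# the error against the original target response. Hypothetical implementation.
from torch import nn

class TandemNet(nn.Module):
    def __init__(self, forward_model, inverse_model):
        super().__init__()
        self.forward_model = forward_model
        self.inverse_model = inverse_model
        for p in self.forward_model.parameters():
            p.requires_grad = False  # only the inverse model is trained

    def forward(self, target_response):
        design = self.inverse_model(target_response)     # propose a design
        predicted_response = self.forward_model(design)  # validate the design
        return design, predicted_response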
from datasets import get_dataloaders
from cfg import Config

cfg = Config()
train_loader, val_loader, test_loader = get_dataloaders(cfg)
def main(): """ Main """ # Arguments args = parser.parse_args() # Setup Distributed Training device, local_rank = setup(distributed=args.distributed) # Get Dataloaders for Dataset of choice dataloaders, args = get_dataloaders(args) # Setup logging, saving models, summaries args = experiment_config(parser, args) # Get available models from /model/network.py model_names = sorted(name for name in models.__dict__ if name.islower() and not name.startswith("__") and callable(models.__dict__[name])) # If model exists if any(args.model in model_name for model_name in model_names): # Load model base_encoder = getattr(models, args.model)( args, num_classes=args.n_classes) # Encoder proj_head = models.projection_MLP(args) sup_head = models.Sup_Head(args) else: raise NotImplementedError("Model Not Implemented: {}".format( args.model)) # Remove last FC layer from resnet base_encoder.fc = nn.Sequential() # Place model onto GPU(s) if args.distributed: torch.cuda.set_device(device) torch.set_num_threads(6) # n cpu threads / n processes per node base_encoder = DistributedDataParallel(base_encoder.cuda(), device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True, broadcast_buffers=False) proj_head = DistributedDataParallel(proj_head.cuda(), device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True, broadcast_buffers=False) sup_head = DistributedDataParallel(sup_head.cuda(), device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True, broadcast_buffers=False) # Only print from process (rank) 0 args.print_progress = True if int( os.environ.get('RANK')) == 0 else False else: # If non Distributed use DataParallel if torch.cuda.device_count() > 1: base_encoder = nn.DataParallel(base_encoder) proj_head = nn.DataParallel(proj_head) sup_head = nn.DataParallel(sup_head) print('\nUsing', torch.cuda.device_count(), 'GPU(s).\n') base_encoder.to(device) proj_head.to(device) sup_head.to(device) args.print_progress = True # Print Network Structure and Params if args.print_progress: print_network(base_encoder, args) # prints out the network architecture etc logging.info('\npretrain/train: {} - valid: {} - test: {}'.format( len(dataloaders['train'].dataset), len(dataloaders['valid'].dataset), len(dataloaders['test'].dataset))) # launch model training or inference if not args.finetune: ''' Pretraining / Finetuning / Evaluate ''' if not args.supervised: # Pretrain the encoder and projection head proj_head.apply(init_weights) pretrain(base_encoder, proj_head, dataloaders, args) else: supervised(base_encoder, sup_head, dataloaders, args) print("\n\nLoading the model: {}\n\n".format(args.load_checkpoint_dir)) # Load the pretrained model checkpoint = torch.load(args.load_checkpoint_dir) # Load the encoder parameters base_encoder.load_state_dict(checkpoint['encoder']) # Initalize weights of the supervised / classification head sup_head.apply(init_weights) # Supervised Finetuning of the supervised classification head finetune(base_encoder, sup_head, dataloaders, args) # Evaluate the pretrained model and trained supervised head test_loss, test_acc, test_acc_top5 = evaluate(base_encoder, sup_head, dataloaders, 'test', args.finetune_epochs, args) print('[Test] loss {:.4f} - acc {:.4f} - acc_top5 {:.4f}'.format( test_loss, test_acc, test_acc_top5)) if args.distributed: # cleanup torch.distributed.destroy_process_group() else: ''' Finetuning / Evaluate ''' # Do not Pretrain, just finetune and inference print("\n\nLoading the model: {}\n\n".format(args.load_checkpoint_dir)) # 
Load the pretrained model checkpoint = torch.load(args.load_checkpoint_dir) # Load the encoder parameters base_encoder.load_state_dict(checkpoint['encoder']) # .cuda() # Initalize weights of the supervised / classification head sup_head.apply(init_weights) # Supervised Finetuning of the supervised classification head finetune(base_encoder, sup_head, dataloaders, args) # Evaluate the pretrained model and trained supervised head test_loss, test_acc, test_acc_top5 = evaluate(base_encoder, sup_head, dataloaders, 'test', args.finetune_epochs, args) print('[Test] loss {:.4f} - acc {:.4f} - acc_top5 {:.4f}'.format( test_loss, test_acc, test_acc_top5)) if args.distributed: # cleanup torch.distributed.destroy_process_group()
if __name__ == "__main__": # parse args parser = util.get_train_parser() args = parser.parse_args() if args.seed is not None: if args.device == 'cuda': use_cuda=True elif args.device == 'cpu': use_cuda=False util.random_seed(seed_value=args.seed, use_cuda=use_cuda) # load in data generator dataloaders = datasets.get_dataloaders(args.country, args.dataset, args) # load in model model = models.get_model(**vars(args)) if args.model_name in DL_MODELS: print('Total trainable model parameters: {}'.format(sum(p.numel() for p in model.parameters() if p.requires_grad))) if args.model_path is not None: model.load_state_dict(torch.load(args.model_path)) if args.model_name in DL_MODELS and args.device == 'cuda' and torch.cuda.is_available(): model.to(args.device) if args.name is None: args.name = str(datetime.datetime.now()) + "_" + args.model_name
hps[hp] = []  # tail of earlier setup: one empty list per tracked hyperparameter

experiments = {}
# for some number of iterations
for sample_no in range(search_range.num_samples):
    # build argparse args by parsing args and then setting empty fields to specified ones above
    train_parser = util.get_train_parser()
    train_args = train_parser.parse_args([
        '--model_name', search_range.model_name,
        '--dataset', search_range.dataset,
        '--env_name', search_range.env_name,
        '--country', search_range.country
    ])
    generate_hps(train_args, search_range)
    train_args.epochs = search_range.epochs

    dataloaders = datasets.get_dataloaders(train_args.country,
                                           train_args.dataset, train_args)
    model = models.get_model(**vars(train_args))
    model.to(train_args.device)

    experiment_name = (f"model:{train_args.model_name}"
                       f"_dataset:{train_args.dataset}"
                       f"_epochs:{search_range.epochs}"
                       f"_sample_no:{sample_no}")
    train_args.name = experiment_name

    print("=" * 100)
    print(f"TRAINING: {experiment_name}")
    for hp in hps:
        print(hp, train_args.__dict__[hp])

    try:
        train.train(model, train_args.model_name, train_args,
                    dataloaders=dataloaders)
    except Exception as e:  # (assumed handler) keep the sweep alive if one run fails
        print(e)
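# `generate_hps` is a helper not shown in this excerpt. A hypothetical sketch
# of what it presumably does: overwrite fields of train_args with randomly
# sampled values. The `search_range.hps` mapping of names to (low, high)
# bounds is a guess from the call site, not the actual interface.
import math
import random

def generate_hps(train_args, search_range):
    for hp, (low, high) in search_range.hps.items():
        # log-uniform sampling, common for learning rates and weight decay
        value = 10 ** random.uniform(math.log10(low), math.log10(high))
        setattr(train_args, hp, value)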
def main(): """ Main """ # Arguments args = parser.parse_args() # Setup Distributed Training device, local_rank = setup(distributed=args.distributed) # Get Dataloaders for Dataset of choice dataloaders, args = get_dataloaders(args) # Setup logging, saving models, summaries args = experiment_config(parser, args) ''' Base Encoder ''' # Get available models from /model/network.py model_names = sorted(name for name in models.__dict__ if name.islower() and not name.startswith("__") and callable(models.__dict__[name])) # If model exists if any(args.model in model_name for model_name in model_names): # Load model base_encoder = getattr(models, args.model)( args, num_classes=args.n_classes) # Encoder else: raise NotImplementedError("Model Not Implemented: {}".format( args.model)) if not args.supervised: # freeze all layers but the last fc for name, param in base_encoder.named_parameters(): if name not in ['fc.weight', 'fc.bias']: param.requires_grad = False # init the fc layer init_weights(base_encoder) ''' MoCo Model ''' moco = MoCo_Model(args, queue_size=args.queue_size, momentum=args.queue_momentum, temperature=args.temperature) # Place model onto GPU(s) if args.distributed: torch.cuda.set_device(device) torch.set_num_threads(6) # n cpu threads / n processes per node moco = DistributedDataParallel(moco.cuda(), device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True, broadcast_buffers=False) base_encoder = DistributedDataParallel(base_encoder.cuda(), device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True, broadcast_buffers=False) # Only print from process (rank) 0 args.print_progress = True if int( os.environ.get('RANK')) == 0 else False else: # If non Distributed use DataParallel if torch.cuda.device_count() > 1: moco = nn.DataParallel(moco) base_encoder = nn.DataParallel(base_encoder) print('\nUsing', torch.cuda.device_count(), 'GPU(s).\n') moco.to(device) base_encoder.to(device) args.print_progress = True # Print Network Structure and Params if args.print_progress: print_network(moco, args) # prints out the network architecture etc logging.info('\npretrain/train: {} - valid: {} - test: {}'.format( len(dataloaders['train'].dataset), len(dataloaders['valid'].dataset), len(dataloaders['test'].dataset))) # launch model training or inference if not args.finetune: ''' Pretraining / Finetuning / Evaluate ''' if not args.supervised: # Pretrain the encoder and projection head pretrain(moco, dataloaders, args) # Load the state_dict from query encoder and load it on finetune net base_encoder = load_moco(base_encoder, args) else: supervised(base_encoder, dataloaders, args) # Load the state_dict from query encoder and load it on finetune net base_encoder = load_sup(base_encoder, args) # Supervised Finetuning of the supervised classification head finetune(base_encoder, dataloaders, args) # Evaluate the pretrained model and trained supervised head test_loss, test_acc, test_acc_top5 = evaluate(base_encoder, dataloaders, 'test', args.finetune_epochs, args) print('[Test] loss {:.4f} - acc {:.4f} - acc_top5 {:.4f}'.format( test_loss, test_acc, test_acc_top5)) if args.distributed: # cleanup torch.distributed.destroy_process_group() else: ''' Finetuning / Evaluate ''' # Do not Pretrain, just finetune and inference # Load the state_dict from query encoder and load it on finetune net base_encoder = load_moco(base_encoder, args) # Supervised Finetuning of the supervised classification head finetune(base_encoder, dataloaders, args) # Evaluate the pretrained model and 
trained supervised head test_loss, test_acc, test_acc_top5 = evaluate(base_encoder, dataloaders, 'test', args.finetune_epochs, args) print('[Test] loss {:.4f} - acc {:.4f} - acc_top5 {:.4f}'.format( test_loss, test_acc, test_acc_top5)) if args.distributed: # cleanup torch.distributed.destroy_process_group()
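# `load_moco` is not shown. In MoCo-style code it typically copies the query
# encoder's weights out of the pretraining checkpoint into the plain encoder
# used for finetuning. A hedged sketch, assuming the checkpoint stores the
# MoCo state_dict under 'moco' with query-encoder keys prefixed 'encoder_q.':
import torch

def load_moco(base_encoder, args):
    checkpoint = torch.load(args.load_checkpoint_dir, map_location='cpu')
    encoder_state = {k.replace('encoder_q.', ''): v
                     for k, v in checkpoint['moco'].items()
                     if k.startswith('encoder_q.')}
    base_encoder.load_state_dict(encoder_state, strict=False)
    return base_encoder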
def main(): """ Main method. """ args = PARSER.parse_known_args()[0] if args.extract_representation == True: if args.inference == False: print( 'Error, to extract_representation, add "--inference" to the program call' ) return if args.distributed == False: if args.visible_gpus != 'all': os.environ['CUDA_VISIBLE_DEVICES'] = str(args.visible_gpus) # sets up the backend for distributed training (optional) device, local_rank = setup(distributed=args.distributed) # retrieve the dataloaders for the chosen dataset dataloaders, args = get_dataloaders(args) # make dirs for current experiment logs, summaries etc args = experiment_config(args) # initialise the model # model = resnet.resnet32(args) model = models4finetuning.initialize_model( model_name=args.use_net, num_classes=2, feature_extract=args.as_feature_extractor, use_pretrained=args.use_pretrained) ## pretrained models available [resnet, alexnet, vgg, squeezenet, densenet, inception] # place model onto GPU(s) if args.distributed: torch.cuda.set_device(device) torch.set_num_threads(1) # n cpu threads / n processes per node model = DistributedDataParallel(model.cuda(), device_ids=[local_rank], output_device=local_rank) # only print stuff from process (rank) 0 args.print_progress = True if int( os.environ.get('RANK')) == 0 else False else: if args.half_precision: model.half() # convert to half precision for layer in model.modules(): # keep batchnorm in 32 for convergence reasons if isinstance(layer, nn.BatchNorm2d): layer.float() if torch.cuda.device_count() > 1: model = nn.DataParallel(model) print('\nUsing', torch.cuda.device_count(), 'GPU(s).\n') model.to(device) args.print_progress = True if args.print_progress: print_network(model, args) # prints out the network architecture etc logging.info('\ntrain: {} - valid: {} - test: {}'.format( len(dataloaders['train'].dataset), len(dataloaders['valid'].dataset), len(dataloaders['test'].dataset))) # launch model training or inference if not args.inference: train(model, dataloaders, args) if args.distributed: # cleanup torch.distributed.destroy_process_group() else: model.load_state_dict(torch.load(args.load_checkpoint_dir)) if args.extract_representation == True: test_loss, test_acc, test_sn, test_sp, test_ppv, test_f1score, all_intermediate_features, all_labels, all_predictions, all_output_activations = evaluate( model, dataloaders['test'], args) name_to_save = '/'.join( args.summaries_dir.split('/')[:-1]) + '/extracted_features.mat' savemat( name_to_save, { 'features': all_intermediate_features, 'labels': all_labels, 'predictions': all_predictions, 'activations': all_output_activations }) else: test_loss, test_acc, test_sn, test_sp, test_ppv, test_f1score = evaluate( model, dataloaders['test'], args) logging.info( f'[Test] loss {test_loss:.4f} - acc: {test_acc[0]:.4f} - sn: {test_sn:.4f} - sp: {test_sp:.4f} - ppv: {test_ppv:.4f} - F1: {test_f1score:.4f}' )