def train_loop(hp, logger, writer):
    # make dataloader
    logger.info("Making train dataloader...")
    train_loader = create_dataloader(hp, DataloaderMode.train)
    logger.info("Making test dataloader...")
    test_loader = create_dataloader(hp, DataloaderMode.test)

    # init Model
    net_arch = Net_arch(hp)
    loss_f = torch.nn.MSELoss()
    model = Model(hp, net_arch, loss_f)

    if hp.load.resume_state_path is not None:
        model.load_training_state(logger)
    else:
        logger.info("Starting new training run.")

    try:
        for model.epoch in itertools.count(model.epoch + 1):
            if model.epoch > hp.train.num_iter:
                break
            train_model(hp, model, train_loader, writer, logger)
            if model.epoch % hp.log.chkpt_interval == 0:
                model.save_network(logger)
                model.save_training_state(logger)
            test_model(hp, model, test_loader, writer)
        logger.info("End of Train")
    except Exception as e:
        logger.info("Exiting due to exception: %s" % e)
        traceback.print_exc()
def train_model():
    print('#### Start Training ####')
    data = np.load(data_dirc + 'raw_data.npy')
    train_data, train_label, val_data, val_label = create_data(
        data, RAW_LABELS, PERMUTATION, RATIO, PREPROCESS,
        MAX_SENTENCE_LENGTH, AUGMENTED, PADDING)
    train_dataset = torch.utils.data.TensorDataset(train_data, train_label)
    train_loader = torch.utils.data.DataLoader(
        dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_dataset = torch.utils.data.TensorDataset(val_data, val_label)
    val_loader = torch.utils.data.DataLoader(
        dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)

    file_name = 'best_model'
    model = CNN(num_classes=4)
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model = model.to(device)

    # Criterion and Optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    best_acc = 0.0
    for epoch in range(NUM_EPOCHS):
        train_loss = 0.0
        for i, (data, labels) in enumerate(train_loader):
            model.train()
            data_batch, label_batch = data.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(data_batch)
            loss = criterion(outputs, label_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # validate
        val_acc, val_F1 = cal_F1(val_loader, model)
        if val_acc > best_acc:
            best_acc = val_acc
            best_F1 = val_F1
            torch.save(model.state_dict(), 'saved_model/' + file_name + '.pth')
        train_acc = test_model(train_loader, model)
        train_loss /= len(train_loader.sampler)
        print('Epoch: [{}/{}], Step: [{}/{}], Val Acc: {}, Val F1: {}, Train Acc: {}, Train Loss: {}'.format(
            epoch + 1, NUM_EPOCHS, i + 1, len(train_loader),
            val_acc, val_F1, train_acc, train_loss))
        sys.stdout.flush()

    print('#### End Training ####')
    print('best val acc:', best_acc)
    print('best F1:', best_F1)
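# The validation step above relies on a cal_F1 helper that is not shown in this file.
# A minimal sketch of what such a helper might look like, assuming 4-class logits, the
# same global `device`, and scikit-learn's f1_score with macro averaging; the real
# project helper may differ.
def cal_F1(loader, model):
    from sklearn.metrics import f1_score
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for data, labels in loader:
            outputs = model(data.to(device))
            all_preds.append(outputs.argmax(dim=1).cpu())
            all_labels.append(labels)
    all_preds = torch.cat(all_preds)
    all_labels = torch.cat(all_labels)
    acc = (all_preds == all_labels).float().mean().item()
    f1 = f1_score(all_labels.numpy(), all_preds.numpy(), average='macro')
    return acc, f1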
def train_loop(rank, hp, world_size=1):
    # reload hp
    hp = DotDict(hp)
    if hp.model.device.lower() == "cuda" and world_size != 0:
        setup(hp, rank, world_size)

    if rank != 0:
        logger = None
        writer = None
    else:
        # set logger
        logger = make_logger(hp)
        # set writer (tensorboard / wandb)
        writer = Writer(hp, hp.log.log_dir)
        hp_str = yaml.dump(hp.to_dict())
        logger.info("Config:")
        logger.info(hp_str)
        if hp.data.train_dir == "" or hp.data.test_dir == "":
            logger.error("train or test data directory cannot be empty.")
            raise Exception("Please specify directories of data")
        logger.info("Set up train process")

    if hp.model.device.lower() == "cuda" and world_size != 0:
        hp.model.device = rank
        torch.cuda.set_device(rank)
    else:
        hp.model.device = hp.model.device.lower()

    # make dataloader
    if logger is not None:
        logger.info("Making train dataloader...")
    train_loader = create_dataloader(hp, DataloaderMode.train, rank, world_size)
    if logger is not None:
        logger.info("Making test dataloader...")
    test_loader = create_dataloader(hp, DataloaderMode.test, rank, world_size)

    # init Model
    net_arch = Net_arch(hp)
    loss_f = torch.nn.MSELoss()
    model = Model(hp, net_arch, loss_f, rank, world_size)

    # load training state
    if hp.load.resume_state_path is not None:
        model.load_training_state(logger)
    else:
        if logger is not None:
            logger.info("Starting new training run.")

    try:
        epoch_step = 1 if hp.data.divide_dataset_per_gpu else world_size
        for model.epoch in itertools.count(model.epoch + 1, epoch_step):
            if model.epoch > hp.train.num_iter:
                break
            train_model(hp, model, train_loader, writer, logger)
            if model.epoch % hp.log.chkpt_interval == 0:
                model.save_network(logger)
                model.save_training_state(logger)
            test_model(hp, model, test_loader, writer)
        cleanup()
        if logger is not None:
            logger.info("End of Train")
    except Exception as e:
        if logger is not None:
            logger.info("Exiting due to exception: %s" % e)
        traceback.print_exc()
        cleanup()
import torch
import torch.nn.utils.prune as prune

from utils.load_model import load_model
from utils.load_data import load_data
from utils.test_model import test_model
from utils.visualize import imshow, visualize_model

dataloaders, dataset_sizes, class_names = load_data('../data')
model = load_model('resnet18_01')
# test_model(model, dataloaders['test'])

# apply L1-unstructured pruning to the weights of every Conv2d / Linear layer
for name, module in model.named_modules():
    if isinstance(module, torch.nn.Conv2d):
        prune.l1_unstructured(module, name='weight', amount=0.7)
        # prune.l1_unstructured(module, name='bias', amount=0.2)
    elif isinstance(module, torch.nn.Linear):
        prune.l1_unstructured(module, name='weight', amount=0.4)
        # prune.l1_unstructured(module, name='bias', amount=0.4)

# make the pruning permanent by removing the re-parametrization
for name, module in model.named_modules():
    if isinstance(module, (torch.nn.Conv2d, torch.nn.Linear)):
        prune.remove(module, 'weight')

test_model(model, dataloaders['test'])
torch.save(model.state_dict(), '../models/resnet18_01_pruned')
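# Optional follow-up (not part of the original script): after prune.remove() the pruned
# zeros are baked into the dense weight tensors, so the overall sparsity reached by the
# per-layer amounts above can be inspected directly.
total_zero, total_elem = 0, 0
for name, module in model.named_modules():
    if isinstance(module, (torch.nn.Conv2d, torch.nn.Linear)):
        total_zero += int(torch.sum(module.weight == 0))
        total_elem += module.weight.nelement()
print('Global sparsity: {:.2f}%'.format(100.0 * total_zero / total_elem))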
def train_loop(rank, cfg):
    logger = get_logger(cfg, os.path.basename(__file__))
    if cfg.device == "cuda" and cfg.dist.gpus != 0:
        cfg.device = rank
        # turn off background generator when distributed run is on
        cfg.data.use_background_generator = False
        setup(cfg, rank)
        torch.cuda.set_device(cfg.device)

    # setup writer
    writer = None  # only the logging process creates a writer
    if is_logging_process():
        # set log/checkpoint dir
        os.makedirs(cfg.log.chkpt_dir, exist_ok=True)
        # set writer (tensorboard / wandb)
        writer = Writer(cfg, "tensorboard")
        cfg_str = OmegaConf.to_yaml(cfg)
        logger.info("Config:\n" + cfg_str)
        if cfg.data.train_dir == "" or cfg.data.test_dir == "":
            logger.error("train or test data directory cannot be empty.")
            raise Exception("Please specify directories of data")
        logger.info("Set up train process")
        logger.info("BackgroundGenerator is turned off when Distributed running is on")

    # download MNIST dataset before making dataloader
    # TODO: This is example code. You should change this part as you need
    _ = torchvision.datasets.MNIST(
        root=hydra.utils.to_absolute_path("dataset/meta"),
        train=True,
        transform=torchvision.transforms.ToTensor(),
        download=True,
    )
    _ = torchvision.datasets.MNIST(
        root=hydra.utils.to_absolute_path("dataset/meta"),
        train=False,
        transform=torchvision.transforms.ToTensor(),
        download=True,
    )
    # Sync dist processes (because of download MNIST Dataset)
    if cfg.dist.gpus != 0:
        dist.barrier()

    # make dataloader
    if is_logging_process():
        logger.info("Making train dataloader...")
    train_loader = create_dataloader(cfg, DataloaderMode.train, rank)
    if is_logging_process():
        logger.info("Making test dataloader...")
    test_loader = create_dataloader(cfg, DataloaderMode.test, rank)

    # init Model
    net_arch = Net_arch(cfg)
    loss_f = torch.nn.CrossEntropyLoss()
    model = Model(cfg, net_arch, loss_f, rank)

    # load training state / network checkpoint
    if cfg.load.resume_state_path is not None:
        model.load_training_state()
    elif cfg.load.network_chkpt_path is not None:
        model.load_network()
    else:
        if is_logging_process():
            logger.info("Starting new training run.")

    try:
        if cfg.dist.gpus == 0 or cfg.data.divide_dataset_per_gpu:
            epoch_step = 1
        else:
            epoch_step = cfg.dist.gpus
        for model.epoch in itertools.count(model.epoch + 1, epoch_step):
            if model.epoch > cfg.num_epoch:
                break
            train_model(cfg, model, train_loader, writer)
            if model.epoch % cfg.log.chkpt_interval == 0:
                model.save_network()
                model.save_training_state()
            test_model(cfg, model, test_loader, writer)
        if is_logging_process():
            logger.info("End of Train")
    except Exception as e:
        if is_logging_process():
            logger.error(traceback.format_exc())
        else:
            traceback.print_exc()
    finally:
        if cfg.dist.gpus != 0:
            cleanup()
def train_loop(rank, hp, world_size=0):
    if hp.model.device == "cuda" and world_size != 0:
        hp.model.device = rank
        # turn off background generator when distributed run is on
        hp.data.use_background_generator = False
        setup(hp, rank, world_size)
        torch.cuda.set_device(hp.model.device)

    # setup logger / writer
    if rank != 0:
        logger = None
        writer = None
    else:
        # set logger
        logger = make_logger(hp)
        # set writer (tensorboard / wandb)
        writer = Writer(hp, os.path.join(hp.log.log_dir, "tensorboard"))
        hp_str = yaml.dump(hp.to_dict())
        logger.info("Config:")
        logger.info(hp_str)
        if hp.data.train_dir == "" or hp.data.test_dir == "":
            logger.error("train or test data directory cannot be empty.")
            raise Exception("Please specify directories of data")
        logger.info("Set up train process")
        logger.info(
            "BackgroundGenerator is turned off when Distributed running is on")

    # download MNIST dataset before making dataloader
    # TODO: This is example code. You should change this part as you need
    _ = torchvision.datasets.MNIST(
        root="dataset/meta",
        train=True,
        transform=torchvision.transforms.ToTensor(),
        download=True,
    )
    _ = torchvision.datasets.MNIST(
        root="dataset/meta",
        train=False,
        transform=torchvision.transforms.ToTensor(),
        download=True,
    )
    # Sync dist processes (because of download MNIST Dataset)
    if world_size != 0:
        dist.barrier()

    # make dataloader
    if logger is not None:
        logger.info("Making train dataloader...")
    train_loader = create_dataloader(hp, DataloaderMode.train, rank, world_size)
    if logger is not None:
        logger.info("Making test dataloader...")
    test_loader = create_dataloader(hp, DataloaderMode.test, rank, world_size)

    # init Model
    net_arch = Net_arch(hp)
    loss_f = torch.nn.CrossEntropyLoss()
    model = Model(hp, net_arch, loss_f, rank, world_size)

    # load training state / network checkpoint
    if hp.load.resume_state_path is not None:
        model.load_training_state(logger)
    elif hp.load.network_chkpt_path is not None:
        model.load_network(logger=logger)
    else:
        if logger is not None:
            logger.info("Starting new training run.")

    try:
        if world_size == 0 or hp.data.divide_dataset_per_gpu:
            epoch_step = 1
        else:
            epoch_step = world_size
        for model.epoch in itertools.count(model.epoch + 1, epoch_step):
            if model.epoch > hp.train.num_epoch:
                break
            train_model(hp, model, train_loader, writer, logger)
            if model.epoch % hp.log.chkpt_interval == 0:
                model.save_network(logger)
                model.save_training_state(logger)
            test_model(hp, model, test_loader, writer, logger)
        if logger is not None:
            logger.info("End of Train")
    except Exception as e:
        if logger is not None:
            logger.error(traceback.format_exc())
        else:
            traceback.print_exc()
    finally:
        if world_size != 0:
            cleanup()
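# Hypothetical entry point for the hp-based distributed variant above (not part of the
# original function). The config path and the DotDict/yaml loading mirror how `hp` is
# handled elsewhere in these snippets but are assumptions; the spawn pattern itself is
# standard torch.multiprocessing usage (fn is called as fn(rank, *args)).
if __name__ == "__main__":
    import torch.multiprocessing as mp

    with open("config/default.yaml") as f:  # assumed config location
        hp = DotDict(yaml.safe_load(f))

    if hp.model.device == "cuda" and torch.cuda.device_count() > 1:
        world_size = torch.cuda.device_count()
        # one process per GPU; each receives its rank as the first argument
        mp.spawn(train_loop, args=(hp, world_size), nprocs=world_size, join=True)
    else:
        # single-process fallback (world_size=0 disables the distributed setup)
        train_loop(rank=0, hp=hp, world_size=0)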