def main(cfg, _log):
    """Fine-tune only the "pyramid" sub-network of a pre-trained flow model.

    Builds the train/valid data loaders from ``cfg``, freezes every model
    parameter whose name does not contain "pyramid", restores weights from
    a Sintel checkpoint, and launches the trainer.

    Args:
        cfg: project configuration object (seed, train, model, loss,
            trainer, save_root fields are read here).
        _log: logger used for progress messages.
    """
    init_seed(cfg.seed)

    _log.info("=> fetching img pairs.")
    train_set, valid_set = get_dataset(cfg)
    _log.info('{} samples found, {} train samples and {} test samples '.format(
        len(valid_set) + len(train_set), len(train_set), len(valid_set)))

    train_loader = torch.utils.data.DataLoader(
        train_set, batch_size=cfg.train.batch_size,
        num_workers=cfg.train.workers, pin_memory=True, shuffle=True)

    # Validation batches are capped at 4 regardless of the training batch size.
    max_test_batch = 4
    if isinstance(valid_set, torch.utils.data.ConcatDataset):
        # One loader per concatenated dataset so each set is evaluated separately.
        valid_loader = [
            torch.utils.data.DataLoader(
                s, batch_size=min(max_test_batch, cfg.train.batch_size),
                num_workers=min(4, cfg.train.workers),
                pin_memory=True, shuffle=False)
            for s in valid_set.datasets
        ]
        valid_size = sum(len(l) for l in valid_loader)
    else:
        valid_loader = torch.utils.data.DataLoader(
            valid_set, batch_size=min(max_test_batch, cfg.train.batch_size),
            num_workers=min(4, cfg.train.workers),
            pin_memory=True, shuffle=False)
        valid_size = len(valid_loader)

    # epoch_size/valid_size of 0 means "use everything"; otherwise clamp to
    # the actual loader lengths.
    if cfg.train.epoch_size == 0:
        cfg.train.epoch_size = len(train_loader)
    if cfg.train.valid_size == 0:
        cfg.train.valid_size = valid_size
    cfg.train.epoch_size = min(cfg.train.epoch_size, len(train_loader))
    cfg.train.valid_size = min(cfg.train.valid_size, valid_size)

    model = get_model(cfg.model)
    loss = get_loss(cfg.loss)
    trainer = get_trainer(cfg.trainer)(
        train_loader, valid_loader, model, loss, _log, cfg.save_root,
        cfg.train)

    # Freeze everything except the "pyramid" layers so fine-tuning only
    # updates that sub-network.
    for name, param in model.named_parameters():
        if "pyramid" not in name:
            param.requires_grad = False
        else:
            print(name, param.requires_grad)

    # BUG FIX: previously the checkpoint weights were fetched but never
    # applied to the model before training. NOTE(review): the checkpoint
    # path is hard-coded — consider moving it into cfg; also confirm the
    # state-dict keys match (no "module." prefix mismatch).
    epoch, weights = load_checkpoint('checkpoints/Sintel/pwclite_ar.tar')
    model.load_state_dict(weights)

    trainer.model = model
    trainer.train()
def main(cfg, _log):
    """Standard training entry point.

    Builds the train/valid data loaders from ``cfg``, constructs the model,
    loss, and trainer, then runs training.

    Args:
        cfg: project configuration object (seed, train, model, loss,
            trainer, save_root fields are read here).
        _log: logger used for progress messages.
    """
    init_seed(cfg.seed)

    _log.info("=> fetching img pairs.")
    train_set, valid_set = get_dataset(cfg)
    _log.info('{} samples found, {} train samples and {} test samples '.format(
        len(valid_set) + len(train_set), len(train_set), len(valid_set)))

    train_loader = torch.utils.data.DataLoader(
        train_set, batch_size=cfg.train.batch_size,
        num_workers=cfg.train.workers, pin_memory=True, shuffle=True)

    # Validation batches are capped at 4 regardless of the training batch size.
    max_test_batch = 4
    # isinstance instead of `type(...) is` — idiomatic and subclass-safe.
    if isinstance(valid_set, torch.utils.data.ConcatDataset):
        # One loader per concatenated dataset so each set is evaluated separately.
        valid_loader = [
            torch.utils.data.DataLoader(
                s, batch_size=min(max_test_batch, cfg.train.batch_size),
                num_workers=min(4, cfg.train.workers),
                pin_memory=True, shuffle=False)
            for s in valid_set.datasets
        ]
        valid_size = sum(len(l) for l in valid_loader)
    else:
        valid_loader = torch.utils.data.DataLoader(
            valid_set, batch_size=min(max_test_batch, cfg.train.batch_size),
            num_workers=min(4, cfg.train.workers),
            pin_memory=True, shuffle=False)
        valid_size = len(valid_loader)

    # epoch_size/valid_size of 0 means "use everything"; otherwise clamp to
    # the actual loader lengths.
    if cfg.train.epoch_size == 0:
        cfg.train.epoch_size = len(train_loader)
    if cfg.train.valid_size == 0:
        cfg.train.valid_size = valid_size
    cfg.train.epoch_size = min(cfg.train.epoch_size, len(train_loader))
    cfg.train.valid_size = min(cfg.train.valid_size, valid_size)

    model = get_model(cfg.model)
    loss = get_loss(cfg.loss)
    trainer = get_trainer(cfg.trainer)(
        train_loader, valid_loader, model, loss, _log, cfg.save_root,
        cfg.train)

    trainer.train()
def worker(id, cfg, shared):
    """Per-process entry point for (possibly distributed) training.

    Sets up a logger, optionally initializes the distributed backend,
    builds model/loss/trainer, then runs training or evaluation and tears
    the process group back down.
    """
    # Log file is named after the current timestamp (day+time portion).
    curr_time = datetime.datetime.now().strftime("%y%m%d%H%M%S")
    _log = init_logger(log_dir=cfg.save_root, filename=curr_time[6:] + '.log')

    is_master = id == 0
    if is_master:
        # NOTE(review): `id` is passed as the first positional argument;
        # this only works if init_logger returns a custom logger taking
        # (rank, msg) — a stdlib Logger would treat `id` as the format
        # string. Confirm against init_logger.
        _log.info(id, '=> will save everything to {}'.format(cfg.save_root))

    # Dump the full configuration once, from the master process only.
    cfg_str = pprint.pformat(cfg)
    if is_master:
        _log.info(id, '=> configurations \n ' + cfg_str)

    # Distributed setup: NCCL when GPUs are available, Gloo otherwise.
    if cfg.mp.enabled:
        backend = "nccl" if cfg.train.n_gpu > 0 else "gloo"
        dist.init_process_group(backend=backend, init_method="env://",
                                world_size=cfg.mp.workers, rank=id)

    model = get_model(cfg, id)
    loss = get_loss(cfg, id)
    trainer = get_trainer(cfg)(id, model, loss, _log, cfg.save_root, cfg,
                               shared)

    # Top-level boundary: report any failure but still fall through to the
    # process-group teardown below.
    try:
        if cfg.eval:
            trainer.eval()
        else:
            trainer.train()
    except Exception:
        import traceback
        traceback.print_exc()

    if cfg.mp.enabled:
        dist.destroy_process_group()
def __init__(self, n_class, batch_size, instances, embedding_size=128,
             pretrained=True):
    """Build the NDfdml embedding network.

    Args:
        n_class: accepted for interface compatibility but NOT used —
            ``self.n_class`` is recomputed as ``batch_size // instances``
            below. NOTE(review): confirm this shadowing is intentional.
        batch_size: total samples per batch; must be a multiple of
            ``instances``.
        instances: samples per class within a batch.
        embedding_size: dimensionality of the output embedding.
        pretrained: whether the backbone feature extractor is pretrained.

    Raises:
        ValueError: if ``batch_size`` is not divisible by ``instances``.
    """
    super(NDfdml, self).__init__()
    # NOTE(review): device is hard-coded to the first GPU; consider a
    # fallback to CPU when CUDA is unavailable.
    device = torch.device("cuda:0")
    self.batch_size = batch_size
    self.embedding_size = embedding_size
    self.instances = instances
    # Explicit raise instead of `assert`: asserts are stripped under
    # `python -O`, silently skipping this validation.
    if batch_size % instances != 0:
        raise ValueError(
            'batch_size ({}) must be divisible by instances ({})'.format(
                batch_size, instances))
    self.n_class = batch_size // instances
    # Backbone features -> embedding head, both moved to the GPU.
    self.googlelayer = get_feature(pretrained).to(device)
    self.embedding_layer = get_embedding(
        dim=1000, embedding_size=embedding_size).to(device)
    self.dataset_metricloss = get_loss('Triplet')
    self.loss_fn = nd_loss.weight_nd_Loss(self.batch_size, self.instances)
def train(self):
    """Iteration-driven training loop.

    Runs until ``self.iteration`` optimizer steps have been taken. When
    ``self.cm`` is set, class centers are refreshed for the CM sampler
    every ``self.update_epoch`` epochs. Prints running loss / triplet
    statistics every ``self.show_iter`` iterations plus timing breakdowns.

    Returns:
        The trained model (``self.model``).
    """
    since = time.time()
    start = time.time()
    # NOTE(review): the scheduler is stepped once up front and never inside
    # the loop, matching the original code — confirm this is intentional.
    self.scheduler.step()
    self.model.train()

    running_iter = 0
    running_loss = 0.0
    running_count = 0
    running_epoch = 0
    pred_time = 0.0
    opt_time = 0.0
    load_time = 0.0

    # Hoisted out of the batch loop: the loss function lookup is
    # loop-invariant (self.method does not change during training).
    loss_fn = get_loss(self.method)

    print('Start training')
    while running_iter < self.iteration:
        # Periodically recompute class centers for center-margin sampling;
        # the time spent is excluded from the speed report via `start`.
        if self.cm and running_epoch % self.update_epoch == 0:
            embeddings, labels, spend = self.feed_embeddings('mean')
            centers = class_centers(embeddings, labels)
            self.cm_sampler.update_centers(centers, running_epoch)
            start += spend

        # Train one epoch.
        t0 = time.time()
        for sample in self.data_loaders['train']:
            # BUG FIX: was `>`, which ran one extra iteration past the
            # configured limit.
            if running_iter >= self.iteration:
                break
            inputs = sample['image'].to(self.device)
            labels = sample['label'].to(self.device)

            self.optimizer.zero_grad()
            with torch.set_grad_enabled(True):
                t1 = time.time()
                outputs = self.model(inputs)
                t2 = time.time()
                loss, count = loss_fn(outputs, labels)
                loss.backward()
                t3 = time.time()
                self.optimizer.step()

            load_time += t1 - t0
            pred_time += t2 - t1
            opt_time += t3 - t2

            running_loss += loss.item() / count
            running_count += count

            if (running_iter + 1) % self.show_iter == 0:
                print(
                    'Iteration {}/{} Loss {:.4f} Triplets: {:.0f} Spending {:.0f}s'
                    .format(running_iter + 1, self.iteration,
                            running_loss / self.show_iter,
                            running_count / self.show_iter,
                            time.time() - start))
                running_loss = 0.0
                running_count = 0
                start = time.time()
            running_iter += 1
            # BUG FIX: restart the load timer each iteration; previously t0
            # was set once per epoch, so load_time accumulated stale deltas
            # after the first batch.
            t0 = time.time()
        running_epoch += 1

    time_elapsed = time.time() - since
    print('pred:{:.0f} opt:{:.0f} load:{:.0f}'.format(
        pred_time, opt_time, load_time))
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    return self.model