def get_embeddings(self, dataloader, evaluate=True):
    args = self.args
    self.model.eval()

    local_step = 0
    push_to_cpu_steps = 32
    idxs_list = []
    embeds_list = []
    master_idxs_list = []
    master_embeds_list = []

    def _synchronize_lists(_embeds_list, _idxs_list):
        gathered_data = all_gather({
            'embeds_list': _embeds_list,
            'idxs_list': _idxs_list,
        })
        if get_rank() == 0:
            _embeds_list = [d['embeds_list'] for d in gathered_data]
            _embeds_list = flatten(_embeds_list)
            _embeds_list = [x.cpu() for x in _embeds_list]
            _idxs_list = [d['idxs_list'] for d in gathered_data]
            _idxs_list = flatten(_idxs_list)
            _idxs_list = [x.cpu() for x in _idxs_list]
            master_embeds_list.extend(_embeds_list)
            master_idxs_list.extend(_idxs_list)
        synchronize()
        return [], []

    batch_iterator = tqdm(dataloader,
                          desc='Getting embeddings...',
                          disable=(not evaluate
                                   or get_rank() != 0
                                   or args.disable_logging))
    for batch in batch_iterator:
        batch = tuple(t.to(args.device, non_blocking=True) for t in batch)
        with torch.no_grad():
            inputs = {
                'input_ids': batch[1],
                'attention_mask': batch[2],
                'token_type_ids': batch[3],
                'concat_input': False
            }
            embeds_list.append(self.model(**inputs))
            idxs_list.append(batch[0])
            local_step += 1
            if local_step % push_to_cpu_steps == 0:
                embeds_list, idxs_list = _synchronize_lists(
                    embeds_list, idxs_list)

    embeds_list, idxs_list = _synchronize_lists(embeds_list, idxs_list)

    idxs, embeds = None, None
    if get_rank() == 0:
        idxs = torch.cat(master_idxs_list, dim=0).numpy()
        idxs, indices = np.unique(idxs, return_index=True)
        embeds = torch.cat(master_embeds_list, dim=0).numpy()
        embeds = embeds[indices]
    synchronize()
    return idxs, embeds
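# Illustration only (toy data, not part of the trainer): the deduplication step
# above relies on np.unique(..., return_index=True), which returns the sorted
# unique idxs plus the position of each idx's first occurrence, so exactly one
# embedding row is kept per idx even when ranks embedded overlapping examples.
import numpy as np

_toy_idxs = np.array([3, 1, 3, 2, 1])
_toy_embeds = np.arange(10, dtype=np.float32).reshape(5, 2)
_uniq, _first = np.unique(_toy_idxs, return_index=True)
assert (_uniq == np.array([1, 2, 3])).all()
_dedup_embeds = _toy_embeds[_first]  # one embedding row per unique idx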
def metric_3D(self, model, cfg):
    p_json = cfg.DATASET.TEST_PERSON_LIST
    datadir_4D = "/root/ACDC_DataSet/4dData"
    with open(p_json, "r") as f:
        persons = json.load(f)

    total_segMetrics = {"dice": [[], [], []], "hausdorff": [[], [], []]}
    for i, p in enumerate(persons):
        # imgs, gts = personTo4Ddata(p, val_list)
        if p in self.caches_4D.keys():
            imgs, gts = self.caches_4D[p]
        else:
            imgs = np.load(
                os.path.join(datadir_4D, p.split('-')[1], '4d_data.npy'))
            gts = np.load(
                os.path.join(datadir_4D, p.split('-')[1], '4d_gt.npy'))
            self.caches_4D[p] = [imgs, gts]
        imgs, gts = imgs.astype(np.float32)[..., None, :], gts.astype(
            np.float32)[..., None, :]
        imgs, gts = joint_transform(imgs, gts, cfg)
        gts = [gt[:, 0, ...].numpy() for gt in gts]

        preds = test_person(
            model, imgs, multi_batches=True,
            used_df=cfg.DATASET.DF_USED)  # (times, slices, H, W)

        segMetrics = {"dice": [], "hausdorff": []}
        for j in range(len(preds)):
            segMetrics["dice"].append(
                metrics.dice3D(preds[j], gts[j], gts[j].shape))
            segMetrics["hausdorff"].append(metrics.hd_3D(preds[j], gts[j]))
        for k, v in segMetrics.items():
            segMetrics[k] = np.array(v).reshape((-1, 3))
        for k, v in total_segMetrics.items():
            for j in range(3):
                total_segMetrics[k][j] += segMetrics[k][:, j].tolist()

        # person i is done
        if get_rank() == 0:
            print("\r{}/{} {:.0%}\r".format(i, len(persons),
                                            i / len(persons)),
                  end='')
    if get_rank() == 0:
        print()

    mean = {}
    for k, v in total_segMetrics.items():
        mean.update({"LV_" + k: np.mean(v[1])})
        mean.update({"MYO_" + k: np.mean(v[2])})
        mean.update({"RV_" + k: np.mean(v[0])})
    return mean
def __init__(self, filename, benchmark, organization):
    self.mllogger = mllog.get_mllogger()
    self.comm_rank = comm.get_rank()
    self.comm_size = comm.get_size()
    self.constants = constants

    # create logging dir if it does not exist
    logdir = os.path.dirname(filename)
    if self.comm_rank == 0:
        if not os.path.isdir(logdir):
            os.makedirs(logdir)
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        torch.distributed.barrier()

    # create config
    mllog.config(filename=filename)
    self.mllogger.logger.propagate = False

    self.log_event(key=constants.SUBMISSION_BENCHMARK, value=benchmark)
    self.log_event(key=constants.SUBMISSION_ORG, value=organization)
    self.log_event(key=constants.SUBMISSION_DIVISION, value='closed')
    self.log_event(key=constants.SUBMISSION_STATUS, value='onprem')
    self.log_event(
        key=constants.SUBMISSION_PLATFORM,
        value=f'{self.comm_size}xSUBMISSION_PLATFORM_PLACEHOLDER')
def main():
    args = parse_args()

    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(
            backend="nccl",
            init_method="env://",
        )
        comm.synchronize()

    cfg = get_default_cfg()
    if args.config_file:
        cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.output_dir
    if output_dir:
        misc.mkdir(output_dir)

    logger = setup_logger("EfficientDet", output_dir, comm.get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)
    logger.info("Loaded configuration file {}".format(args.config_file))
    logger.info("Running with config:\n{}".format(cfg))

    output_config_path = os.path.join(output_dir, 'config.yml')
    logger.info("Saving config into: {}".format(output_config_path))
    misc.save_config(cfg, output_config_path)

    model = train(cfg, args.local_rank, args.distributed)
def compute_scores_for_inference(self, clusters_mx, per_example_negs):
    # TODO: add description here
    args = self.args

    # get all of the unique examples
    examples = clusters_mx.data.tolist()
    examples.extend(flatten(per_example_negs.tolist()))
    examples = unique(examples)
    examples = list(filter(lambda x: x >= 0, examples))

    # create dataset and dataloader
    dataset = InferenceEmbeddingDataset(args, examples, args.train_cache_dir)
    dataloader = InferenceEmbeddingDataLoader(args, dataset)

    # get the unique idxs and embeds for each idx
    idxs, embeds = self.get_embeddings(dataloader, evaluate=False)

    sparse_graph = None
    if get_rank() == 0:
        # create inverse index for mapping
        inverse_idxs = {v: k for k, v in enumerate(idxs)}

        ## make the list of pairs of dot products we need
        _row = clusters_mx.row
        # positives:
        local_pos_a, local_pos_b = np.where(
            np.triu(_row[np.newaxis, :] == _row[:, np.newaxis], k=1))
        pos_a = clusters_mx.data[local_pos_a]
        pos_b = clusters_mx.data[local_pos_b]
        # negatives:
        local_neg_a = np.tile(
            np.arange(per_example_negs.shape[0])[:, np.newaxis],
            (1, per_example_negs.shape[1])).flatten()
        neg_a = clusters_mx.data[local_neg_a]
        neg_b = per_example_negs.flatten()
        neg_mask = (neg_b != -1)
        neg_a = neg_a[neg_mask]
        neg_b = neg_b[neg_mask]

        # create subset of the sparse graph we care about
        a = np.concatenate((pos_a, neg_a), axis=0)
        b = np.concatenate((pos_b, neg_b), axis=0)
        edges = list(zip(a, b))
        affinities = [
            np.dot(embeds[inverse_idxs[i]], embeds[inverse_idxs[j]])
            for i, j in edges
        ]

        # convert to coo_matrix
        edges = np.asarray(edges).T
        affinities = np.asarray(affinities)
        _sparse_num = np.max(edges) + 1
        sparse_graph = coo_matrix((affinities, edges),
                                  shape=(_sparse_num, _sparse_num))

    synchronize()
    return sparse_graph
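# Illustration only (synthetic edges/affinities): the scored pairs above are
# packed into a scipy coo_matrix whose row/column indices are example ids and
# whose data entries are the dot-product affinities, exactly as in the
# coo_matrix((affinities, edges), ...) call.
import numpy as np
from scipy.sparse import coo_matrix

_rows_cols = np.array([[0, 0, 1],
                       [1, 2, 2]])           # (row, col) pairs, transposed
_affs = np.array([0.9, 0.1, 0.4])
_n = _rows_cols.max() + 1
_toy_graph = coo_matrix((_affs, _rows_cols), shape=(_n, _n))
# _toy_graph.toarray() places 0.9 at (0, 1), 0.1 at (0, 2) and 0.4 at (1, 2)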
def train(cfg, local_rank, distributed):
    num_classes = COCODataset(cfg.data.train[0],
                              cfg.data.train[1]).num_classes
    model = EfficientDet(num_classes=num_classes, model_name=cfg.model.name)
    inp_size = model.config['inp_size']
    device = torch.device(cfg.device)
    model.to(device)

    optimizer = build_optimizer(model, **optimizer_kwargs(cfg))
    lr_scheduler = build_lr_scheduler(optimizer, **lr_scheduler_kwargs(cfg))

    use_mixed_precision = cfg.dtype == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
            find_unused_parameters=True)

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.output_dir
    save_to_disk = comm.get_rank() == 0
    checkpointer = Checkpointer(model, optimizer, lr_scheduler, output_dir,
                                save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.model.resume)
    arguments.update(extra_checkpoint_data)

    train_dataloader = build_dataloader(cfg,
                                        inp_size,
                                        is_train=True,
                                        distributed=distributed,
                                        start_iter=arguments["iteration"])

    test_period = cfg.test.test_period
    if test_period > 0:
        val_dataloader = build_dataloader(cfg,
                                          inp_size,
                                          is_train=False,
                                          distributed=distributed)
    else:
        val_dataloader = None

    checkpoint_period = cfg.solver.checkpoint_period
    log_period = cfg.solver.log_period

    do_train(cfg, model, train_dataloader, val_dataloader, optimizer,
             lr_scheduler, checkpointer, device, checkpoint_period,
             test_period, log_period, arguments)

    return model
def get_edge_affinities(self, edges, example_dir, knn_index):
    if get_rank() == 0:
        idxs, embeds = knn_index.idxs, knn_index.X
        inverse_idxs = {v: k for k, v in enumerate(idxs)}
        affinities = [
            np.dot(embeds[inverse_idxs[i]], embeds[inverse_idxs[j]])
            for i, j in edges
        ]
        return affinities
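# Illustration only (synthetic data): the inverse-index pattern above maps
# global example ids to row positions in the embedding matrix before taking a
# dot product for each requested edge. Note that only rank 0 returns a value;
# other ranks implicitly return None.
import numpy as np

_ids = np.array([10, 42, 7])
_X = np.eye(3, dtype=np.float32)
_inverse = {v: k for k, v in enumerate(_ids)}
_edges = [(10, 42), (7, 10)]
_affs = [np.dot(_X[_inverse[i]], _X[_inverse[j]]) for i, j in _edges]
# with orthonormal rows both affinities are 0.0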
def train(cfg, local_rank, distributed, logger=None, tblogger=None, transfer_weight=False, change_lr=False): device = torch.device('cuda') # create model logger.info('Creating model "{}"'.format(cfg.MODEL.ARCHITECTURE)) model = build_model(cfg).to(device) criterion = torch.nn.CrossEntropyLoss(ignore_index=255).to(device) optimizer = make_optimizer(cfg, model) # model, optimizer = apex.amp.initialize(model, optimizer, opt_level='O2') scheduler = make_lr_scheduler(cfg, optimizer) if distributed: # model = apex.parallel.DistributedDataParallel(model) model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[local_rank], output_device=local_rank, broadcast_buffers=True, ) save_to_disk = get_rank() == 0 # checkpoint arguments = {} arguments['iteration'] = 0 arguments['best_iou'] = 0 checkpointer = Checkpointer(model, optimizer, scheduler, cfg.LOGS.DIR, save_to_disk, logger) extra_checkpoint_data = checkpointer.load( f=cfg.MODEL.WEIGHT, model_weight_only=transfer_weight, change_scheduler=change_lr) arguments.update(extra_checkpoint_data) # data_loader logger.info('Loading dataset "{}"'.format(cfg.DATASETS.TRAIN)) data_loader = make_data_loader(cfg, 'train', distributed) data_loader_val = make_data_loader(cfg, 'val', distributed) do_train(cfg, model=model, data_loader=data_loader, optimizer=optimizer, scheduler=scheduler, criterion=criterion, checkpointer=checkpointer, device=device, arguments=arguments, tblogger=tblogger, data_loader_val=data_loader_val, distributed=distributed)
def train_step(self, batch):
    args = self.args

    # get the batch of clusters and approx negs for each individual example
    clusters_mx, per_example_negs = batch

    # compute scores using up-to-date model
    #sparse_graph = self.embed_sub_trainer.compute_scores_for_inference(
    #        clusters_mx, per_example_negs)
    #sparse_graph = self._build_temp_sparse_graph(
    #        clusters_mx, per_example_negs)
    # TODO: produce sparse graph w/ concat model in inference mode
    sparse_graph = self.concat_sub_trainer.compute_scores_for_inference(
        clusters_mx, per_example_negs)

    # create custom datasets for training
    embed_dataset_list = None
    concat_dataset_list = None
    dataset_metrics = None
    if get_rank() == 0:
        dataset_lists, dataset_metrics = self.dataset_builder(
            clusters_mx, sparse_graph, self.train_metadata)
        embed_dataset_list, concat_dataset_list = dataset_lists
    dataset_metrics = broadcast(dataset_metrics, src=0)
    embed_dataset_list = broadcast(embed_dataset_list, src=0)
    concat_dataset_list = broadcast(concat_dataset_list, src=0)

    # take care of empty dataset list (should only happen when only
    # considering m-m edges)
    if embed_dataset_list is None or concat_dataset_list is None:
        return {}

    ## train on datasets
    #embed_return_dict = self.embed_sub_trainer.train_on_subset(
    #        embed_dataset_list, self.train_metadata)
    concat_return_dict = self.concat_sub_trainer.train_on_subset(
        concat_dataset_list, self.train_metadata)

    #embed_return_dict = broadcast(embed_return_dict, src=0)
    concat_return_dict = broadcast(concat_return_dict, src=0)

    return_dict = {}
    return_dict.update(dataset_metrics)
    #return_dict.update(embed_return_dict)
    return_dict.update(concat_return_dict)

    #if get_rank() == 0:
    #    embed()
    #synchronize()
    #exit()

    return return_dict
def _train_softmax(self, dataset_list, metadata):
    args = self.args

    losses = []
    time_per_dataset = []
    dataset_sizes = []

    self.model.train()
    self.model.zero_grad()

    criterion = nn.CrossEntropyLoss()

    for dataset in dataset_list:
        _dataset_start_time = time.time()
        dataset_sizes.append(len(dataset))
        dataloader = SoftmaxEmbeddingDataLoader(args, dataset)
        for batch in dataloader:
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[1],
                'attention_mask': batch[2],
                'token_type_ids': batch[3],
                'concat_input': False
            }
            outputs = self.model(**inputs)
            pos_neg_dot_prods = torch.sum(
                outputs[:, 0:1, :] * outputs[:, 1:, :], dim=-1)
            target = torch.zeros(pos_neg_dot_prods.shape[0],
                                 dtype=torch.long).cuda()
            loss = criterion(pos_neg_dot_prods, target)
            losses.append(loss.item())
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                           args.max_grad_norm)
            self.optimizer.step()
            self.scheduler.step()
            self.model.zero_grad()
        time_per_dataset.append(time.time() - _dataset_start_time)

    gathered_data = all_gather({
        'losses': losses,
    })

    if get_rank() == 0:
        losses = flatten([d['losses'] for d in gathered_data])
        loss = np.mean(losses)
        synchronize()
        return {'embed_loss': loss}
    else:
        synchronize()
        return None
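# Illustration only (toy scores, assuming the positive candidate is listed
# first among the candidates): in the softmax objective above, column 0 of
# pos_neg_dot_prods is the anchor-positive score and the remaining columns are
# anchor-negative scores, so every row's target class index is 0.
import torch
import torch.nn as nn

_scores = torch.tensor([[2.0, 0.1, -0.3],
                        [1.5, 0.7, 0.2]])
_target = torch.zeros(_scores.shape[0], dtype=torch.long)
_loss = nn.CrossEntropyLoss()(_scores, _target)  # small when column 0 dominates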
def eval_epoch(self, d_loader):
    self.model.eval()
    eval_dict = {}
    total_loss = 0

    # eval one epoch
    if get_rank() == 0:
        print("evaluating...")
    sel_num = np.random.choice(len(d_loader), size=1)
    for i, data in enumerate(d_loader, 0):
        self.optimizer.zero_grad()
        vis = True if i == sel_num else False
        loss, tb_dict, disp_dict = self.model_fn_eval(self.model,
                                                      data,
                                                      self.criterion,
                                                      perfermance=True,
                                                      vis=vis)
        total_loss += loss.item()
        for k, v in tb_dict.items():
            if "vis" not in k:
                eval_dict[k] = eval_dict.get(k, 0) + v
            else:
                eval_dict[k] = v
        if get_rank() == 0:
            print("\r{}/{} {:.0%}\r".format(i, len(d_loader),
                                            i / len(d_loader)),
                  end='')
    if get_rank() == 0:
        print()
    for k, v in tb_dict.items():
        if "vis" not in k:
            eval_dict[k] = eval_dict.get(k, 0) / (i + 1)
    return total_loss / (i + 1), eval_dict, disp_dict
def _train_threshold(self, dataset_list, metadata):
    args = self.args

    losses = []
    time_per_dataset = []
    dataset_sizes = []

    self.model.train()
    self.model.zero_grad()

    random.shuffle(dataset_list)
    for dataset in dataset_list:
        _dataset_start_time = time.time()
        dataset_sizes.append(len(dataset))
        dataloader = ScaledPairsEmbeddingDataLoader(args, dataset)
        for batch in dataloader:
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[2],
                'attention_mask': batch[3],
                'token_type_ids': batch[4],
                'concat_input': False
            }
            outputs = self.model(**inputs)
            dot_prods = torch.sum(outputs[:, 0, :] * outputs[:, 1, :], dim=-1)
            loss = torch.mean(F.relu(args.margin - (batch[0] * dot_prods)))
            losses.append(loss.item())
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                           args.max_grad_norm)
            self.optimizer.step()
            self.scheduler.step()
            self.model.zero_grad()
        time_per_dataset.append(time.time() - _dataset_start_time)

    gathered_data = all_gather({
        'losses': losses,
    })

    if get_rank() == 0:
        losses = flatten([d['losses'] for d in gathered_data])
        loss = np.mean(losses)
        synchronize()
        return {'embed_loss': loss}
    else:
        synchronize()
        return None
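# Illustration only (toy pairs, assuming batch[0] carries pair labels of +/-1):
# the threshold objective above is a pairwise hinge, mean(relu(margin - label *
# dot)), which pushes positive-pair dot products above the margin and
# negative-pair dot products below it.
import torch
import torch.nn.functional as F

_dots = torch.tensor([0.8, -0.2, 0.5])
_labels = torch.tensor([1.0, -1.0, -1.0])
_margin = 0.5
_hinge = torch.mean(F.relu(_margin - _labels * _dots))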
def eval_epoch(self, d_loader):
    self.model.eval()
    eval_dict = {}
    # total_loss = 0

    # eval one epoch
    if get_rank() == 0:
        print("evaluating...")
    for i, data in enumerate(d_loader, 0):
        self.optimizer.zero_grad()
        _, tb_dict, disp_dict = self.model_fn_eval(self.model,
                                                   data,
                                                   self.criterion,
                                                   perfermance=True)
        # total_loss += loss.item()  # removed total loss
        for k, v in tb_dict.items():
            # dict.get(key, default): returns `default` when `key` is missing
            eval_dict[k] = eval_dict.get(k, 0) + v
        if get_rank() == 0:
            print("\r{}/{} {:.0%}\r".format((i + 1), len(d_loader),
                                            (i + 1) / len(d_loader)),
                  end='')
    if get_rank() == 0:
        print()
    for k, v in tb_dict.items():
        eval_dict[k] = eval_dict.get(k, 0) / (i + 1)
    return _, eval_dict, disp_dict  # total_loss / (i + 1) removed
def _synchronize_lists(_embeds_list, _idxs_list):
    gathered_data = all_gather({
        'embeds_list': _embeds_list,
        'idxs_list': _idxs_list,
    })
    if get_rank() == 0:
        _embeds_list = [d['embeds_list'] for d in gathered_data]
        _embeds_list = flatten(_embeds_list)
        _embeds_list = [x.cpu() for x in _embeds_list]
        _idxs_list = [d['idxs_list'] for d in gathered_data]
        _idxs_list = flatten(_idxs_list)
        _idxs_list = [x.cpu() for x in _idxs_list]
        master_embeds_list.extend(_embeds_list)
        master_idxs_list.extend(_idxs_list)
    synchronize()
    return [], []
def _build_temp_sparse_graph(self, clusters_mx, per_example_negs):
    args = self.args

    # get all of the unique examples
    examples = clusters_mx.data.tolist()
    examples.extend(flatten(per_example_negs.tolist()))
    examples = unique(examples)
    examples = list(filter(lambda x: x >= 0, examples))

    sparse_graph = None
    if get_rank() == 0:
        ## make the list of pairs of dot products we need
        _row = clusters_mx.row
        # positives:
        local_pos_a, local_pos_b = np.where(
            np.triu(_row[np.newaxis, :] == _row[:, np.newaxis], k=1))
        pos_a = clusters_mx.data[local_pos_a]
        pos_b = clusters_mx.data[local_pos_b]
        # negatives:
        local_neg_a = np.tile(
            np.arange(per_example_negs.shape[0])[:, np.newaxis],
            (1, per_example_negs.shape[1])).flatten()
        neg_a = clusters_mx.data[local_neg_a]
        neg_b = per_example_negs.flatten()
        neg_mask = (neg_b != -1)
        neg_a = neg_a[neg_mask]
        neg_b = neg_b[neg_mask]

        # create subset of the sparse graph we care about
        a = np.concatenate((pos_a, neg_a), axis=0)
        b = np.concatenate((pos_b, neg_b), axis=0)
        edges = list(zip(a, b))
        affinities = [0.0 for i, j in edges]

        # convert to coo_matrix
        edges = np.asarray(edges).T
        affinities = np.asarray(affinities)
        _sparse_num = np.max(edges) + 1
        sparse_graph = coo_matrix((affinities, edges),
                                  shape=(_sparse_num, _sparse_num))

    synchronize()
    return sparse_graph
def detection_inference(cfg,
                        model,
                        data_loader_val,
                        device,
                        iteration,
                        summary_writer=None,
                        logger=None,
                        visualize=False,
                        fppi=(0.1, 0.01)):
    mAP = MeanOfAveragePrecision(cfg.DATASET.NUM_CLASS,
                                 cfg.DATASET.MAX_OBJECTS,
                                 fppi=fppi)
    bar = TqdmBar(data_loader_val,
                  0,
                  get_rank(),
                  data_loader_val.__len__(),
                  description='Inference',
                  use_bar=cfg.USE_BAR)
    for iteration, record in bar.bar:
        record = move_to_device(record, device)
        prediction = model(record)
        prediction = prediction.cpu().detach()
        record = move_to_device(record, torch.device('cpu'))
        mAP.calculate_overlaps(record, prediction)
        if visualize:
            # TODO vis mod
            pass
    bar.close()

    mAP_5095, mAP_50, m_recall = mAP.calculate_map()

    if logger is not None:
        logger.info('====================================================================================')
        # logger.info('Average inference time per image without post process is: %s' % (
        #     sum(model.inference_time_without_postprocess) /
        #     max(len(model.inference_time_without_postprocess), np.finfo(np.float64).eps)))
        # logger.info('Average inference time per image with post process is: %s' % (
        #     sum(model.inference_time_with_postprocess) /
        #     max(len(model.inference_time_with_postprocess), np.finfo(np.float64).eps)))
        logger.info('mAP(@iou=0.5:0.95): %s' % mAP_5095)
        logger.info('mAP(@iou=0.5): %s' % mAP_50)
        logger.info('Recall(@iou=0.5, @fppi=%s): %s' % (fppi[0], m_recall[0]))
        logger.info('Recall(@iou=0.5, @fppi=%s): %s' % (fppi[1], m_recall[1]))
        logger.info('====================================================================================')

    if summary_writer is not None:
        record = {'mAP_iou_0.5_0.95': mAP_5095,
                  'mAP_iou_0.5': mAP_50,
                  'Recall_iou_0.5_fppi_{}'.format(fppi[0]): m_recall[0],
                  'Recall_iou_0.5_fppi_{}'.format(fppi[1]): m_recall[1]}
        write_summary(summary_writer, iteration, record=record,
                      group='Evaluations')
def compute_on_dataset_1stage(model, data_loader, device):
    # single stage inference, for model without memory features
    cpu_device = torch.device("cpu")
    results_dict = {}
    if get_world_size() == 1:
        extra_args = {}
    else:
        rank = get_rank()
        extra_args = dict(desc="rank {}".format(rank))

    for batch in tqdm(data_loader, **extra_args):
        slow_clips, fast_clips, boxes, objects, extras, video_ids = batch
        slow_clips = slow_clips.to(device)
        fast_clips = fast_clips.to(device)
        boxes = [box.to(device) for box in boxes]
        objects = [None if (box is None) else box.to(device)
                   for box in objects]

        with torch.no_grad():
            output = model(slow_clips, fast_clips, boxes, objects, extras)
            output = [o.to(cpu_device) for o in output]
        results_dict.update(
            {video_id: result
             for video_id, result in zip(video_ids, output)}
        )

    return results_dict
def train(cfg, local_rank, distributed): logger = logging.getLogger(cfg.NAME) # build model model = build_model(cfg) device = torch.device(cfg.MODEL.DEVICE) model.to(device) # build solver optimizer = make_optimizer(cfg, model) scheduler = make_lr_scheduler(cfg, optimizer) if distributed: model = DistributedDataParallel( model, device_ids=[local_rank], output_device=local_rank, # this should be removed if we update BatchNorm stats broadcast_buffers=False, ) arguments = {"iteration": 0} save_dir = os.path.join(cfg.CHECKPOINTER.DIR, cfg.CHECKPOINTER.NAME) save_to_disk = get_rank() == 0 checkpointer = Checkpointer( model=model, optimizer=optimizer, scheduler=scheduler, save_dir=save_dir, save_to_disk=save_to_disk, logger=logger ) extra_checkpoint_data = checkpointer.load(cfg.CHECKPOINTER.LOAD_NAME) arguments.update(extra_checkpoint_data) data_loader = make_data_loader( cfg, is_train=True, is_distributed=distributed, start_iter=arguments["iteration"], ) evaluate = cfg.SOLVER.EVALUATE if evaluate: synchronize() data_loader_val = make_data_loader(cfg, is_train=False, is_distributed=distributed, is_for_period=True) synchronize() else: data_loader_val = None save_to_disk = get_rank() == 0 if cfg.SUMMARY_WRITER and save_to_disk: save_dir = os.path.join(cfg.CHECKPOINTER.DIR, cfg.CHECKPOINTER.NAME) summary_writer = make_summary_writer(cfg.SUMMARY_WRITER, save_dir, model_name=cfg.MODEL.NAME) else: summary_writer = None do_train( cfg, model, data_loader, data_loader_val, optimizer, scheduler, checkpointer, device, arguments, summary_writer ) return model
def train(): print(args.local_rank) torch.cuda.set_device(args.local_rank) # create dataloader & network & optimizer model, model_fn_decorator, net_func = create_model(cfg) init_weights(model, init_type='kaiming') model.cuda() root_result_dir = args.output_dir os.makedirs(root_result_dir, exist_ok=True) log_file = os.path.join(root_result_dir, "log_train.txt") logger = create_logger(log_file, get_rank()) logger.info("**********************Start logging**********************") logger.info('TRAINED MODEL:{}'.format(net_func)) # log to file gpu_list = os.environ[ 'CUDA_VISIBLE_DEVICES'] if 'CUDA_VISIBLE_DEVICES' in os.environ.keys( ) else 'ALL' logger.info("CUDA_VISIBLE_DEVICES=%s" % gpu_list) for key, val in vars(args).items(): logger.info("{:16} {}".format(key, val)) logger.info("***********************config infos**********************") for key, val in vars(cfg).items(): logger.info("{:16} {}".format(key, val)) # log tensorboard if get_rank() == 0: tb_log = SummaryWriter( log_dir=os.path.join(root_result_dir, "tensorboard")) else: tb_log = None train_loader, test_loader = create_dataloader() # train_loader, test_loader = create_dataloader_Insensee() optimizer = create_optimizer(model) # load checkpoint if it is possible start_epoch = it = best_res = 0 last_epoch = -1 if args.ckpt is not None: pure_model = model it, start_epoch, best_res = load_checkpoint(pure_model, optimizer, args.ckpt, logger) last_epoch = start_epoch + 1 lr_scheduler = create_scheduler(optimizer, last_epoch=last_epoch) # lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.98, last_epoch=-1) criterion = None # start training logger.info('**********************Start training**********************') ckpt_dir = os.path.join(root_result_dir, "ckpt") os.makedirs(ckpt_dir, exist_ok=True) trainer = train_utils.Trainer(model, model_fn=model_fn_decorator(), criterion=criterion, optimizer=optimizer, ckpt_dir=ckpt_dir, lr_scheduler=lr_scheduler, model_fn_eval=model_fn_decorator(), tb_log=tb_log, logger=logger, eval_frequency=1, cfg=cfg) trainer.train(start_it=it, start_epoch=start_epoch, n_epochs=args.epochs, train_loader=train_loader, test_loader=test_loader, ckpt_save_interval=args.ckpt_save_interval, best_res=best_res) logger.info('**********************End training**********************')
def main_worker(gpu, ngpus_per_node, args): args.gpu = gpu # local rank, local machine cuda id args.local_rank = args.gpu args.batch_size = args.batch_size_per_gpu args.batch_size_total = args.batch_size * args.world_size #rescale base lr args.lr_scheduler.base_lr = args.lr_scheduler.base_lr * (max( 1, args.batch_size_total // 256)) # set random seed, make sure all random subgraph generated would be the same random.seed(args.seed) torch.manual_seed(args.seed) if args.gpu: torch.cuda.manual_seed(args.seed) global_rank = args.gpu + args.machine_rank * ngpus_per_node dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=global_rank) # Setup logging format. logging.setup_logging(args.logging_save_path, 'w') logger.info( f"Use GPU: {args.gpu}, machine rank {args.machine_rank}, num_nodes {args.num_nodes}, \ gpu per node {ngpus_per_node}, world size {args.world_size}" ) # synchronize is needed here to prevent a possible timeout after calling # init_process_group # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172 comm.synchronize() args.rank = comm.get_rank() # global rank args.local_rank = args.gpu torch.cuda.set_device(args.gpu) # build model logger.info("=> creating model '{}'".format(args.arch)) model = models.model_factory.create_model(args) model.cuda(args.gpu) # use sync batchnorm if getattr(args, 'sync_bn', False): model.apply(lambda m: setattr(m, 'need_sync', True)) model = comm.get_parallel_model(model, args.gpu) #local rank logger.info(model) criterion = loss_ops.CrossEntropyLossSmooth(args.label_smoothing).cuda( args.gpu) soft_criterion = loss_ops.AdaptiveLossSoft(args.alpha_min, args.alpha_max, args.iw_clip).cuda(args.gpu) if not getattr(args, 'inplace_distill', True): soft_criterion = None ## load dataset, train_sampler: distributed train_loader, val_loader, train_sampler = build_data_loader(args) args.n_iters_per_epoch = len(train_loader) logger.info(f'building optimizer and lr scheduler, \ local rank {args.gpu}, global rank {args.rank}, world_size {args.world_size}' ) optimizer = build_optimizer(args, model) lr_scheduler = build_lr_scheduler(args, optimizer) # optionally resume from a checkpoint if args.resume: saver.load_checkpoints(args, model, optimizer, lr_scheduler, logger) logger.info(args) for epoch in range(args.start_epoch, args.epochs): if args.distributed: train_sampler.set_epoch(epoch) args.curr_epoch = epoch logger.info('Training lr {}'.format(lr_scheduler.get_lr()[0])) # train for one epoch acc1, acc5 = train_epoch(epoch, model, train_loader, optimizer, criterion, args, \ soft_criterion=soft_criterion, lr_scheduler=lr_scheduler) if comm.is_master_process() or args.distributed: # validate supernet model validate(train_loader, val_loader, model, criterion, args) if comm.is_master_process(): # save checkpoints saver.save_checkpoint( args.checkpoint_save_path, model, optimizer, lr_scheduler, args, epoch, )
def main(pargs): # this should be global global have_wandb #init distributed training comm.init(pargs.wireup_method) comm_rank = comm.get_rank() comm_local_rank = comm.get_local_rank() comm_size = comm.get_size() # set up logging pargs.logging_frequency = max([pargs.logging_frequency, 1]) log_file = os.path.normpath( os.path.join(pargs.output_dir, "logs", pargs.run_tag + ".log")) logger = mll.mlperf_logger(log_file, "deepcam", "Umbrella Corp.") logger.log_start(key="init_start", sync=True) logger.log_event(key="cache_clear") #set seed seed = 333 logger.log_event(key="seed", value=seed) # Some setup torch.manual_seed(seed) if torch.cuda.is_available(): device = torch.device("cuda", comm_local_rank) torch.cuda.manual_seed(seed) #necessary for AMP to work torch.cuda.set_device(device) # TEST: allowed? Valuable? #torch.backends.cudnn.benchark = True else: device = torch.device("cpu") #visualize? visualize = (pargs.training_visualization_frequency > 0) or (pargs.validation_visualization_frequency > 0) #set up directories root_dir = os.path.join(pargs.data_dir_prefix) output_dir = pargs.output_dir plot_dir = os.path.join(output_dir, "plots") if comm_rank == 0: if not os.path.isdir(output_dir): os.makedirs(output_dir) if visualize and not os.path.isdir(plot_dir): os.makedirs(plot_dir) # Setup WandB if not pargs.enable_wandb: have_wandb = False if have_wandb and (comm_rank == 0): # get wandb api token certfile = os.path.join(pargs.wandb_certdir, ".wandbirc") try: with open(certfile) as f: token = f.readlines()[0].replace("\n", "").split() wblogin = token[0] wbtoken = token[1] except IOError: print("Error, cannot open WandB certificate {}.".format(certfile)) have_wandb = False if have_wandb: # log in: that call can be blocking, it should be quick sp.call(["wandb", "login", wbtoken]) #init db and get config resume_flag = pargs.run_tag if pargs.resume_logging else False wandb.init(entity=wblogin, project='deepcam', name=pargs.run_tag, id=pargs.run_tag, resume=resume_flag) config = wandb.config #set general parameters config.root_dir = root_dir config.output_dir = pargs.output_dir config.max_epochs = pargs.max_epochs config.local_batch_size = pargs.local_batch_size config.num_workers = comm_size config.channels = pargs.channels config.optimizer = pargs.optimizer config.start_lr = pargs.start_lr config.adam_eps = pargs.adam_eps config.weight_decay = pargs.weight_decay config.model_prefix = pargs.model_prefix config.amp_opt_level = pargs.amp_opt_level config.loss_weight_pow = pargs.loss_weight_pow config.lr_warmup_steps = pargs.lr_warmup_steps config.lr_warmup_factor = pargs.lr_warmup_factor # lr schedule if applicable if pargs.lr_schedule: for key in pargs.lr_schedule: config.update( {"lr_schedule_" + key: pargs.lr_schedule[key]}, allow_val_change=True) # Logging hyperparameters logger.log_event(key="global_batch_size", value=(pargs.local_batch_size * comm_size)) logger.log_event(key="opt_name", value=pargs.optimizer) logger.log_event(key="opt_base_learning_rate", value=pargs.start_lr * pargs.lr_warmup_factor) logger.log_event(key="opt_learning_rate_warmup_steps", value=pargs.lr_warmup_steps) logger.log_event(key="opt_learning_rate_warmup_factor", value=pargs.lr_warmup_factor) logger.log_event(key="opt_epsilon", value=pargs.adam_eps) # Define architecture n_input_channels = len(pargs.channels) n_output_channels = 3 net = deeplab_xception.DeepLabv3_plus(n_input=n_input_channels, n_classes=n_output_channels, os=16, pretrained=False, rank=comm_rank) net.to(device) #select loss loss_pow = 
pargs.loss_weight_pow #some magic numbers class_weights = [ 0.986267818390377**loss_pow, 0.0004578708870701058**loss_pow, 0.01327431072255291**loss_pow ] fpw_1 = 2.61461122397522257612 fpw_2 = 1.71641974795896018744 criterion = losses.fp_loss #select optimizer optimizer = None if pargs.optimizer == "Adam": optimizer = optim.Adam(net.parameters(), lr=pargs.start_lr, eps=pargs.adam_eps, weight_decay=pargs.weight_decay) elif pargs.optimizer == "AdamW": optimizer = optim.AdamW(net.parameters(), lr=pargs.start_lr, eps=pargs.adam_eps, weight_decay=pargs.weight_decay) elif have_apex and (pargs.optimizer == "LAMB"): optimizer = aoptim.FusedLAMB(net.parameters(), lr=pargs.start_lr, eps=pargs.adam_eps, weight_decay=pargs.weight_decay) else: raise NotImplementedError("Error, optimizer {} not supported".format( pargs.optimizer)) if have_apex: #wrap model and opt into amp net, optimizer = amp.initialize(net, optimizer, opt_level=pargs.amp_opt_level) #make model distributed net = DDP(net) #restart from checkpoint if desired #if (comm_rank == 0) and (pargs.checkpoint): #load it on all ranks for now if pargs.checkpoint: checkpoint = torch.load(pargs.checkpoint, map_location=device) start_step = checkpoint['step'] start_epoch = checkpoint['epoch'] optimizer.load_state_dict(checkpoint['optimizer']) net.load_state_dict(checkpoint['model']) if have_apex: amp.load_state_dict(checkpoint['amp']) else: start_step = 0 start_epoch = 0 #select scheduler if pargs.lr_schedule: scheduler_after = ph.get_lr_schedule(pargs.start_lr, pargs.lr_schedule, optimizer, last_step=start_step) # LR warmup if pargs.lr_warmup_steps > 0: if have_warmup_scheduler: scheduler = GradualWarmupScheduler( optimizer, multiplier=pargs.lr_warmup_factor, total_epoch=pargs.lr_warmup_steps, after_scheduler=scheduler_after) # Throw an error if the package is not found else: raise Exception( f'Requested {pargs.lr_warmup_steps} LR warmup steps ' 'but warmup scheduler not found. 
Install it from ' 'https://github.com/ildoonet/pytorch-gradual-warmup-lr') else: scheduler = scheduler_after #broadcast model and optimizer state steptens = torch.tensor(np.array([start_step, start_epoch]), requires_grad=False).to(device) dist.broadcast(steptens, src=0) ##broadcast model and optimizer state #hvd.broadcast_parameters(net.state_dict(), root_rank = 0) #hvd.broadcast_optimizer_state(optimizer, root_rank = 0) #unpack the bcasted tensor start_step = steptens.cpu().numpy()[0] start_epoch = steptens.cpu().numpy()[1] # Set up the data feeder # train train_dir = os.path.join(root_dir, "train") train_set = cam.CamDataset(train_dir, statsfile=os.path.join(root_dir, 'stats.h5'), channels=pargs.channels, allow_uneven_distribution=False, shuffle=True, preprocess=True, comm_size=comm_size, comm_rank=comm_rank) train_loader = DataLoader( train_set, pargs.local_batch_size, num_workers=min([pargs.max_inter_threads, pargs.local_batch_size]), pin_memory=True, drop_last=True) # validation: we only want to shuffle the set if we are cutting off validation after a certain number of steps validation_dir = os.path.join(root_dir, "validation") validation_set = cam.CamDataset(validation_dir, statsfile=os.path.join( root_dir, 'stats.h5'), channels=pargs.channels, allow_uneven_distribution=True, shuffle=(pargs.max_validation_steps is not None), preprocess=True, comm_size=comm_size, comm_rank=comm_rank) # use batch size = 1 here to make sure that we do not drop a sample validation_loader = DataLoader( validation_set, 1, num_workers=min([pargs.max_inter_threads, pargs.local_batch_size]), pin_memory=True, drop_last=True) # log size of datasets logger.log_event(key="train_samples", value=train_set.global_size) if pargs.max_validation_steps is not None: val_size = min([ validation_set.global_size, pargs.max_validation_steps * pargs.local_batch_size * comm_size ]) else: val_size = validation_set.global_size logger.log_event(key="eval_samples", value=val_size) # do sanity check if pargs.max_validation_steps is not None: logger.log_event(key="invalid_submission") #for visualization #if visualize: # viz = vizc.CamVisualizer() # Train network if have_wandb and (comm_rank == 0): wandb.watch(net) step = start_step epoch = start_epoch current_lr = pargs.start_lr if not pargs.lr_schedule else scheduler.get_last_lr( )[0] stop_training = False net.train() # start trining logger.log_end(key="init_stop", sync=True) logger.log_start(key="run_start", sync=True) # training loop while True: # start epoch logger.log_start(key="epoch_start", metadata={ 'epoch_num': epoch + 1, 'step_num': step }, sync=True) # epoch loop for inputs, label, filename in train_loader: # send to device inputs = inputs.to(device) label = label.to(device) # forward pass outputs = net.forward(inputs) # Compute loss and average across nodes loss = criterion(outputs, label, weight=class_weights, fpw_1=fpw_1, fpw_2=fpw_2) # Backprop optimizer.zero_grad() if have_apex: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() optimizer.step() # step counter step += 1 if pargs.lr_schedule: current_lr = scheduler.get_last_lr()[0] scheduler.step() #visualize if requested #if (step % pargs.training_visualization_frequency == 0) and (comm_rank == 0): # # Compute predictions # predictions = torch.max(outputs, 1)[1] # # # extract sample id and data tensors # sample_idx = np.random.randint(low=0, high=label.shape[0]) # plot_input = inputs.detach()[sample_idx, 0,...].cpu().numpy() # plot_prediction = 
predictions.detach()[sample_idx,...].cpu().numpy() # plot_label = label.detach()[sample_idx,...].cpu().numpy() # # # create filenames # outputfile = os.path.basename(filename[sample_idx]).replace("data-", "training-").replace(".h5", ".png") # outputfile = os.path.join(plot_dir, outputfile) # # # plot # viz.plot(filename[sample_idx], outputfile, plot_input, plot_prediction, plot_label) # # #log if requested # if have_wandb: # img = Image.open(outputfile) # wandb.log({"train_examples": [wandb.Image(img, caption="Prediction vs. Ground Truth")]}, step = step) #log if requested if (step % pargs.logging_frequency == 0): # allreduce for loss loss_avg = loss.detach() dist.reduce(loss_avg, dst=0, op=dist.ReduceOp.SUM) loss_avg_train = loss_avg.item() / float(comm_size) # Compute score predictions = torch.max(outputs, 1)[1] iou = utils.compute_score(predictions, label, device_id=device, num_classes=3) iou_avg = iou.detach() dist.reduce(iou_avg, dst=0, op=dist.ReduceOp.SUM) iou_avg_train = iou_avg.item() / float(comm_size) logger.log_event(key="learning_rate", value=current_lr, metadata={ 'epoch_num': epoch + 1, 'step_num': step }) logger.log_event(key="train_accuracy", value=iou_avg_train, metadata={ 'epoch_num': epoch + 1, 'step_num': step }) logger.log_event(key="train_loss", value=loss_avg_train, metadata={ 'epoch_num': epoch + 1, 'step_num': step }) if have_wandb and (comm_rank == 0): wandb.log( {"train_loss": loss_avg.item() / float(comm_size)}, step=step) wandb.log( {"train_accuracy": iou_avg.item() / float(comm_size)}, step=step) wandb.log({"learning_rate": current_lr}, step=step) wandb.log({"epoch": epoch + 1}, step=step) # validation step if desired if (step % pargs.validation_frequency == 0): logger.log_start(key="eval_start", metadata={'epoch_num': epoch + 1}) #eval net.eval() count_sum_val = torch.Tensor([0.]).to(device) loss_sum_val = torch.Tensor([0.]).to(device) iou_sum_val = torch.Tensor([0.]).to(device) # disable gradients with torch.no_grad(): # iterate over validation sample step_val = 0 # only print once per eval at most visualized = False for inputs_val, label_val, filename_val in validation_loader: #send to device inputs_val = inputs_val.to(device) label_val = label_val.to(device) # forward pass outputs_val = net.forward(inputs_val) # Compute loss and average across nodes loss_val = criterion(outputs_val, label_val, weight=class_weights, fpw_1=fpw_1, fpw_2=fpw_2) loss_sum_val += loss_val #increase counter count_sum_val += 1. # Compute score predictions_val = torch.max(outputs_val, 1)[1] iou_val = utils.compute_score(predictions_val, label_val, device_id=device, num_classes=3) iou_sum_val += iou_val # Visualize #if (step_val % pargs.validation_visualization_frequency == 0) and (not visualized) and (comm_rank == 0): # #extract sample id and data tensors # sample_idx = np.random.randint(low=0, high=label_val.shape[0]) # plot_input = inputs_val.detach()[sample_idx, 0,...].cpu().numpy() # plot_prediction = predictions_val.detach()[sample_idx,...].cpu().numpy() # plot_label = label_val.detach()[sample_idx,...].cpu().numpy() # # #create filenames # outputfile = os.path.basename(filename[sample_idx]).replace("data-", "validation-").replace(".h5", ".png") # outputfile = os.path.join(plot_dir, outputfile) # # #plot # viz.plot(filename[sample_idx], outputfile, plot_input, plot_prediction, plot_label) # visualized = True # # #log if requested # if have_wandb: # img = Image.open(outputfile) # wandb.log({"eval_examples": [wandb.Image(img, caption="Prediction vs. 
Ground Truth")]}, step = step) #increase eval step counter step_val += 1 if (pargs.max_validation_steps is not None ) and step_val > pargs.max_validation_steps: break # average the validation loss dist.all_reduce(count_sum_val, op=dist.ReduceOp.SUM) dist.all_reduce(loss_sum_val, op=dist.ReduceOp.SUM) dist.all_reduce(iou_sum_val, op=dist.ReduceOp.SUM) loss_avg_val = loss_sum_val.item() / count_sum_val.item() iou_avg_val = iou_sum_val.item() / count_sum_val.item() # print results logger.log_event(key="eval_accuracy", value=iou_avg_val, metadata={ 'epoch_num': epoch + 1, 'step_num': step }) logger.log_event(key="eval_loss", value=loss_avg_val, metadata={ 'epoch_num': epoch + 1, 'step_num': step }) # log in wandb if have_wandb and (comm_rank == 0): wandb.log({"eval_loss": loss_avg_val}, step=step) wandb.log({"eval_accuracy": iou_avg_val}, step=step) if (iou_avg_val >= pargs.target_iou): logger.log_event(key="target_accuracy_reached", value=pargs.target_iou, metadata={ 'epoch_num': epoch + 1, 'step_num': step }) stop_training = True # set to train net.train() logger.log_end(key="eval_stop", metadata={'epoch_num': epoch + 1}) #save model if desired if (pargs.save_frequency > 0) and (step % pargs.save_frequency == 0): logger.log_start(key="save_start", metadata={ 'epoch_num': epoch + 1, 'step_num': step }, sync=True) if comm_rank == 0: checkpoint = { 'step': step, 'epoch': epoch, 'model': net.state_dict(), 'optimizer': optimizer.state_dict() } if have_apex: checkpoint['amp'] = amp.state_dict() torch.save( checkpoint, os.path.join( output_dir, pargs.model_prefix + "_step_" + str(step) + ".cpt")) logger.log_end(key="save_stop", metadata={ 'epoch_num': epoch + 1, 'step_num': step }, sync=True) # Stop training? if stop_training: break # log the epoch logger.log_end(key="epoch_stop", metadata={ 'epoch_num': epoch + 1, 'step_num': step }, sync=True) epoch += 1 # are we done? if epoch >= pargs.max_epochs or stop_training: break # run done logger.log_end(key="run_stop", sync=True, metadata={'status': 'success'})
def main(pargs): #init distributed training comm_local_group = comm.init(pargs.wireup_method, pargs.batchnorm_group_size) comm_rank = comm.get_rank() comm_local_rank = comm.get_local_rank() comm_size = comm.get_size() comm_local_size = comm.get_local_size() # set up logging pargs.logging_frequency = max([pargs.logging_frequency, 1]) log_file = os.path.normpath( os.path.join(pargs.output_dir, "logs", pargs.run_tag + ".log")) logger = mll.mlperf_logger(log_file, "deepcam", "Umbrella Corp.") logger.log_start(key="init_start", sync=True) logger.log_event(key="cache_clear") #set seed seed = pargs.seed logger.log_event(key="seed", value=seed) # Some setup torch.manual_seed(seed) if torch.cuda.is_available(): device = torch.device("cuda", comm_local_rank) torch.cuda.manual_seed(seed) torch.cuda.set_device(device) torch.backends.cudnn.benchmark = True else: device = torch.device("cpu") #set up directories root_dir = os.path.join(pargs.data_dir_prefix) output_dir = pargs.output_dir plot_dir = os.path.join(output_dir, "plots") if comm_rank == 0: if not os.path.isdir(output_dir): os.makedirs(output_dir) # logging of rank information logger.log_event(key="number_of_ranks", value=comm_size) logger.log_event(key="number_of_nodes", value=(comm_size // comm_local_size)) logger.log_event(key="accelerators_per_node", value=comm_local_size) # Logging hyperparameters logger.log_event(key="global_batch_size", value=(pargs.local_batch_size * comm_size)) logger.log_event(key="batchnorm_group_size", value=pargs.batchnorm_group_size) logger.log_event(key="gradient_accumulation_frequency", value=pargs.gradient_accumulation_frequency) logger.log_event(key="checkpoint", value=pargs.checkpoint) # Define architecture n_input_channels = len(pargs.channels) n_output_channels = 3 net = deeplab_xception.DeepLabv3_plus(n_input=n_input_channels, n_classes=n_output_channels, os=16, pretrained=False, rank=comm_rank, process_group=comm_local_group) net.to(device) #select loss #some magic numbers loss_pow = -0.125 class_weights = [ 0.986267818390377**loss_pow, 0.0004578708870701058**loss_pow, 0.01327431072255291**loss_pow ] # extract loss criterion = losses.CELoss(class_weights).to(device) criterion = torch.jit.script(criterion) #select optimizer optimizer = oh.get_optimizer(pargs, net, logger) #restart from checkpoint if desired if pargs.checkpoint is not None: checkpoint = torch.load(pargs.checkpoint, map_location=device) start_step = checkpoint['step'] start_epoch = checkpoint['epoch'] optimizer.load_state_dict(checkpoint['optimizer']) net.load_state_dict(checkpoint['model']) else: start_step = 0 start_epoch = 0 #broadcast model and optimizer state steptens = torch.tensor(np.array([start_step, start_epoch]), requires_grad=False).to(device) if dist.is_initialized(): dist.broadcast(steptens, src=0) #unpack the bcasted tensor start_step = int(steptens.cpu().numpy()[0]) start_epoch = int(steptens.cpu().numpy()[1]) #select scheduler scheduler = None if pargs.lr_schedule: pargs.lr_schedule["lr_warmup_steps"] = pargs.lr_warmup_steps pargs.lr_schedule["lr_warmup_factor"] = pargs.lr_warmup_factor scheduler = oh.get_lr_schedule(pargs.start_lr, pargs.lr_schedule, optimizer, logger, last_step=start_step) # print parameters if comm_rank == 0: print(net) print("Total number of elements:", sum(p.numel() for p in net.parameters() if p.requires_grad)) # get input shapes for the upcoming model preprocessing # input_shape: tshape, _ = get_datashapes(pargs, root_dir) input_shape = tuple([tshape[2], tshape[0], tshape[1]]) #distributed model 
parameters bucket_cap_mb = 25 if pargs.batchnorm_group_size > 1: bucket_cap_mb = 220 # get stream, relevant for graph capture ddp_net = DDP(net, device_ids=[device.index], output_device=device.index, find_unused_parameters=False, broadcast_buffers=False, bucket_cap_mb=bucket_cap_mb, gradient_as_bucket_view=False) # get stats handler here bnstats_handler = bns.BatchNormStatsSynchronize(ddp_net, reduction="mean", inplace=True) # create handles net_validate = ddp_net net_train = ddp_net # Set up the data feeder train_loader, train_size, validation_loader, validation_size = get_dataloaders( pargs, root_dir, device, seed, comm_size, comm_rank) # log size of datasets logger.log_event(key="train_samples", value=train_size) val_size = validation_size logger.log_event(key="eval_samples", value=val_size) # get start steps step = start_step epoch = start_epoch current_lr = pargs.start_lr if not pargs.lr_schedule else scheduler.get_last_lr( )[0] stop_training = False net_train.train() # start trining logger.log_end(key="init_stop", sync=True) logger.log_start(key="run_start", sync=True) # training loop while True: # start epoch logger.log_start(key="epoch_start", metadata={ 'epoch_num': epoch + 1, 'step_num': step }, sync=True) train_loader.sampler.set_epoch(epoch) # training step = train_step(pargs, comm_rank, comm_size, device, step, epoch, net_train, criterion, optimizer, scheduler, train_loader, logger) # average BN stats bnstats_handler.synchronize() # validation stop_training = validate(pargs, comm_rank, comm_size, device, step, epoch, net_validate, criterion, validation_loader, logger) # log the epoch logger.log_end(key="epoch_stop", metadata={ 'epoch_num': epoch + 1, 'step_num': step }, sync=True) epoch += 1 #save model if desired if (pargs.save_frequency > 0) and (epoch % pargs.save_frequency == 0): logger.log_start(key="save_start", metadata={ 'epoch_num': epoch + 1, 'step_num': step }, sync=True) if comm_rank == 0: checkpoint = { 'step': step, 'epoch': epoch, 'model': net_train.state_dict(), 'optimizer': optimizer.state_dict() } torch.save( checkpoint, os.path.join( output_dir, pargs.model_prefix + "_step_" + str(step) + ".cpt")) logger.log_end(key="save_stop", metadata={ 'epoch_num': epoch + 1, 'step_num': step }, sync=True) # are we done? if (epoch >= pargs.max_epochs) or stop_training: break # run done logger.log_end(key="run_stop", sync=True, metadata={'status': 'success'})
def inference(model, criterion, data_loader, dataset_name, save_result=False): logger = logging.getLogger('eve.' + __name__) device = torch.device('cuda') dataset = data_loader.dataset logger.info("Start evaluation on {} dataset ({} point clouds).".format( dataset_name, len(dataset))) if get_world_size() == 1: extra_args = {} else: rank = get_rank() extra_args = dict(desc="rank {}".format(rank)) start_time = time.time() model.eval() outputs_per_gpu = {} targets_per_gpu = {} file_path_per_gpu = {} times = [] with torch.no_grad(): for batch in tqdm(data_loader, **extra_args): locs, feats, targets, metadata = batch inputs = ME.SparseTensor(feats, coords=locs).to(device) targets = targets.to(device, non_blocking=True).long() torch.cuda.synchronize() start_time = time.time() outputs = model(inputs, y=targets) torch.cuda.synchronize() end_time = time.time() times.append(end_time - start_time) arch = cfg.MODEL.ARCHITECTURE if arch == 'minkunet4d' or arch == 'minkunet_eve': for batch_idx in range(len(metadata)): for time_idx in range(cfg.INPUT.VIDEO.NUM_FRAMES): inv_map = metadata[batch_idx][time_idx]['inverse_map'] file_path = metadata[batch_idx][time_idx]['file_path'] locs_frame = (locs[:, -1] == batch_idx) & \ (locs[:, -2] == time_idx) one_output, one_target = compute_one_frame( outputs, targets, locs_frame, inv_map) outputs_per_gpu[file_path] = one_output targets_per_gpu[file_path] = one_target file_path_per_gpu[file_path] = file_path else: # other minknet for batch_idx in range(len(metadata)): inv_map = metadata[batch_idx]['inverse_map'] file_path = metadata[batch_idx]['file_path'] # From MinkowskiEngine v0.3, batch index is on the first column locs_frame = locs[:, -1] == batch_idx one_output, one_target = compute_one_frame( outputs, targets, locs_frame, inv_map) outputs_per_gpu[file_path] = one_output targets_per_gpu[file_path] = one_target file_path_per_gpu[file_path] = file_path synchronize() logger.info("Total inference time: {}".format(np.sum(times))) # NOTE: `all_gather` will lead to CUDA out of memory # We use `scatter_gather` to save result of each process # in LOGS.DIR/tmp and will be cleared after gathering. outputs = scatter_gather(outputs_per_gpu) targets = scatter_gather(targets_per_gpu) file_paths = scatter_gather(file_path_per_gpu) if not is_main_process(): return None all_outputs = {k: v.numpy() for o in outputs for k, v in o.items()} all_targets = {k: v.numpy() for t in targets for k, v in t.items()} all_file_paths = {k: v for f in file_paths for k, v in f.items()} assert len(all_outputs) == len(dataset.all_files), \ '%d vs %d' % (len(all_outputs), len(dataset.all_files)) if cfg.LOGS.SAVE_RESULT is False: all_file_paths = None metrics = evaluate(dataset, all_outputs, all_targets, all_file_paths) return metrics
def compute_on_dataset_2stage(model, data_loader, device, logger): # two stage inference, for model with memory features. # first extract features and then do the inference cpu_device = torch.device("cpu") num_devices = get_world_size() dataset = data_loader.dataset if num_devices == 1: extra_args = {} else: rank = get_rank() extra_args = dict(desc="rank {}".format(rank)) loader_len = len(data_loader) person_feature_pool = MemoryPool() batch_info_list = [None]*loader_len logger.info("Stage 1: extracting clip features.") start_time = time.time() for i, batch in enumerate(tqdm(data_loader, **extra_args)): slow_clips, fast_clips, boxes, objects, extras, video_ids = batch slow_clips = slow_clips.to(device) fast_clips = fast_clips.to(device) boxes = [box.to(device) for box in boxes] objects = [None if (box is None) else box.to(device) for box in objects] movie_ids = [e["movie_id"] for e in extras] timestamps = [e["timestamp"] for e in extras] with torch.no_grad(): feature = model(slow_clips, fast_clips, boxes, objects, part_forward=0) person_feature = [ft.to(cpu_device) for ft in feature[0]] object_feature = [ft.to(cpu_device) for ft in feature[1]] # store person features into memory pool for movie_id, timestamp, p_ft, o_ft in zip(movie_ids, timestamps, person_feature, object_feature): person_feature_pool[movie_id, timestamp] = p_ft # store other information in list, for further inference batch_info_list[i] = (movie_ids, timestamps, video_ids, object_feature) # gather feature pools from different ranks synchronize() total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=total_time)) logger.info( "Stage 1 time: {} ({} s / video per device, on {} devices)".format( total_time_str, total_time * num_devices / len(dataset), num_devices ) ) feature_pool = all_gather(person_feature_pool) all_feature_pool_p = MemoryPool() all_feature_pool_p.update_list(feature_pool) del feature_pool, person_feature_pool # do the inference results_dict = {} logger.info("Stage 2: predicting with extracted feature.") start_time = time.time() for movie_ids, timestamps, video_ids, object_feature in tqdm(batch_info_list, **extra_args): current_feat_p = [all_feature_pool_p[movie_id, timestamp].to(device) for movie_id, timestamp in zip(movie_ids, timestamps)] current_feat_o = [ft_o.to(device) for ft_o in object_feature] extras = dict( person_pool=all_feature_pool_p, movie_ids=movie_ids, timestamps=timestamps, current_feat_p=current_feat_p, current_feat_o=current_feat_o, ) with torch.no_grad(): output = model(None, None, None, None, extras=extras, part_forward=1) output = [o.to(cpu_device) for o in output] results_dict.update( {video_id: result for video_id, result in zip(video_ids, output)} ) synchronize() total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=total_time)) logger.info( "Stage 2 time: {} ({} s / video per device, on {} devices)".format( total_time_str, total_time * num_devices / len(dataset), num_devices ) ) return results_dict
def do_train(cfg, model, data_loader_train, data_loader_val, optimizer, scheduler, checkpointer, device, arguments, summary_writer): # get logger logger = logging.getLogger(cfg.NAME) logger.info("Start training ...") logger.info("Size of training dataset: %s" % (data_loader_train.dataset.__len__())) logger.info("Size of validation dataset: %s" % (data_loader_val.dataset.__len__())) model.train() meters = MetricLogger(delimiter=" ") max_iter = len(data_loader_train) start_iter = arguments["iteration"] start_training_time = time.time() end = time.time() bar = TqdmBar(data_loader_train, start_iter, get_rank(), data_loader_train.__len__(), description="Training", use_bar=cfg.USE_BAR) for iteration, record in bar.bar: data_time = time.time() - end iteration += 1 arguments["iteration"] = iteration record = move_to_device(record, device) loss, _ = model(record) optimizer.zero_grad() loss["total_loss"].backward() optimizer.step() scheduler.step() # reduce losses over all GPUs for logging purposes loss_reduced = {key: value.cpu().item() for key, value in loss.items()} meters.update(**loss_reduced) batch_time = time.time() - end end = time.time() meters.update(time=batch_time, data=data_time) eta_seconds = meters.time.global_avg * (max_iter - iteration) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) lr = optimizer.param_groups[0]["lr"] bar.set_postfix({"lr": lr, "total_loss": loss_reduced["total_loss"]}) if iteration % cfg.SOLVER.LOGGER_PERIOD == 0 or iteration == max_iter: bar.clear(nolock=True) logger.info( meters.delimiter.join([ "iter: {iter:06d}", "lr: {lr:.6f}", "{meters}", "eta: {eta}", "mem: {memory:.0f}", ]).format( iter=iteration, lr=lr, meters=str(meters), eta=eta_string, memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, )) if summary_writer: write_summary(summary_writer, iteration, record=loss, group='Losses') write_summary(summary_writer, iteration, record={'lr': lr}, group='LR') if iteration % cfg.SOLVER.CHECKPOINT_PERIOD == 0: bar.clear(nolock=True) checkpointer.save("model_{:07d}".format(iteration), **arguments) if data_loader_val is not None: do_evaluation(cfg, model, data_loader_val, device, arguments, summary_writer) checkpointer.save("model_final", **arguments) bar.close() total_training_time = time.time() - start_training_time total_time_str = str(datetime.timedelta(seconds=total_training_time)) logger.info("Total training time: {} ({:.4f} s / it)".format( total_time_str, total_training_time / max_iter))
delimiter=" ", ) for test_tb_log_dir in test_tb_log_dirs ] if cfg.EVALUATE: for task_name, testloader, test_meter in zip(task_names, testloaders, test_meters): logging.info("Evaluating dataset: {}".format(task_name)) validate(testloader, net, criterion_eval, cfg, test_meter, global_step=0, device=device, local_rank=get_rank()) ############## training code ############################# if not cfg.EVALUATE: scaler = torch.cuda.amp.GradScaler(enabled=cfg.AMP.ENABLED) # start from epoch 0 or last checkpoint epoch start_epoch = checkpointer.epoch for epoch in range(start_epoch, cfg.OPTIM.EPOCHS): # wait for all processes before every epoch synchronize() logging.info("PROGRESS: {}%".format( round(100 * epoch / cfg.OPTIM.EPOCHS, 4))) global_step = epoch * len(trainloader) # an empirical rule for redraw projects in Performer if cfg.MODEL.ARCH.startswith(
def main(pargs):
    # init distributed training
    comm.init(pargs.wireup_method)
    comm_rank = comm.get_rank()
    comm_local_rank = comm.get_local_rank()
    comm_size = comm.get_size()

    # set seed
    seed = 333

    # some setup
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        printr("Using GPUs", 0)
        device = torch.device("cuda", comm_local_rank)
        torch.cuda.manual_seed(seed)
        # necessary for AMP to work
        torch.cuda.set_device(device)
    else:
        printr("Using CPUs", 0)
        device = torch.device("cpu")

    # set up directories
    root_dir = os.path.join(pargs.data_dir_prefix)
    output_dir = pargs.output_dir
    plot_dir = os.path.join(output_dir, "plots")
    if comm_rank == 0:
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

    # define architecture
    n_input_channels = len(pargs.channels)
    n_output_channels = 3
    net = deeplab_xception.DeepLabv3_plus(n_input=n_input_channels,
                                          n_classes=n_output_channels,
                                          os=16,
                                          pretrained=False,
                                          rank=comm_rank)
    net.to(device)

    # select loss
    loss_pow = pargs.loss_weight_pow
    # some magic numbers
    class_weights = [
        0.986267818390377**loss_pow, 0.0004578708870701058**loss_pow,
        0.01327431072255291**loss_pow
    ]
    fpw_1 = 2.61461122397522257612
    fpw_2 = 1.71641974795896018744
    criterion = losses.fp_loss

    # select optimizer
    optimizer = None
    if pargs.optimizer == "Adam":
        optimizer = optim.Adam(net.parameters(),
                               lr=pargs.start_lr,
                               eps=pargs.adam_eps,
                               weight_decay=pargs.weight_decay)
    elif pargs.optimizer == "AdamW":
        optimizer = optim.AdamW(net.parameters(),
                                lr=pargs.start_lr,
                                eps=pargs.adam_eps,
                                weight_decay=pargs.weight_decay)
    elif have_apex and (pargs.optimizer == "LAMB"):
        optimizer = aoptim.FusedLAMB(net.parameters(),
                                     lr=pargs.start_lr,
                                     eps=pargs.adam_eps,
                                     weight_decay=pargs.weight_decay)
    else:
        raise NotImplementedError("Error, optimizer {} not supported".format(
            pargs.optimizer))

    if have_apex:
        # wrap model and optimizer into amp
        net, optimizer = amp.initialize(net,
                                        optimizer,
                                        opt_level=pargs.amp_opt_level)

    # make model distributed
    net = DDP(net)

    # select scheduler
    if pargs.lr_schedule:
        scheduler = ph.get_lr_schedule(pargs.start_lr,
                                       pargs.lr_schedule,
                                       optimizer,
                                       last_step=0)

    # set up the data feeder
    # train
    train_dir = os.path.join(root_dir, "train")
    train_set = cam.CamDataset(train_dir,
                               statsfile=os.path.join(root_dir, 'stats.h5'),
                               channels=pargs.channels,
                               shuffle=True,
                               preprocess=True,
                               comm_size=comm_size,
                               comm_rank=comm_rank)
    train_loader = DataLoader(
        train_set,
        pargs.local_batch_size,
        num_workers=min([pargs.max_inter_threads, pargs.local_batch_size]),
        drop_last=True)

    printr(
        '{:14.4f} REPORT: starting warmup'.format(
            dt.datetime.now().timestamp()), 0)
    step = 0
    current_lr = pargs.start_lr if not pargs.lr_schedule else scheduler.get_last_lr()[0]
    net.train()

    while True:
        # for inputs_raw, labels, source in train_loader:
        for inputs, label, filename in train_loader:

            # print status
            if step == pargs.num_warmup_steps:
                printr(
                    '{:14.4f} REPORT: starting profiling'.format(
                        dt.datetime.now().timestamp()), 0)

            # forward pass
            with Profile(pargs, "Forward", step):
                # send data to device
                inputs = inputs.to(device)
                label = label.to(device)

                # compute output
                outputs = net.forward(inputs)

                # compute loss
                loss = criterion(outputs,
                                 label,
                                 weight=class_weights,
                                 fpw_1=fpw_1,
                                 fpw_2=fpw_2)

                # allreduce for loss
                loss_avg = loss.detach()
                dist.reduce(loss_avg, dst=0, op=dist.ReduceOp.SUM)

                # compute score
                predictions = torch.max(outputs, 1)[1]
                iou = utils.compute_score(predictions,
                                          label,
                                          device_id=device,
                                          num_classes=3)
                iou_avg = iou.detach()
                dist.reduce(iou_avg, dst=0, op=dist.ReduceOp.SUM)

            # backprop
            with Profile(pargs, "Backward", step):
                # reset grads
                optimizer.zero_grad()

                # compute grads
                if have_apex:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

            # weight update
            with Profile(pargs, "Optimizer", step):
                # update weights
                optimizer.step()

                # advance the scheduler
                if pargs.lr_schedule:
                    current_lr = scheduler.get_last_lr()[0]
                    scheduler.step()

            # step counter
            step += 1

            # are we done?
            if step >= (pargs.num_warmup_steps + pargs.num_profile_steps):
                break

        # need to check here too
        if step >= (pargs.num_warmup_steps + pargs.num_profile_steps):
            break

    printr(
        '{:14.4f} REPORT: finishing profiling'.format(
            dt.datetime.now().timestamp()), 0)
def printr(msg, rank=0):
    # print only on the requested rank to avoid duplicated output
    if comm.get_rank() == rank:
        print(msg)
def main():
    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Inference")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()

    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    distributed = num_gpus > 1

    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    # Merge config file.
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    # Print experimental infos.
    save_dir = ""
    logger = setup_logger("AlphAction", save_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(cfg)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + get_pretty_env_info())

    # Build the model.
    model = build_detection_model(cfg)
    model.to("cuda")

    # Load weights.
    output_dir = cfg.OUTPUT_DIR
    checkpointer = ActionCheckpointer(cfg, model, save_dir=output_dir)
    checkpointer.load(cfg.MODEL.WEIGHT)

    output_folders = [None] * len(cfg.DATASETS.TEST)
    dataset_names = cfg.DATASETS.TEST
    mem_active = has_memory(cfg.IA_STRUCTURE)
    if cfg.OUTPUT_DIR:
        for idx, dataset_name in enumerate(dataset_names):
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference",
                                         dataset_name)
            os.makedirs(output_folder, exist_ok=True)
            output_folders[idx] = output_folder

    # Do inference.
    data_loaders_test = make_data_loader(cfg,
                                         is_train=False,
                                         is_distributed=distributed)
    for output_folder, dataset_name, data_loader_test in zip(
            output_folders, dataset_names, data_loaders_test):
        inference(
            model,
            data_loader_test,
            dataset_name,
            mem_active=mem_active,
            output_folder=output_folder,
        )
        synchronize()
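# The synchronize() helper imported by these scripts is not shown in this
# section. A minimal sketch of the usual pattern, assuming it is simply a
# barrier that degrades to a no-op outside of multi-process runs:
import torch.distributed as dist


def synchronize():
    """Barrier across all processes; no-op for single-process runs."""
    if not dist.is_available():
        return
    if not dist.is_initialized():
        return
    if dist.get_world_size() == 1:
        return
    dist.barrier()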
def main():
    # Add arguments
    parser = argparse.ArgumentParser(
        description="Vision Research Toolkit by PyTorch")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()

    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    # make config
    cfg = make_config(args.config_file, args.opts)

    # obtain the absolute dir of the project
    project_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    if cfg.CHECKPOINTER.DIR:
        if cfg.CHECKPOINTER.DIR[0] != os.sep:
            # the checkpointer dir is not an absolute path
            cfg.CHECKPOINTER.DIR = os.path.join(project_dir, cfg.CHECKPOINTER.DIR)
    else:
        cfg.CHECKPOINTER.DIR = os.path.join(project_dir, 'log')
    if not cfg.CHECKPOINTER.NAME:
        cfg.CHECKPOINTER.NAME = strftime("%Y-%m-%d-%H-%M-%S", localtime())
    cfg.freeze()

    save_dir = os.path.join(cfg.CHECKPOINTER.DIR, cfg.CHECKPOINTER.NAME)
    mkdir(save_dir)

    # Init logger
    logger = setup_logger(cfg.NAME, save_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info ...")
    logger.info("\n" + collect_env_info())
    logger.info("Loaded configuration file {}".format(args.config_file))
    logger.info("Running with config:\n{}".format(cfg))

    output_config_path = os.path.join(save_dir, os.path.basename(args.config_file))
    logger.info("Saving config into: {}".format(output_config_path))
    # save the overloaded model config in the output directory
    save_config(cfg, output_config_path)

    train(cfg, args.local_rank, args.distributed)

    return