def main():
    args = parse_args()

    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(
            backend="nccl",
            init_method="env://",
        )
        comm.synchronize()

    cfg = get_default_cfg()
    if args.config_file:
        cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.output_dir
    if output_dir:
        misc.mkdir(output_dir)

    logger = setup_logger("EfficientDet", output_dir, comm.get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)
    logger.info("Loaded configuration file {}".format(args.config_file))
    logger.info("Running with config:\n{}".format(cfg))

    output_config_path = os.path.join(output_dir, 'config.yml')
    logger.info("Saving config into: {}".format(output_config_path))
    misc.save_config(cfg, output_config_path)

    model = train(cfg, args.local_rank, args.distributed)
def compute_scores_for_inference(self, clusters_mx, per_example_negs):
    """Embeds every unique example referenced by `clusters_mx` and
    `per_example_negs`, then computes dot-product affinities for all
    positive pairs (examples in the same cluster row) and all sampled
    negative pairs, returning them as a sparse affinity graph.
    Only rank 0 builds and returns the graph; other ranks return None."""
    args = self.args

    # get all of the unique examples
    examples = clusters_mx.data.tolist()
    examples.extend(flatten(per_example_negs.tolist()))
    examples = unique(examples)
    examples = list(filter(lambda x: x >= 0, examples))

    # create dataset and dataloader
    dataset = InferenceEmbeddingDataset(args, examples, args.train_cache_dir)
    dataloader = InferenceEmbeddingDataLoader(args, dataset)

    # get the unique idxs and embeds for each idx
    idxs, embeds = self.get_embeddings(dataloader, evaluate=False)

    sparse_graph = None
    if get_rank() == 0:
        # create inverse index for mapping
        inverse_idxs = {v: k for k, v in enumerate(idxs)}

        ## make the list of pairs of dot products we need
        _row = clusters_mx.row

        # positives:
        local_pos_a, local_pos_b = np.where(
            np.triu(_row[np.newaxis, :] == _row[:, np.newaxis], k=1))
        pos_a = clusters_mx.data[local_pos_a]
        pos_b = clusters_mx.data[local_pos_b]

        # negatives:
        local_neg_a = np.tile(
            np.arange(per_example_negs.shape[0])[:, np.newaxis],
            (1, per_example_negs.shape[1])).flatten()
        neg_a = clusters_mx.data[local_neg_a]
        neg_b = per_example_negs.flatten()
        neg_mask = (neg_b != -1)
        neg_a = neg_a[neg_mask]
        neg_b = neg_b[neg_mask]

        # create subset of the sparse graph we care about
        a = np.concatenate((pos_a, neg_a), axis=0)
        b = np.concatenate((pos_b, neg_b), axis=0)
        edges = list(zip(a, b))
        affinities = [
            np.dot(embeds[inverse_idxs[i]], embeds[inverse_idxs[j]])
            for i, j in edges
        ]

        # convert to coo_matrix
        edges = np.asarray(edges).T
        affinities = np.asarray(affinities)
        _sparse_num = np.max(edges) + 1
        sparse_graph = coo_matrix((affinities, edges),
                                  shape=(_sparse_num, _sparse_num))

    synchronize()
    return sparse_graph
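# The embedding/training snippets in this collection (compute_scores_for_inference,
# get_embeddings, the _train_* methods) call `flatten` and `unique` helpers that are
# not defined here. A minimal sketch of what such helpers could look like --
# hypothetical implementations consistent with how they are used, not the
# original utilities:

def flatten(list_of_lists):
    """Flatten one level of nesting: [[1, 2], [3]] -> [1, 2, 3]."""
    return [item for sublist in list_of_lists for item in sublist]


def unique(items):
    """Deduplicate while preserving first-occurrence order."""
    seen = set()
    return [x for x in items if not (x in seen or seen.add(x))]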
def get_embeddings(self, dataloader, evaluate=True):
    args = self.args
    self.model.eval()

    local_step = 0
    push_to_cpu_steps = 32
    idxs_list = []
    embeds_list = []
    master_idxs_list = []
    master_embeds_list = []

    def _synchronize_lists(_embeds_list, _idxs_list):
        gathered_data = all_gather({
            'embeds_list': _embeds_list,
            'idxs_list': _idxs_list,
        })
        if get_rank() == 0:
            _embeds_list = [d['embeds_list'] for d in gathered_data]
            _embeds_list = flatten(_embeds_list)
            _embeds_list = [x.cpu() for x in _embeds_list]
            _idxs_list = [d['idxs_list'] for d in gathered_data]
            _idxs_list = flatten(_idxs_list)
            _idxs_list = [x.cpu() for x in _idxs_list]
            master_embeds_list.extend(_embeds_list)
            master_idxs_list.extend(_idxs_list)
        synchronize()
        return [], []

    batch_iterator = tqdm(dataloader,
                          desc='Getting embeddings...',
                          disable=(not evaluate
                                   or get_rank() != 0
                                   or args.disable_logging))
    for batch in batch_iterator:
        batch = tuple(t.to(args.device, non_blocking=True) for t in batch)
        with torch.no_grad():
            inputs = {
                'input_ids': batch[1],
                'attention_mask': batch[2],
                'token_type_ids': batch[3],
                'concat_input': False
            }
            embeds_list.append(self.model(**inputs))
            idxs_list.append(batch[0])
        local_step += 1
        if local_step % push_to_cpu_steps == 0:
            embeds_list, idxs_list = _synchronize_lists(
                embeds_list, idxs_list)

    embeds_list, idxs_list = _synchronize_lists(embeds_list, idxs_list)

    idxs, embeds = None, None
    if get_rank() == 0:
        idxs = torch.cat(master_idxs_list, dim=0).numpy()
        idxs, indices = np.unique(idxs, return_index=True)
        embeds = torch.cat(master_embeds_list, dim=0).numpy()
        embeds = embeds[indices]
    synchronize()
    return idxs, embeds
def setup_gpu():
    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    distributed = num_gpus > 1
    if distributed:
        print("use gpu...")
        # NOTE: this pins every process to GPU 0; pass the local rank instead
        # when launching more than one process per node.
        torch.cuda.set_device(0)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()
    return distributed
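# The snippets in this collection assume a small set of torch.distributed
# wrappers (`synchronize`, `get_rank`, `get_world_size`, `is_main_process`)
# imported from a comm/dist utility module that is not shown. A minimal
# sketch of such helpers, assuming torch.distributed -- an illustration of
# the pattern, not the exact upstream code:

import torch.distributed as dist


def get_world_size():
    if not dist.is_available() or not dist.is_initialized():
        return 1
    return dist.get_world_size()


def get_rank():
    if not dist.is_available() or not dist.is_initialized():
        return 0
    return dist.get_rank()


def is_main_process():
    return get_rank() == 0


def synchronize():
    """Barrier: wait until all distributed processes reach this point."""
    if get_world_size() == 1:
        return
    dist.barrier()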
def _train_softmax(self, dataset_list, metadata):
    args = self.args

    losses = []
    time_per_dataset = []
    dataset_sizes = []

    self.model.train()
    self.model.zero_grad()

    criterion = nn.CrossEntropyLoss()

    for dataset in dataset_list:
        _dataset_start_time = time.time()
        dataset_sizes.append(len(dataset))
        dataloader = SoftmaxEmbeddingDataLoader(args, dataset)
        for batch in dataloader:
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[1],
                'attention_mask': batch[2],
                'token_type_ids': batch[3],
                'concat_input': False
            }
            outputs = self.model(**inputs)
            pos_neg_dot_prods = torch.sum(
                outputs[:, 0:1, :] * outputs[:, 1:, :], dim=-1)
            target = torch.zeros(pos_neg_dot_prods.shape[0],
                                 dtype=torch.long).cuda()
            loss = criterion(pos_neg_dot_prods, target)
            losses.append(loss.item())
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                           args.max_grad_norm)
            self.optimizer.step()
            self.scheduler.step()
            self.model.zero_grad()
        time_per_dataset.append(time.time() - _dataset_start_time)

    gathered_data = all_gather({
        'losses': losses,
    })

    if get_rank() == 0:
        losses = flatten([d['losses'] for d in gathered_data])
        loss = np.mean(losses)
        synchronize()
        return {'embed_loss': loss}
    else:
        synchronize()
        return None
def _train_threshold(self, dataset_list, metadata):
    args = self.args

    losses = []
    time_per_dataset = []
    dataset_sizes = []

    self.model.train()
    self.model.zero_grad()

    random.shuffle(dataset_list)

    for dataset in dataset_list:
        _dataset_start_time = time.time()
        dataset_sizes.append(len(dataset))
        dataloader = ScaledPairsEmbeddingDataLoader(args, dataset)
        for batch in dataloader:
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[2],
                'attention_mask': batch[3],
                'token_type_ids': batch[4],
                'concat_input': False
            }
            outputs = self.model(**inputs)
            dot_prods = torch.sum(outputs[:, 0, :] * outputs[:, 1, :], dim=-1)
            loss = torch.mean(F.relu(args.margin - (batch[0] * dot_prods)))
            losses.append(loss.item())
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                           args.max_grad_norm)
            self.optimizer.step()
            self.scheduler.step()
            self.model.zero_grad()
        time_per_dataset.append(time.time() - _dataset_start_time)

    gathered_data = all_gather({
        'losses': losses,
    })

    if get_rank() == 0:
        losses = flatten([d['losses'] for d in gathered_data])
        loss = np.mean(losses)
        synchronize()
        return {'embed_loss': loss}
    else:
        synchronize()
        return None
def inference(
    model,
    data_loader,
    dataset_name,
    iou_types=("bbox", ),
    box_only=False,
    device="cuda",
    expected_results=(),
    expected_results_sigma_tol=4,
    output_folder=None,
):
    # convert to a torch.device for efficiency
    device = torch.device(device)
    num_devices = (torch.distributed.get_world_size()
                   if torch.distributed.is_initialized() else 1)
    logger = logging.getLogger("maskrcnn_benchmark.inference")
    dataset = data_loader.dataset
    logger.info("Start evaluation on {} dataset({} images).".format(
        dataset_name, len(dataset)))
    start_time = time.time()
    predictions = compute_on_dataset(model, data_loader, device)
    # wait for all processes to complete before measuring the time
    synchronize()
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    logger.info(
        "Total inference time: {} ({} s / img per device, on {} devices)".
        format(total_time_str, total_time * num_devices / len(dataset),
               num_devices))

    predictions = _accumulate_predictions_from_multiple_gpus(predictions)
    if not is_main_process():
        return

    if output_folder:
        torch.save(predictions, os.path.join(output_folder, "predictions.pth"))

    extra_args = dict(
        box_only=box_only,
        iou_types=iou_types,
        expected_results=expected_results,
        expected_results_sigma_tol=expected_results_sigma_tol,
    )

    return evaluate(dataset=dataset,
                    predictions=predictions,
                    output_folder=output_folder,
                    **extra_args)
def _synchronize_lists(_embeds_list, _idxs_list):
    gathered_data = all_gather({
        'embeds_list': _embeds_list,
        'idxs_list': _idxs_list,
    })
    if get_rank() == 0:
        _embeds_list = [d['embeds_list'] for d in gathered_data]
        _embeds_list = flatten(_embeds_list)
        _embeds_list = [x.cpu() for x in _embeds_list]
        _idxs_list = [d['idxs_list'] for d in gathered_data]
        _idxs_list = flatten(_idxs_list)
        _idxs_list = [x.cpu() for x in _idxs_list]
        master_embeds_list.extend(_embeds_list)
        master_idxs_list.extend(_idxs_list)
    synchronize()
    return [], []
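# `_synchronize_lists` gathers arbitrary Python objects (dicts of tensor lists)
# from every process through an `all_gather` helper that is not defined in this
# collection. A minimal sketch of such a helper, assuming a NCCL backend with
# one GPU per process -- a common serialize-and-pad pattern, not necessarily
# the authors' exact implementation:

import pickle

import numpy as np
import torch
import torch.distributed as dist


def all_gather(data):
    """Gather a picklable object from every process into a list."""
    if not dist.is_available() or not dist.is_initialized():
        return [data]
    world_size = dist.get_world_size()
    if world_size == 1:
        return [data]

    # serialize the object to a CUDA uint8 tensor
    buffer = pickle.dumps(data)
    tensor = torch.from_numpy(
        np.frombuffer(buffer, dtype=np.uint8).copy()).to("cuda")

    # exchange payload sizes so every rank can pad to a common length
    local_size = tensor.numel()
    size_tensor = torch.tensor([local_size], dtype=torch.int64, device="cuda")
    size_list = [torch.zeros(1, dtype=torch.int64, device="cuda")
                 for _ in range(world_size)]
    dist.all_gather(size_list, size_tensor)
    size_list = [int(s.item()) for s in size_list]
    max_size = max(size_list)

    if local_size < max_size:
        padding = torch.zeros(max_size - local_size,
                              dtype=torch.uint8, device="cuda")
        tensor = torch.cat((tensor, padding), dim=0)

    tensor_list = [torch.empty(max_size, dtype=torch.uint8, device="cuda")
                   for _ in range(world_size)]
    dist.all_gather(tensor_list, tensor)

    # deserialize each rank's payload, dropping the padding
    return [pickle.loads(t[:size].cpu().numpy().tobytes())
            for size, t in zip(size_list, tensor_list)]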
def init_pretrained_weights(key):
    """Initializes model with pretrained weights.

    Layers that don't match with pretrained layers in name or size are kept unchanged.
    """
    import os
    import errno
    import gdown

    def _get_torch_home():
        ENV_TORCH_HOME = 'TORCH_HOME'
        ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME'
        DEFAULT_CACHE_DIR = '~/.cache'
        torch_home = os.path.expanduser(
            os.getenv(
                ENV_TORCH_HOME,
                os.path.join(os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR),
                             'torch')))
        return torch_home

    torch_home = _get_torch_home()
    model_dir = os.path.join(torch_home, 'checkpoints')
    try:
        os.makedirs(model_dir)
    except OSError as e:
        if e.errno == errno.EEXIST:
            # Directory already exists, ignore.
            pass
        else:
            # Unexpected OSError, re-raise.
            raise

    filename = model_urls[key].split('/')[-1]
    cached_file = os.path.join(model_dir, filename)

    if not os.path.exists(cached_file):
        if comm.is_main_process():
            gdown.download(model_urls[key], cached_file, quiet=False)

    comm.synchronize()
    logger.info(f"Loading pretrained model from {cached_file}")
    state_dict = torch.load(cached_file, map_location=torch.device('cpu'))

    return state_dict
def _build_temp_sparse_graph(self, clusters_mx, per_example_negs):
    args = self.args

    # get all of the unique examples
    examples = clusters_mx.data.tolist()
    examples.extend(flatten(per_example_negs.tolist()))
    examples = unique(examples)
    examples = list(filter(lambda x: x >= 0, examples))

    sparse_graph = None
    if get_rank() == 0:
        ## make the list of pairs of dot products we need
        _row = clusters_mx.row

        # positives:
        local_pos_a, local_pos_b = np.where(
            np.triu(_row[np.newaxis, :] == _row[:, np.newaxis], k=1))
        pos_a = clusters_mx.data[local_pos_a]
        pos_b = clusters_mx.data[local_pos_b]

        # negatives:
        local_neg_a = np.tile(
            np.arange(per_example_negs.shape[0])[:, np.newaxis],
            (1, per_example_negs.shape[1])).flatten()
        neg_a = clusters_mx.data[local_neg_a]
        neg_b = per_example_negs.flatten()
        neg_mask = (neg_b != -1)
        neg_a = neg_a[neg_mask]
        neg_b = neg_b[neg_mask]

        # create subset of the sparse graph we care about
        a = np.concatenate((pos_a, neg_a), axis=0)
        b = np.concatenate((pos_b, neg_b), axis=0)
        edges = list(zip(a, b))
        affinities = [0.0 for i, j in edges]

        # convert to coo_matrix
        edges = np.asarray(edges).T
        affinities = np.asarray(affinities)
        _sparse_num = np.max(edges) + 1
        sparse_graph = coo_matrix((affinities, edges),
                                  shape=(_sparse_num, _sparse_num))

    synchronize()
    return sparse_graph
def cache_url(url, model_dir=None, progress=True):
    r"""Loads the Torch serialized object at the given URL.

    If the object is already present in `model_dir`, it's deserialized and
    returned. The filename part of the URL should follow the naming convention
    ``filename-<sha256>.ext`` where ``<sha256>`` is the first eight or more
    digits of the SHA256 hash of the contents of the file. The hash is used to
    ensure unique names and to verify the contents of the file.

    The default value of `model_dir` is ``$TORCH_HOME/models`` where
    ``$TORCH_HOME`` defaults to ``~/.torch``. The default directory can be
    overridden with the ``$TORCH_MODEL_ZOO`` environment variable.

    Args:
        url (string): URL of the object to download
        model_dir (string, optional): directory in which to save the object
        progress (bool, optional): whether or not to display a progress bar to stderr

    Example:
        >>> cached_file = utils.model_zoo.cache_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth')
    """
    if model_dir is None:
        torch_home = os.path.expanduser(os.getenv("TORCH_HOME", "~/.torch"))
        model_dir = os.getenv("TORCH_MODEL_ZOO",
                              os.path.join(torch_home, "models"))
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    parts = urlparse(url)
    filename = os.path.basename(parts.path)
    if filename == "model_final.pkl":
        # workaround as pre-trained Caffe2 models from Detectron have all the same filename
        # so make the full path the filename by replacing / with _
        filename = parts.path.replace("/", "_")
    cached_file = os.path.join(model_dir, filename)
    if not os.path.exists(cached_file) and is_main_process():
        sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file))
        hash_prefix = HASH_REGEX.search(filename)
        if hash_prefix is not None:
            hash_prefix = hash_prefix.group(1)
            # workaround: Caffe2 models don't have a hash, but follow the R-50 convention,
            # which matches the hash PyTorch uses. So we skip the hash matching
            # if the hash_prefix is less than 6 characters
            if len(hash_prefix) < 6:
                hash_prefix = None
        _download_url_to_file(url, cached_file, hash_prefix, progress=progress)
    synchronize()
    return cached_file
def val_in_train(model, criterion, dataset_name_val, data_loader_val,
                 tblogger, iteration, checkpointer, distributed):
    logger = logging.getLogger('eve.' + __name__)

    if distributed:
        model_val = model.module
    else:
        model_val = model

    # only main process will return result
    metrics = inference(model_val, criterion, data_loader_val,
                        dataset_name_val)
    synchronize()

    if is_main_process():
        if tblogger is not None:
            for k, v in metrics.items():
                tblogger.add_scalar('val/' + k, v, iteration)
                logger.info("{}: {}".format(k, v))
        return metrics
    else:
        return None
def inference(
    model,
    data_loader,
    dataset_name,
    mem_active=False,
    output_folder=None,
):
    # convert to a torch.device for efficiency
    device = torch.device("cuda")
    num_devices = get_world_size()
    logger = logging.getLogger("AlphAction.inference")
    dataset = data_loader.dataset
    logger.info("Start evaluation on {} dataset({} videos).".format(
        dataset_name, len(dataset)))
    start_time = time.time()
    predictions = compute_on_dataset(model, data_loader, device, logger,
                                     mem_active)
    # wait for all processes to complete before measuring the time
    synchronize()
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    logger.info(
        "Total inference time: {} ({} s / video per device, on {} devices)".format(
            total_time_str, total_time * num_devices / len(dataset),
            num_devices))

    predictions = _accumulate_predictions_from_multiple_gpus(predictions)
    if not is_main_process():
        return

    if output_folder:
        torch.save(predictions, os.path.join(output_folder, "predictions.pth"))

    return evaluate(
        dataset=dataset,
        predictions=predictions,
        output_folder=output_folder,
    )
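# The inference routines above merge per-GPU prediction dicts with an
# `_accumulate_predictions_from_multiple_gpus` helper that is not included in
# this collection. A minimal sketch, assuming the `all_gather` and
# `is_main_process` wrappers sketched earlier and predictions keyed by
# image/video id -- an illustration of the pattern, not the exact upstream code:

def _accumulate_predictions_from_multiple_gpus(predictions_per_gpu):
    """Collect {id: prediction} dicts from every rank onto rank 0."""
    all_predictions = all_gather(predictions_per_gpu)
    if not is_main_process():
        return None
    # merge the dicts coming from every process
    predictions = {}
    for per_gpu in all_predictions:
        predictions.update(per_gpu)
    # return a list ordered by id; ids may be non-contiguous if the
    # distributed sampler dropped or duplicated samples
    ids = sorted(predictions.keys())
    return [predictions[i] for i in ids]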
        net, criterion_eval, cfg, test_meter,
        global_step=0, device=device, local_rank=get_rank())

    ############## training code #############################
    if not cfg.EVALUATE:
        scaler = torch.cuda.amp.GradScaler(enabled=cfg.AMP.ENABLED)
        # start from epoch 0 or last checkpoint epoch
        start_epoch = checkpointer.epoch
        for epoch in range(start_epoch, cfg.OPTIM.EPOCHS):
            # wait for all processes before every epoch
            synchronize()
            logging.info("PROGRESS: {}%".format(
                round(100 * epoch / cfg.OPTIM.EPOCHS, 4)))
            global_step = epoch * len(trainloader)

            # an empirical rule for redrawing projections in Performer
            if cfg.MODEL.ARCH.startswith(
                    'msvit') and cfg.MODEL.VIT.MSVIT.ATTN_TYPE == "performer":
                if hasattr(net, 'module'):
                    net.module.feature_redraw_interval = 1 + 5 * epoch
                else:
                    net.feature_redraw_interval = 1 + 5 * epoch

            if cfg.MODEL.ARCH.startswith(
                    'msvit') and cfg.MODEL.VIT.MSVIT.ATTN_TYPE.startswith(
                        'longformer'):
def do_infer( model, data_loader, dataset_name, device="cuda", output_folder=None, ): # convert to a torch.device for efficiency device = torch.device(device) num_devices = get_world_size() logger = logging.getLogger("EfficientDet.inference") dataset = data_loader.dataset logger.info("Start evaluation on {} dataset({} images).".format( dataset_name, len(dataset))) total_timer = Timer() inference_timer = Timer() total_timer.tic() predictions = compute_on_dataset(model, data_loader, device, inference_timer) # wait for all processes to complete before measuring the time synchronize() total_time = total_timer.toc() total_time_str = get_time_str(total_time) logger.info( "Total run time: {} ({} s / img per device, on {} devices)".format( total_time_str, total_time * num_devices / len(dataset), num_devices)) total_infer_time = get_time_str(inference_timer.total_time) logger.info( "Model inference time: {} ({} s / img per device, on {} devices)". format( total_infer_time, inference_timer.total_time * num_devices / len(dataset), num_devices, )) predictions = _accumulate_predictions_from_multiple_gpus(predictions) if not is_main_process(): return coco_results = [] image_ids = [] for image_id, prediction in enumerate(predictions): original_id = dataset.image_ids[image_id] image_ids.append(original_id) coco_results.extend([{ "image_id": original_id, "category_id": dataset.return_coco_label(e['class']), "bbox": e['bbox'], "score": e['score'] } for e in prediction]) map_05_09 = 0 with tempfile.NamedTemporaryFile() as f: file_path = f.name output_folder = './' if output_folder: file_path = os.path.join(output_folder, 'bbox_results.json') with open(file_path, "w") as w_obj: json.dump(coco_results, w_obj) # load results in COCO evaluation tool coco_true = dataset.coco coco_pred = coco_true.loadRes(file_path) # run COCO evaluation coco_eval = COCOeval(coco_true, coco_pred, 'bbox') coco_eval.params.imgIds = image_ids coco_eval.evaluate() coco_eval.accumulate() coco_eval.summarize() map_05_09 = coco_eval.stats[0] return map_05_09
def main():
    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Inference")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    distributed = num_gpus > 1

    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    # Merge config file.
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    # Print experimental infos.
    save_dir = ""
    logger = setup_logger("AlphAction", save_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(cfg)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + get_pretty_env_info())

    # Build the model.
    model = build_detection_model(cfg)
    model.to("cuda")

    # load weight.
    output_dir = cfg.OUTPUT_DIR
    checkpointer = ActionCheckpointer(cfg, model, save_dir=output_dir)
    checkpointer.load(cfg.MODEL.WEIGHT)

    output_folders = [None] * len(cfg.DATASETS.TEST)
    dataset_names = cfg.DATASETS.TEST
    mem_active = has_memory(cfg.IA_STRUCTURE)
    if cfg.OUTPUT_DIR:
        for idx, dataset_name in enumerate(dataset_names):
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference",
                                         dataset_name)
            os.makedirs(output_folder, exist_ok=True)
            output_folders[idx] = output_folder

    # Do inference.
    data_loaders_test = make_data_loader(cfg,
                                         is_train=False,
                                         is_distributed=distributed)
    for output_folder, dataset_name, data_loader_test in zip(
            output_folders, dataset_names, data_loaders_test):
        inference(
            model,
            data_loader_test,
            dataset_name,
            mem_active=mem_active,
            output_folder=output_folder,
        )
        synchronize()
def main(): # Add augments parser = argparse.ArgumentParser(description="Vision Research Toolkit by PyTorch") parser.add_argument( "--config-file", default="", metavar="FILE", help="path to config file", type=str, ) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument( "--skip-test", dest="skip_test", help="Do not test the final model", action="store_true" ) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER ) args = parser.parse_args() num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 args.distributed = num_gpus > 1 if args.distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group( backend="nccl", init_method="env://" ) synchronize() # make config cfg = make_config(args.config_file, args.opts) # obtain absolute dir of project project_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) if cfg.CHECKPOINTER.DIR: if cfg.CHECKPOINTER.DIR[0] is not os.sep: # if the saver_dir is not absolute dir cfg.CHECKPOINTER.DIR = os.path.join(project_dir, cfg.CHECKPOINTER.DIR) else: cfg.CHECKPOINTER.DIR = os.path.join(project_dir, 'log') if not cfg.CHECKPOINTER.NAME: cfg.CHECKPOINTER.NAME = strftime("%Y-%m-%d-%H-%M-%S", localtime()) cfg.freeze() save_dir = os.path.join(cfg.CHECKPOINTER.DIR, cfg.CHECKPOINTER.NAME) mkdir(save_dir) # Init logger logger = setup_logger(cfg.NAME, save_dir, get_rank()) logger.info("Using {} GPU".format(num_gpus)) logger.info(args) logger.info("Collecting env info ...") logger.info("\n" + collect_env_info()) logger.info("Loaded configuration file {}".format(args.config_file)) logger.info("Running with config:\n{}".format(cfg)) output_config_path = os.path.join(save_dir, os.path.basename(args.config_file)) logger.info("Saving config into: {}".format(output_config_path)) # save overloaded model config in the output directory save_config(cfg, output_config_path) train(cfg, args.local_rank, args.distributed) return
def train(cfg, local_rank, distributed): logger = logging.getLogger(cfg.NAME) # build model model = build_model(cfg) device = torch.device(cfg.MODEL.DEVICE) model.to(device) # build solver optimizer = make_optimizer(cfg, model) scheduler = make_lr_scheduler(cfg, optimizer) if distributed: model = DistributedDataParallel( model, device_ids=[local_rank], output_device=local_rank, # this should be removed if we update BatchNorm stats broadcast_buffers=False, ) arguments = {"iteration": 0} save_dir = os.path.join(cfg.CHECKPOINTER.DIR, cfg.CHECKPOINTER.NAME) save_to_disk = get_rank() == 0 checkpointer = Checkpointer( model=model, optimizer=optimizer, scheduler=scheduler, save_dir=save_dir, save_to_disk=save_to_disk, logger=logger ) extra_checkpoint_data = checkpointer.load(cfg.CHECKPOINTER.LOAD_NAME) arguments.update(extra_checkpoint_data) data_loader = make_data_loader( cfg, is_train=True, is_distributed=distributed, start_iter=arguments["iteration"], ) evaluate = cfg.SOLVER.EVALUATE if evaluate: synchronize() data_loader_val = make_data_loader(cfg, is_train=False, is_distributed=distributed, is_for_period=True) synchronize() else: data_loader_val = None save_to_disk = get_rank() == 0 if cfg.SUMMARY_WRITER and save_to_disk: save_dir = os.path.join(cfg.CHECKPOINTER.DIR, cfg.CHECKPOINTER.NAME) summary_writer = make_summary_writer(cfg.SUMMARY_WRITER, save_dir, model_name=cfg.MODEL.NAME) else: summary_writer = None do_train( cfg, model, data_loader, data_loader_val, optimizer, scheduler, checkpointer, device, arguments, summary_writer ) return model
def eval_wdoc(args, example_dir, metadata, sub_trainer, save_fname=None): assert save_fname != None logger.info('Building within doc sparse graphs...') doc_level_graphs = [] per_doc_coref_clusters = [] for doc_clusters in tqdm(metadata.wdoc_clusters.values(), disable=(get_rank() != 0)): per_doc_coref_clusters.append([[x for x in v if x != k] for k, v in doc_clusters.items()]) doc_mentions = np.asarray( [x for k, v in doc_clusters.items() for x in v if x != k]) doc_mentions = np.sort(doc_mentions) doc_level_graphs.append( build_sparse_affinity_graph(args, doc_mentions, example_dir, metadata, None, sub_trainer, build_coref_graph=True, build_linking_graph=True)) logger.info('Done.') # don't need other processes at this point if get_rank() != 0: synchronize() return # build everything needed to compute metrics and compute them! coref_graphs, linking_graphs = [], [] for coref_graph, linking_graph in doc_level_graphs: coref_graphs.append(coref_graph) linking_graphs.append(linking_graph) # build the joint whole graph joint_whole_graph = deepcopy( _merge_sparse_graphs(coref_graphs + linking_graphs)) logger.info('Computing coref metrics...') coref_metrics = compute_coref_metrics(per_doc_coref_clusters, coref_graphs, args.eval_coref_threshold) logger.info('Done.') logger.info('Computing linking metrics...') linking_metrics, slim_linking_graph = compute_linking_metrics( metadata, linking_graphs) logger.info('Done.') ######################################################################## ## FIXME: hacking to get HAC working # #joint_whole_graph = _merge_sparse_graphs(coref_graphs + linking_graphs) #hierarchy_tree = np.full((2*joint_whole_graph.shape[0], 2), -1) #proposed_merges = np.vstack((joint_whole_graph.row, joint_whole_graph.col)).T #def _get_leaves(hierarchy_tree, internal_node): # q = [internal_node] # leaves = [] # while len(q) > 0: # curr_node = q.pop() # left_child = hierarchy_tree[curr_node][0] # right_child = hierarchy_tree[curr_node][1] # if left_child == -1: # assert right_child == -1 # leaves.append(curr_node) # else: # q.append(left_child) # q.append(right_child) # return leaves #def _avg_linkage(joint_whole_graph, leaves_a, leaves_b): # row_mask = np.isin(joint_whole_graph.row, leaves_a)\ # ^ np.isin(joint_whole_graph.row, leaves_b) # col_mask = np.isin(joint_whole_graph.col, leaves_a)\ # ^ np.isin(joint_whole_graph.col, leaves_b) # edge_weights = joint_whole_graph.data[row_mask & col_mask] # if edge_weights.size == 0: # return -np.inf # return np.mean(edge_weights) # #merge_node_id = joint_whole_graph.shape[0] # start with the next possible index #valid_merge_exists = True #count = 0 #while valid_merge_exists: # valid_merge_exists = False # max_linkage = 0.0 # max_a, max_b = None, None # for pair in proposed_merges: # a, b = tuple(pair) # if a == b: # continue # valid_merge_exists = True # leaves_a = _get_leaves(hierarchy_tree, a) # leaves_b = _get_leaves(hierarchy_tree, b) # linkage_score = _avg_linkage(joint_whole_graph, leaves_a, leaves_b) # if linkage_score > max_linkage: # max_a = a # max_b = b # max_linkage = linkage_score # if not valid_merge_exists: # continue # # create new node in the hierarchy with id = `merge_node_id` # hierarchy_tree[merge_node_id][0] = max_a # hierarchy_tree[merge_node_id][1] = max_b # # update all the relevant edges in `proposed_merges` # join_mask = np.isin(proposed_merges, [max_a, max_b]) # proposed_merges[join_mask] = merge_node_id # # increment for next merger # merge_node_id += 1 # count += 1 # print(count) 
    ########################################################################

    logger.info('Computing joint metrics...')
    slim_coref_graph = _get_global_maximum_spanning_tree(coref_graphs)
    joint_metrics = compute_joint_metrics(
        metadata, [slim_coref_graph, slim_linking_graph])
    logger.info('Done.')

    metrics = {
        'coref_fmi': coref_metrics['fmi'],
        'coref_rand_index': coref_metrics['rand_index'],
        'coref_threshold': coref_metrics['threshold'],
        'vanilla_recall': linking_metrics['vanilla_recall'],
        'vanilla_accuracy': linking_metrics['vanilla_accuracy'],
        'joint_accuracy': joint_metrics['joint_accuracy'],
        'joint_cc_recall': joint_metrics['joint_cc_recall']
    }

    # save all of the predictions for later analysis
    save_data = {}
    save_data.update(coref_metrics)
    save_data.update(linking_metrics)
    save_data.update(joint_metrics)
    save_data.update({'metadata': metadata})
    save_data.update({'joint_whole_graph': joint_whole_graph})
    with open(save_fname, 'wb') as f:
        pickle.dump(save_data, f)

    synchronize()
    return metrics
def do_train( cfg, model, train_dataloader, val_dataloader, optimizer, lr_scheduler, checkpointer, device, checkpoint_period, test_period, log_period, arguments, ): logger = logging.getLogger("EfficientDet.trainer") logger.info("Start training") meters = MetricLogger(delimiter=" ") max_iter = len(train_dataloader) start_iter = arguments["iteration"] model.train() start_training_time = time.time() end = time.time() for iteration, (images, targets, _) in enumerate(train_dataloader, start_iter): data_time = time.time() - end iteration = iteration + 1 arguments["iteration"] = iteration images = images.to(device) targets = targets.to(device) loss_dict = model(images, targets) losses = sum(loss for loss in loss_dict.values()) # reduce losses over all GPUs for logging purposes loss_dict_reduced = reduce_loss_dict(loss_dict) losses_reduced = sum(loss for loss in loss_dict_reduced.values()) meters.update(loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() # Note: If mixed precision is not used, this ends up doing nothing # Otherwise apply loss scaling for mixed-precision recipe with amp.scale_loss(losses, optimizer) as scaled_losses: scaled_losses.backward() # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1) optimizer.step() # lr_scheduler.step(losses_reduced) lr_scheduler.step() batch_time = time.time() - end end = time.time() meters.update(time=batch_time, data=data_time) eta_seconds = meters.time.global_avg * (max_iter - iteration) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) if iteration % log_period == 0 or iteration == max_iter: logger.info( meters.delimiter.join([ "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ]).format( eta=eta_string, iter=iteration, meters=str(meters), lr=optimizer.param_groups[0]["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, )) if iteration % checkpoint_period == 0: checkpointer.save("model_{:07d}".format(iteration), **arguments) if val_dataloader is not None and ( (test_period > 0 and iteration % test_period == 0) or iteration == max_iter): meters_val = MetricLogger(delimiter=" ") synchronize() map_05_09 = do_infer( # The result can be used for additional logging, e. g. for TensorBoard model, val_dataloader, dataset_name="[Validation]", device=cfg.device, output_folder=None, ) logger.info("Validation MAP 0.5:0.9 ===> {}".format(map_05_09)) synchronize() model.train() if iteration == max_iter: checkpointer.save("model_final", **arguments) total_training_time = time.time() - start_training_time total_time_str = str(datetime.timedelta(seconds=total_training_time)) logger.info("Total training time: {} ({:.4f} s / it)".format( total_time_str, total_training_time / (max_iter)))
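# `do_train` above averages its loss dict across GPUs for logging through a
# `reduce_loss_dict` helper that is not included in this collection. A minimal
# sketch of such a helper -- a common pattern, assuming torch.distributed is
# initialized whenever more than one process is running, and not necessarily
# the authors' exact implementation:

import torch
import torch.distributed as dist


def reduce_loss_dict(loss_dict):
    """Average a dict of scalar loss tensors over all processes.

    Only rank 0 receives the averaged values; other ranks keep partial sums.
    """
    world_size = dist.get_world_size() if dist.is_initialized() else 1
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        names = sorted(loss_dict.keys())
        values = torch.stack([loss_dict[k] for k in names], dim=0)
        dist.reduce(values, dst=0)
        if dist.get_rank() == 0:
            values /= world_size
        return {k: v for k, v in zip(names, values)}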
def initialize_exp(args, logger_filename='train.log'): """ Initialize the experiment: - dump parameters - create a logger - set the random seed - setup distributed computation """ # setup cuda using torch's distributed framework setup_cuda_and_distributed(args) # random seed set_seed(args) # don't overwrite previous output directory if (os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir): raise ValueError("Output directory ({}) already exists and is not " "empty. Use --overwrite_output_dir " "to overcome.".format(args.output_dir)) # create output directory and dump parameters if get_rank() == 0: # create output directory if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) time.sleep(3) # args file prefix if args.do_train: prefix = "train" elif args.do_train_eval: prefix = "train_eval" elif args.do_val: prefix = "val" elif args.do_test: prefix = "test" else: raise ValueError("No valid train or validation mode selected") args_file = prefix + "_args.pkl" pickle.dump(args, open(os.path.join(args.output_dir, args_file), "wb")) synchronize() # get running command command = ["python", sys.argv[0]] for x in sys.argv[1:]: if x.startswith('--'): assert '"' not in x and "'" not in x command.append(x) else: assert "'" not in x command.append("'%s'" % x) command = ' '.join(command) args.command = command # create a logger logger = create_logger(args, logger_filename) logger.info('============ Initialized logger ============') logger.info('\n'.join( ['%s: %s' % (k, str(v)) for k, v in sorted(dict(vars(args)).items())])) logger.info('The experiment will be stored in %s\n' % args.output_dir) logger.info('Running command: %s\n' % args.command) return logger
def inference(model, criterion, data_loader, dataset_name, save_result=False): logger = logging.getLogger('eve.' + __name__) device = torch.device('cuda') dataset = data_loader.dataset logger.info("Start evaluation on {} dataset ({} point clouds).".format( dataset_name, len(dataset))) if get_world_size() == 1: extra_args = {} else: rank = get_rank() extra_args = dict(desc="rank {}".format(rank)) start_time = time.time() model.eval() outputs_per_gpu = {} targets_per_gpu = {} file_path_per_gpu = {} times = [] with torch.no_grad(): for batch in tqdm(data_loader, **extra_args): locs, feats, targets, metadata = batch inputs = ME.SparseTensor(feats, coords=locs).to(device) targets = targets.to(device, non_blocking=True).long() torch.cuda.synchronize() start_time = time.time() outputs = model(inputs, y=targets) torch.cuda.synchronize() end_time = time.time() times.append(end_time - start_time) arch = cfg.MODEL.ARCHITECTURE if arch == 'minkunet4d' or arch == 'minkunet_eve': for batch_idx in range(len(metadata)): for time_idx in range(cfg.INPUT.VIDEO.NUM_FRAMES): inv_map = metadata[batch_idx][time_idx]['inverse_map'] file_path = metadata[batch_idx][time_idx]['file_path'] locs_frame = (locs[:, -1] == batch_idx) & \ (locs[:, -2] == time_idx) one_output, one_target = compute_one_frame( outputs, targets, locs_frame, inv_map) outputs_per_gpu[file_path] = one_output targets_per_gpu[file_path] = one_target file_path_per_gpu[file_path] = file_path else: # other minknet for batch_idx in range(len(metadata)): inv_map = metadata[batch_idx]['inverse_map'] file_path = metadata[batch_idx]['file_path'] # From MinkowskiEngine v0.3, batch index is on the first column locs_frame = locs[:, -1] == batch_idx one_output, one_target = compute_one_frame( outputs, targets, locs_frame, inv_map) outputs_per_gpu[file_path] = one_output targets_per_gpu[file_path] = one_target file_path_per_gpu[file_path] = file_path synchronize() logger.info("Total inference time: {}".format(np.sum(times))) # NOTE: `all_gather` will lead to CUDA out of memory # We use `scatter_gather` to save result of each process # in LOGS.DIR/tmp and will be cleared after gathering. outputs = scatter_gather(outputs_per_gpu) targets = scatter_gather(targets_per_gpu) file_paths = scatter_gather(file_path_per_gpu) if not is_main_process(): return None all_outputs = {k: v.numpy() for o in outputs for k, v in o.items()} all_targets = {k: v.numpy() for t in targets for k, v in t.items()} all_file_paths = {k: v for f in file_paths for k, v in f.items()} assert len(all_outputs) == len(dataset.all_files), \ '%d vs %d' % (len(all_outputs), len(dataset.all_files)) if cfg.LOGS.SAVE_RESULT is False: all_file_paths = None metrics = evaluate(dataset, all_outputs, all_targets, all_file_paths) return metrics
def _train_triplet(self, dataset_list, metadata): args = self.args losses = [] time_per_dataset = [] dataset_sizes = [] pos_m_neg_m_losses = [] pos_m_neg_e_losses = [] pos_e_neg_m_losses = [] pos_e_neg_e_losses = [] self.model.train() self.model.zero_grad() for dataset in dataset_list: _dataset_start_time = time.time() dataset_sizes.append(len(dataset)) dataloader = TripletEmbeddingDataLoader(args, dataset) for batch in dataloader: batch = tuple(t.to(args.device) for t in batch) inputs = { 'input_ids': batch[1], 'attention_mask': batch[2], 'token_type_ids': batch[3], 'concat_input': False } outputs = self.model(**inputs) pos_neg_dot_prods = torch.sum(outputs[:, 0:1, :] * outputs[:, 1:, :], dim=-1) if args.training_method == 'triplet_max_margin': # max-margin per_triplet_loss = F.relu( pos_neg_dot_prods[:, 1] # negative dot products - pos_neg_dot_prods[:, 0] # positive dot products + args.margin) elif args.training_method == 'triplet_bpr': # BPR per_triplet_loss = torch.sigmoid( pos_neg_dot_prods[:, 1] # negative dot products - pos_neg_dot_prods[:, 0] # positive dot products + args.margin) else: raise ValueError('unsupported training_method') # record triplet specific losses _detached_per_triplet_loss = per_triplet_loss.clone().detach( ).cpu() _mask = batch[0] < metadata.num_entities pos_m_neg_m_mask = ~_mask[:, 1] & ~_mask[:, 2] pos_m_neg_e_mask = ~_mask[:, 1] & _mask[:, 2] pos_e_neg_m_mask = _mask[:, 1] & ~_mask[:, 2] pos_e_neg_e_mask = _mask[:, 1] & _mask[:, 2] pos_m_neg_m_losses.extend( _detached_per_triplet_loss[pos_m_neg_m_mask].numpy( ).tolist()) pos_m_neg_e_losses.extend( _detached_per_triplet_loss[pos_m_neg_e_mask].numpy( ).tolist()) pos_e_neg_m_losses.extend( _detached_per_triplet_loss[pos_e_neg_m_mask].numpy( ).tolist()) pos_e_neg_e_losses.extend( _detached_per_triplet_loss[pos_e_neg_e_mask].numpy( ).tolist()) loss = torch.mean(per_triplet_loss) loss.backward() torch.nn.utils.clip_grad_norm_(self.model.parameters(), args.max_grad_norm) self.optimizer.step() self.scheduler.step() self.model.zero_grad() time_per_dataset.append(time.time() - _dataset_start_time) gathered_data = all_gather({ 'pos_m_neg_m_losses': pos_m_neg_m_losses, 'pos_m_neg_e_losses': pos_m_neg_e_losses, 'pos_e_neg_m_losses': pos_e_neg_m_losses, 'pos_e_neg_e_losses': pos_e_neg_e_losses }) if get_rank() == 0: pos_m_neg_m_losses = flatten( [d['pos_m_neg_m_losses'] for d in gathered_data]) pos_m_neg_e_losses = flatten( [d['pos_m_neg_e_losses'] for d in gathered_data]) pos_e_neg_m_losses = flatten( [d['pos_e_neg_m_losses'] for d in gathered_data]) pos_e_neg_e_losses = flatten( [d['pos_e_neg_e_losses'] for d in gathered_data]) losses = pos_m_neg_m_losses + pos_m_neg_e_losses + pos_e_neg_m_losses + pos_e_neg_e_losses pos_m_neg_m_loss = 0.0 if len( pos_m_neg_m_losses) == 0 else np.mean(pos_m_neg_m_losses) pos_m_neg_e_loss = 0.0 if len( pos_m_neg_e_losses) == 0 else np.mean(pos_m_neg_e_losses) pos_e_neg_m_loss = 0.0 if len( pos_e_neg_m_losses) == 0 else np.mean(pos_e_neg_m_losses) pos_e_neg_e_loss = 0.0 if len( pos_e_neg_e_losses) == 0 else np.mean(pos_e_neg_e_losses) loss = np.mean(losses) synchronize() return { 'embed_loss': loss, 'embed_num_examples': len(losses), 'embed_pos_m_neg_m_loss': pos_m_neg_m_loss, 'embed_pos_m_neg_e_loss': pos_m_neg_e_loss, 'embed_pos_e_neg_m_loss': pos_e_neg_m_loss, 'embed_pos_e_neg_e_loss': pos_e_neg_e_loss, 'embed_pos_m_neg_m_num_examples': len(pos_m_neg_m_losses), 'embed_pos_m_neg_e_num_examples': len(pos_m_neg_e_losses), 'embed_pos_e_neg_m_num_examples': len(pos_e_neg_m_losses), 
            'embed_pos_e_neg_e_num_examples': len(pos_e_neg_e_losses)
        }
    else:
        synchronize()
        return None
def train(self): args = self.args # set up data structures for choosing available negatives on-the-fly if args.clustering_domain == 'within_doc': self._neg_choosing_prep() else: raise NotImplementedError('xdoc not implemented yet') global_step = 0 log_return_dicts = [] logger.info('Starting training...') batch = None for epoch in range(args.num_train_epochs): logger.info( '********** [START] epoch: {} **********'.format(epoch)) num_batches = None if get_rank() == 0: data_iterator = iter(self.train_dataloader) num_batches = len(data_iterator) num_batches = broadcast(num_batches, src=0) logger.info('num_batches: {}'.format(num_batches)) for _ in trange(num_batches, desc='Epoch: {} - Batches'.format(epoch), disable=(get_rank() != 0 or args.disable_logging)): ### FIXME: hack for hyperparameter scheduling #if global_step > 400: # args.training_edges_considered = 'all' #if global_step % 200 == 199: # if get_rank() == 0: # self.embed_sub_trainer.save_model(global_step) # synchronize() # val_metrics = self.evaluate( # split='val', # suffix='checkpoint-{}'.format(global_step) # ) # synchronize() # exit() # get batch from rank0 and broadcast it to the other processes if get_rank() == 0: try: next_batch = next(data_iterator) # make sure the cluster_mx is sorted correctly _row, _col, _data = [], [], [] current_row = 0 ctr = 0 for r, d in sorted(zip(next_batch.row, next_batch.data)): if current_row != r: current_row = r ctr = 0 _row.append(r) _col.append(ctr) _data.append(d) ctr += 1 next_batch = coo_matrix((_data, (_row, _col)), shape=next_batch.shape) negs = self._choose_negs(next_batch) batch = (next_batch, negs) except StopIteration: batch = None batch = broadcast(batch, src=0) if batch is None: break # run train_step log_return_dicts.append(self.train_step(batch)) global_step += 1 # logging stuff for babysitting if global_step % args.logging_steps == 0: avg_return_dict = reduce(dict_merge_with, log_return_dicts) for stat_name, stat_value in avg_return_dict.items(): logger.info('Average %s: %s at global step: %s', stat_name, str(stat_value / args.logging_steps), str(global_step)) logger.info('Using {} edges for training'.format( args.training_edges_considered)) log_return_dicts = [] # refresh the knn index if args.knn_refresh_steps > 0 and global_step % args.knn_refresh_steps == 0: logger.info('Refreshing kNN index...') self.train_knn_index.refresh_index() logger.info('Done.') # save the model at the end of every epoch if get_rank() == 0: #self.embed_sub_trainer.save_model(global_step) self.concat_sub_trainer.save_model(global_step) synchronize() logger.info('********** [END] epoch: {} **********'.format(epoch)) # run full evaluation at the end of each epoch #if args.evaluate_during_training and epoch % 10 == 9: if args.evaluate_during_training: if args.do_train_eval: train_eval_metrics = self.evaluate( split='train', suffix='checkpoint-{}'.format(global_step)) if args.do_val: val_metrics = self.evaluate( split='val', suffix='checkpoint-{}'.format(global_step))
def compute_on_dataset_2stage(model, data_loader, device, logger): # two stage inference, for model with memory features. # first extract features and then do the inference cpu_device = torch.device("cpu") num_devices = get_world_size() dataset = data_loader.dataset if num_devices == 1: extra_args = {} else: rank = get_rank() extra_args = dict(desc="rank {}".format(rank)) loader_len = len(data_loader) person_feature_pool = MemoryPool() batch_info_list = [None]*loader_len logger.info("Stage 1: extracting clip features.") start_time = time.time() for i, batch in enumerate(tqdm(data_loader, **extra_args)): slow_clips, fast_clips, boxes, objects, extras, video_ids = batch slow_clips = slow_clips.to(device) fast_clips = fast_clips.to(device) boxes = [box.to(device) for box in boxes] objects = [None if (box is None) else box.to(device) for box in objects] movie_ids = [e["movie_id"] for e in extras] timestamps = [e["timestamp"] for e in extras] with torch.no_grad(): feature = model(slow_clips, fast_clips, boxes, objects, part_forward=0) person_feature = [ft.to(cpu_device) for ft in feature[0]] object_feature = [ft.to(cpu_device) for ft in feature[1]] # store person features into memory pool for movie_id, timestamp, p_ft, o_ft in zip(movie_ids, timestamps, person_feature, object_feature): person_feature_pool[movie_id, timestamp] = p_ft # store other information in list, for further inference batch_info_list[i] = (movie_ids, timestamps, video_ids, object_feature) # gather feature pools from different ranks synchronize() total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=total_time)) logger.info( "Stage 1 time: {} ({} s / video per device, on {} devices)".format( total_time_str, total_time * num_devices / len(dataset), num_devices ) ) feature_pool = all_gather(person_feature_pool) all_feature_pool_p = MemoryPool() all_feature_pool_p.update_list(feature_pool) del feature_pool, person_feature_pool # do the inference results_dict = {} logger.info("Stage 2: predicting with extracted feature.") start_time = time.time() for movie_ids, timestamps, video_ids, object_feature in tqdm(batch_info_list, **extra_args): current_feat_p = [all_feature_pool_p[movie_id, timestamp].to(device) for movie_id, timestamp in zip(movie_ids, timestamps)] current_feat_o = [ft_o.to(device) for ft_o in object_feature] extras = dict( person_pool=all_feature_pool_p, movie_ids=movie_ids, timestamps=timestamps, current_feat_p=current_feat_p, current_feat_o=current_feat_o, ) with torch.no_grad(): output = model(None, None, None, None, extras=extras, part_forward=1) output = [o.to(cpu_device) for o in output] results_dict.update( {video_id: result for video_id, result in zip(video_ids, output)} ) synchronize() total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=total_time)) logger.info( "Stage 2 time: {} ({} s / video per device, on {} devices)".format( total_time_str, total_time * num_devices / len(dataset), num_devices ) ) return results_dict
def eval_worker(gpu, ngpus_per_node, args): args.gpu = gpu # local rank, local machine cuda id args.local_rank = args.gpu args.batch_size = args.batch_size_per_gpu global_rank = args.gpu + args.machine_rank * ngpus_per_node dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=global_rank) # Setup logging format. logging.setup_logging("stdout.log", 'w') # synchronize is needed here to prevent a possible timeout after calling # init_process_group # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172 comm.synchronize() args.rank = comm.get_rank() # global rank torch.cuda.set_device(args.gpu) random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) # build the supernet logger.info("=> creating model '{}'".format(args.arch)) model = models.model_factory.create_model(args) model.cuda(args.gpu) model = comm.get_parallel_model(model, args.gpu) #local rank # define loss function (criterion) criterion = nn.CrossEntropyLoss().cuda() ## load dataset, train_sampler: distributed train_loader, val_loader, train_sampler = build_data_loader(args) assert args.resume #reloading model model.module.load_weights_from_pretrained_models(args.resume) if train_sampler: train_sampler.set_epoch(0) targeted_min_flops = args.evo_search.targeted_min_flops targeted_max_flops = args.evo_search.targeted_max_flops # run evolutionary search parent_popu = [] for idx in range(args.evo_search.parent_popu_size): if idx == 0: cfg = model.module.sample_min_subnet() else: cfg = model.module.sample_active_subnet_within_range( targeted_min_flops, targeted_max_flops) cfg['net_id'] = f'net_{idx % args.world_size}_evo_0_{idx}' parent_popu.append(cfg) pareto_global = {} for evo in range(args.evo_search.evo_iter): # partition the set of candidate sub-networks # and send them to each GPU for parallel evaluation # sub-networks to be evaluated on GPU {args.rank} my_subnets_to_be_evaluated = {} n_evaluated = len(parent_popu) // args.world_size * args.world_size for cfg in parent_popu[:n_evaluated]: if cfg['net_id'].startswith(f'net_{args.rank}_'): my_subnets_to_be_evaluated[cfg['net_id']] = cfg # aggregating all evaluation results eval_results = attentive_nas_eval.validate( my_subnets_to_be_evaluated, train_loader, val_loader, model, criterion, args, logger, ) # update the Pareto frontier # in this case, we search the best FLOPs vs. 
        # accuracy trade-offs
        for cfg in eval_results:
            f = round(cfg['flops'] / args.evo_search.step) * args.evo_search.step
            if f not in pareto_global or pareto_global[f]['acc1'] < cfg['acc1']:
                pareto_global[f] = cfg

        # next batch of sub-networks to be evaluated
        parent_popu = []

        # mutate
        for idx in range(args.evo_search.mutate_size):
            while True:
                old_cfg = random.choice(list(pareto_global.values()))
                cfg = model.module.mutate_and_reset(
                    old_cfg, prob=args.evo_search.mutate_prob)
                flops = model.module.compute_active_subnet_flops()
                if flops >= targeted_min_flops and flops <= targeted_max_flops:
                    break
            cfg['net_id'] = f'net_{idx % args.world_size}_evo_{evo}_mutate_{idx}'
            parent_popu.append(cfg)

        # cross over
        for idx in range(args.evo_search.crossover_size):
            while True:
                cfg1 = random.choice(list(pareto_global.values()))
                cfg2 = random.choice(list(pareto_global.values()))
                cfg = model.module.crossover_and_reset(cfg1, cfg2)
                flops = model.module.compute_active_subnet_flops()
                if flops >= targeted_min_flops and flops <= targeted_max_flops:
                    break
            cfg['net_id'] = f'net_{idx % args.world_size}_evo_{evo}_crossover_{idx}'
            parent_popu.append(cfg)
def main_worker(gpu, ngpus_per_node, args): args.gpu = gpu # local rank, local machine cuda id args.local_rank = args.gpu args.batch_size = args.batch_size_per_gpu args.batch_size_total = args.batch_size * args.world_size #rescale base lr args.lr_scheduler.base_lr = args.lr_scheduler.base_lr * (max( 1, args.batch_size_total // 256)) # set random seed, make sure all random subgraph generated would be the same random.seed(args.seed) torch.manual_seed(args.seed) if args.gpu: torch.cuda.manual_seed(args.seed) global_rank = args.gpu + args.machine_rank * ngpus_per_node dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=global_rank) # Setup logging format. logging.setup_logging(args.logging_save_path, 'w') logger.info( f"Use GPU: {args.gpu}, machine rank {args.machine_rank}, num_nodes {args.num_nodes}, \ gpu per node {ngpus_per_node}, world size {args.world_size}" ) # synchronize is needed here to prevent a possible timeout after calling # init_process_group # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172 comm.synchronize() args.rank = comm.get_rank() # global rank args.local_rank = args.gpu torch.cuda.set_device(args.gpu) # build model logger.info("=> creating model '{}'".format(args.arch)) model = models.model_factory.create_model(args) model.cuda(args.gpu) # use sync batchnorm if getattr(args, 'sync_bn', False): model.apply(lambda m: setattr(m, 'need_sync', True)) model = comm.get_parallel_model(model, args.gpu) #local rank logger.info(model) criterion = loss_ops.CrossEntropyLossSmooth(args.label_smoothing).cuda( args.gpu) soft_criterion = loss_ops.AdaptiveLossSoft(args.alpha_min, args.alpha_max, args.iw_clip).cuda(args.gpu) if not getattr(args, 'inplace_distill', True): soft_criterion = None ## load dataset, train_sampler: distributed train_loader, val_loader, train_sampler = build_data_loader(args) args.n_iters_per_epoch = len(train_loader) logger.info(f'building optimizer and lr scheduler, \ local rank {args.gpu}, global rank {args.rank}, world_size {args.world_size}' ) optimizer = build_optimizer(args, model) lr_scheduler = build_lr_scheduler(args, optimizer) # optionally resume from a checkpoint if args.resume: saver.load_checkpoints(args, model, optimizer, lr_scheduler, logger) logger.info(args) for epoch in range(args.start_epoch, args.epochs): if args.distributed: train_sampler.set_epoch(epoch) args.curr_epoch = epoch logger.info('Training lr {}'.format(lr_scheduler.get_lr()[0])) # train for one epoch acc1, acc5 = train_epoch(epoch, model, train_loader, optimizer, criterion, args, \ soft_criterion=soft_criterion, lr_scheduler=lr_scheduler) if comm.is_master_process() or args.distributed: # validate supernet model validate(train_loader, val_loader, model, criterion, args) if comm.is_master_process(): # save checkpoints saver.save_checkpoint( args.checkpoint_save_path, model, optimizer, lr_scheduler, args, epoch, )
def train(): torch.cuda.set_device(args.local_rank) dist.init_process_group(backend="nccl", init_method="env://") synchronize() # create dataloader & network & optimizer model, model_fn_decorator = create_model(cfg) init_weights(model, init_type='kaiming') # model.to('cuda') model.cuda() model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) root_result_dir = args.output_dir os.makedirs(root_result_dir, exist_ok=True) log_file = os.path.join(root_result_dir, "log_train.txt") logger = create_logger(log_file, get_rank()) logger.info("**********************Start logging**********************") # log to file gpu_list = os.environ[ 'CUDA_VISIBLE_DEVICES'] if 'CUDA_VISIBLE_DEVICES' in os.environ.keys( ) else 'ALL' logger.info("CUDA_VISIBLE_DEVICES=%s" % gpu_list) for key, val in vars(args).items(): logger.info("{:16} {}".format(key, val)) logger.info("***********************config infos**********************") for key, val in vars(cfg).items(): logger.info("{:16} {}".format(key, val)) # log tensorboard if get_rank() == 0: tb_log = SummaryWriter( log_dir=os.path.join(root_result_dir, "tensorboard")) else: tb_log = None train_loader, test_loader = create_dataloader(logger) optimizer = create_optimizer(model) # load checkpoint if it is possible start_epoch = it = best_res = 0 last_epoch = -1 if args.ckpt is not None: pure_model = model.module if isinstance( model, (torch.nn.DataParallel, torch.nn.parallel.DistributedDataParallel)) else model it, start_epoch, best_res = load_checkpoint(pure_model, optimizer, args.ckpt, logger) last_epoch = start_epoch + 1 lr_scheduler = create_scheduler(model, optimizer, total_steps=len(train_loader) * args.epochs, last_epoch=last_epoch) if cfg.DATASET.DF_USED: criterion = Total_loss(boundary=cfg.DATASET.BOUNDARY) else: criterion = nn.CrossEntropyLoss() # start training logger.info('**********************Start training**********************') ckpt_dir = os.path.join(root_result_dir, "ckpt") os.makedirs(ckpt_dir, exist_ok=True) trainer = train_utils.Trainer(model, model_fn=model_fn_decorator(), criterion=criterion, optimizer=optimizer, ckpt_dir=ckpt_dir, lr_scheduler=lr_scheduler, model_fn_eval=model_fn_decorator(), tb_log=tb_log, logger=logger, eval_frequency=1, grad_norm_clip=cfg.TRAIN.GRAD_NORM_CLIP, cfg=cfg) trainer.train(start_it=it, start_epoch=start_epoch, n_epochs=args.epochs, train_loader=train_loader, test_loader=test_loader, ckpt_save_interval=args.ckpt_save_interval, lr_scheduler_each_iter=False, best_res=best_res) logger.info('**********************End training**********************')