def eval(self, iteration=-1, summary_writer=None):
    start = time.time()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        # self.derenderer.eval()
        val_metrics = MetricLogger(delimiter=" ")
        val_loss_logger = MetricLogger(delimiter=" ")
        for i, inputs in enumerate(self.val_loader, iteration):
            if torch.cuda.is_available():
                inputs = to_cuda(inputs)
            output = self.derenderer(inputs)
            loss_dict = gather_loss_dict(output)
            val_loss_logger.update(**loss_dict)
            if summary_writer is not None:
                summary_writer.add_scalars("val_non_smooth",
                                           val_loss_logger.last_item, i)
            all_preds.append({k: v.cpu().numpy()
                              for k, v in output["output"].items()})
            all_labels.append({k: v.cpu().numpy()
                               for k, v in inputs["attributes"].items()})
            if time.time() - start > self.cfg.SOLVER.VALIDATION_MAX_SECS:
                break
        # Concatenate the per-batch dicts into one array per attribute.
        all_preds, all_labels = map(
            lambda l: {k: np.concatenate([a[k] for a in l]) for k in l[0].keys()},
            [all_preds, all_labels])
        err_dict = self.attributes.pred_error(all_preds, all_labels)
        val_metrics.update(**err_dict)
        log.info(val_metrics.delimiter.join(["VALIDATION", "iter: {iter}", "{meters}"])
                 .format(iter=iteration, meters=str(val_metrics)))
        log.info(val_loss_logger.delimiter.join(["VALIDATION", "iter: {iter}", "{meters}"])
                 .format(iter=iteration, meters=str(val_loss_logger)))
        if summary_writer is not None:
            summary_writer.add_scalars("val_error", val_metrics.mean, iteration)
            summary_writer.add_scalars("val", val_loss_logger.mean, iteration)
        # self.derenderer.train()
    return err_dict
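
# The trainers below lean on MetricLogger's `update`, `last_item`, `mean`, and
# `delimiter` members. The real class lives elsewhere in the repo; this is a
# minimal sketch of the assumed interface (windowed scalar tracking), kept
# only to make the call sites above and below easy to follow.
from collections import defaultdict, deque

class MetricLoggerSketch:
    """Hypothetical stand-in: tracks a running window per scalar metric."""

    def __init__(self, delimiter=" ", window=20):
        self.delimiter = delimiter
        self._series = defaultdict(lambda: deque(maxlen=window))

    def update(self, **metrics):
        for name, value in metrics.items():
            # Accept both python floats and 0-dim torch tensors.
            value = value.item() if hasattr(value, "item") else float(value)
            self._series[name].append(value)

    @property
    def last_item(self):
        return {name: vals[-1] for name, vals in self._series.items()}

    @property
    def mean(self):
        return {name: sum(vals) / len(vals)
                for name, vals in self._series.items()}

    def __str__(self):
        means = self.mean
        return self.delimiter.join(
            "{}: {:.4f}".format(name, means[name]) for name in self._series)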
def compute_mask(self, attributes, term):
    attributes = to_cuda(attributes)
    if term in self.valid_map:
        all_masks = []
        for val_term, valid_els in self.valid_map[term].items():
            # Look up the category map for the conditioning attribute,
            # e.g. self.shape_map when val_term == "shape".
            categories_map = getattr(self, "{}_map".format(val_term))
            valid_categories = torch.LongTensor(
                [categories_map[el] for el in valid_els])
            val_vector = attributes[val_term].view(-1, 1).repeat(
                1, len(valid_categories)).cuda()
            valid_categories = valid_categories.repeat(len(val_vector), 1).cuda()
            # An entry is valid iff it matches any of the allowed categories.
            mask = ((val_vector - valid_categories).abs().min(
                dim=1).values == 0).float()
            all_masks.append(mask)
        # `term` is only supervised where every conditioning mask agrees.
        return reduce(lambda x, y: x * y, all_masks)
    else:
        return torch.ones(len(attributes[term]), dtype=torch.float).cuda()
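
# A hedged usage sketch for compute_mask. The attribute names and category
# values below are illustrative, not taken from the real config; they only
# show the expected shapes: one float per object in the batch, 1.0 where the
# conditioning attribute makes `term` meaningful.
#
# Assumed setup on the owning object:
#   self.valid_map = {"radius": {"shape": ["sphere", "cylinder"]}}
#   self.shape_map = {"cube": 0, "sphere": 1, "cylinder": 2}
#
# Then, for a batch whose shapes are [cube, sphere, cylinder]:
#   attributes = {"shape": torch.LongTensor([0, 1, 2]),
#                 "radius": torch.rand(3)}
#   mask = self.compute_mask(attributes, "radius")
#   # mask -> tensor([0., 1., 1.]): the radius loss is ignored for the cube,
#   # which has no radius.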
def write_with_inferred_attributes(cfg, split, attributes_key):
    timer = CodeTimer(
        "adding inferred attributes split:{}, attributes_key:{}".format(
            split, attributes_key))
    module_cfg = os.path.join(cfg.TRAINED_DERENDER.EXP_DIR, "cfg.yaml")
    module_cfg = load_cfg_from_file(module_cfg)
    module_cfg.MODEL.WEIGHTS = cfg.TRAINED_DERENDER.ATTRIBUTES_WEIGHTS_MAP[
        attributes_key]
    module_cfg.DATALOADER.OBJECTS_PER_BATCH = \
        1000 if cfg.BASE_NAME == "intphys" else 450
    module_cfg.DATALOADER.NUM_WORKERS = \
        8 if cfg.BASE_NAME == "adept" else module_cfg.DATALOADER.NUM_WORKERS
    if cfg.DEBUG:
        module_cfg.DATALOADER.NUM_WORKERS = 0
        module_cfg.DEBUG = True
        module_cfg.DATALOADER.OBJECTS_PER_BATCH = 50
    predictor = DerenderPredictor(module_cfg)
    # if not cfg.DEBUG:
    #     gpu_ids = [_ for _ in range(torch.cuda.device_count())]
    #     predictor.derenderer = torch.nn.parallel.DataParallel(
    #         predictor.derenderer, gpu_ids)
    dataset_name, standard_format_json_file = get_dataset_name_and_json(
        cfg, split)
    dataset = DatasetCatalog.get(dataset_name)
    required_fields = ["pred_box"] \
        if cfg.TRAINED_DERENDER.USE_INFERRED_BOXES else ["bbox"]
    filtered_idx, mapped_dataset = image_based_to_annotation_based(
        dataset, required_fields)
    mapped_dataset = DatasetFromList(mapped_dataset, copy=False)
    mapper = DerenderMapper(cfg.TRAINED_DERENDER.USE_INFERRED_BOXES,
                            predictor.attributes,
                            for_inference=True,
                            use_depth=cfg.TRAINED_DERENDER.USE_DEPTH)
    mapped_dataset = MapDataset(mapped_dataset, mapper)
    data_loader = DataLoader(dataset=mapped_dataset,
                             batch_size=module_cfg.DATALOADER.OBJECTS_PER_BATCH,
                             num_workers=module_cfg.DATALOADER.NUM_WORKERS,
                             shuffle=False)
    fil_pointer = 0
    with torch.no_grad():
        for inputs in data_loader:
            inputs = to_cuda(inputs)
            outputs = predictor(inputs)
            batch_size = next(iter(outputs.values())).shape[0]
            # Write each object's predicted attributes back into its annotation.
            for oix, (img_idx, an_idx) in zip(
                    range(batch_size),
                    filtered_idx[fil_pointer:fil_pointer + batch_size]):
                dataset[img_idx]["annotations"][an_idx][attributes_key] = \
                    {k: v[oix].item() for k, v in outputs.items()}
                # {k: v[oix].item() if v[oix].size == 1
                #  else [float(el) for el in v[oix]]
                #  for k, v in outputs.items()}
            fil_pointer += batch_size
    dataset = [fix_for_serialization(d) for d in dataset]
    with open(standard_format_json_file, "w") as f:
        json.dump(dataset, f, indent=4)
    timer.done()
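
# Hedged usage sketch: the split and key names below are illustrative. After
# the call, every annotation in the split's standard-format JSON gains an
# `attributes_key` entry holding the derenderer's scalar predictions, ready
# to be consumed by downstream dynamics/physics modules.
#
#   write_with_inferred_attributes(cfg, split="val",
#                                  attributes_key="inferred_attributes")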
def train(self, log_flag=True):
    train_metrics = MetricLogger(delimiter=" ")
    summary_writer = SummaryWriter(log_dir=os.path.join(self.output_dir, "summary"))
    self.derenderer.train()
    timers = create_new_timer()  # initialize wall-clock timers
    done = False
    while not done:
        for iteration, inputs in enumerate(self.train_loader, self.start_iteration):
            iter_time = time.time()
            data_time = iter_time - timers.batch
            if torch.cuda.is_available():
                inputs = to_cuda(inputs)
            output = self.derenderer(inputs)
            loss_dict = gather_loss_dict(output)
            loss = loss_dict['loss']
            # loss = sum([loss_dict[term] for term in ['x', 'y', 'z']])
            if torch.isnan(loss).any():
                raise Nan_Exception()
            train_metrics.update(**loss_dict)
            summary_writer.add_scalars("train_non_smooth",
                                       train_metrics.last_item, iteration)
            batch_time = iter_time - timers.batch
            timers.batch = iter_time
            train_metrics.update(time=batch_time, data=data_time)
            eta_seconds = timers.start + self.cfg.SOLVER.MAX_TIME_SECS - iter_time
            eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
            if (iter_time - timers.log > self.cfg.SOLVER.PRINT_METRICS_TIME
                    and log_flag):
                timers.log = iter_time
                log.info(train_metrics.delimiter.join(
                    ["eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}",
                     "max mem: {memory:.0f}"]).format(
                        eta=eta_string,
                        iter=iteration,
                        meters=str(train_metrics),
                        lr=self.optimizer.param_groups[0]["lr"],
                        memory=proc_id.memory_info().rss / 1e9))
                summary_writer.add_scalars("train", train_metrics.mean, iteration)
            if iter_time - timers.checkpoint > self.cfg.SOLVER.CHECKPOINT_SECS:
                # formerly: iteration % checkpoint_period == 0
                timers.checkpoint = iter_time
                self.checkpointer.save("model_{:07d}".format(iteration))
            if (iter_time - timers.tensorboard > self.cfg.SOLVER.TENSORBOARD_SECS
                    or self.cfg.DEBUG):
                timers.tensorboard = iter_time
                summary_writer.add_scalars("train", train_metrics.mean, iteration)
            if iter_time - timers.start > self.cfg.SOLVER.MAX_TIME_SECS:
                log.info("finished training loop in {}".format(iter_time - timers.start))
                done = True
                break
            if iter_time - timers.validation > self.cfg.SOLVER.VALIDATION_SECS:
                err_dict = self.eval(iteration, summary_writer)
                timers.validation = time.time()
            loss.backward()
            self.optimizer.step()
            self.scheduler.step()
            self.optimizer.zero_grad()
        log.info("******* epoch done after {} *********".format(
            time.time() - timers.epoch))
        timers.epoch = time.time()
        self.start_iteration = iteration
    err_dict = self.eval(iteration, summary_writer)
    self.checkpointer.save("model_{:07d}".format(iteration))
    summary_writer.close()
    return err_dict
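
# `create_new_timer` is defined elsewhere; from its usage above it must expose
# start/batch/epoch/log/checkpoint/tensorboard/validation fields, all seeded
# with the current wall-clock time. A minimal sketch under that assumption
# (the name is hypothetical; the real helper may differ):
import time
from types import SimpleNamespace

def create_new_timer_sketch():
    now = time.time()
    return SimpleNamespace(start=now, batch=now, epoch=now, log=now,
                           checkpoint=now, tensorboard=now, validation=now)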
def train(cfg, args):
    train_set = DatasetCatalog.get(cfg.DATASETS.TRAIN, args)
    val_set = DatasetCatalog.get(cfg.DATASETS.VAL, args)
    train_loader = DataLoader(train_set, cfg.SOLVER.IMS_PER_BATCH,
                              num_workers=cfg.DATALOADER.NUM_WORKERS,
                              shuffle=True)
    val_loader = DataLoader(val_set, cfg.SOLVER.IMS_PER_BATCH,
                            num_workers=cfg.DATALOADER.NUM_WORKERS,
                            shuffle=True)
    gpu_ids = list(range(torch.cuda.device_count()))
    model = build_model(cfg)
    model.to("cuda")
    model = torch.nn.parallel.DataParallel(
        model, gpu_ids) if not args.debug else model

    logger = logging.getLogger("train_logger")
    logger.info("Start training")
    train_metrics = MetricLogger(delimiter=" ")
    max_iter = cfg.SOLVER.MAX_ITER
    output_dir = cfg.OUTPUT_DIR
    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)
    checkpointer = Checkpointer(model, optimizer, scheduler, output_dir, logger)
    start_iteration = checkpointer.load() if not args.debug else 0
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    validation_period = cfg.SOLVER.VALIDATION_PERIOD
    summary_writer = SummaryWriter(log_dir=os.path.join(output_dir, "summary"))
    visualizer = train_set.visualizer(cfg.VISUALIZATION)(summary_writer)

    model.train()
    start_training_time = time.time()
    last_batch_time = time.time()
    for iteration, inputs in enumerate(cycle(train_loader), start_iteration):
        data_time = time.time() - last_batch_time
        iteration = iteration + 1
        scheduler.step()
        inputs = to_cuda(inputs)
        outputs = model(inputs)
        loss_dict = gather_loss_dict(outputs)
        loss = loss_dict["loss"]
        train_metrics.update(**loss_dict)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        batch_time = time.time() - last_batch_time
        last_batch_time = time.time()
        train_metrics.update(time=batch_time, data=data_time)
        eta_seconds = train_metrics.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                train_metrics.delimiter.join([
                    "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}",
                    "max mem: {memory:.0f}"
                ]).format(eta=eta_string,
                          iter=iteration,
                          meters=str(train_metrics),
                          lr=optimizer.param_groups[0]["lr"],
                          memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0))
            summary_writer.add_scalars("train", train_metrics.mean, iteration)
        if iteration % 100 == 0:
            visualizer.visualize(inputs, outputs, iteration)
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration))
        if iteration % validation_period == 0:
            with torch.no_grad():
                val_metrics = MetricLogger(delimiter=" ")
                for i, inputs in enumerate(val_loader):
                    data_time = time.time() - last_batch_time
                    inputs = to_cuda(inputs)
                    outputs = model(inputs)
                    loss_dict = gather_loss_dict(outputs)
                    val_metrics.update(**loss_dict)
                    batch_time = time.time() - last_batch_time
                    last_batch_time = time.time()
                    val_metrics.update(time=batch_time, data=data_time)
                    if i % 20 == 0 or i == cfg.SOLVER.VALIDATION_LIMIT:
                        logger.info(
                            val_metrics.delimiter.join([
                                "VALIDATION", "eta: {eta}", "iter: {iter}",
                                "{meters}"
                            ]).format(eta=eta_string,
                                      iter=iteration,
                                      meters=str(val_metrics)))
                    if i == cfg.SOLVER.VALIDATION_LIMIT:
                        summary_writer.add_scalars("val", val_metrics.mean,
                                                   iteration)
                        break
        if iteration == max_iter:
            break

    checkpointer.save("model_{:07d}".format(max_iter))
    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max_iter))
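
# `gather_loss_dict` is imported from elsewhere; from its call sites it must
# flatten the model output into a dict of scalar losses, including a total
# under "loss". Under DataParallel each loss term arrives with one value per
# GPU, so a mean-reduction is the natural choice. A hedged sketch only: the
# "losses" key, the assumption that it holds per-term losses, and the
# reduction are guesses, not the repo's actual code.
def gather_loss_dict_sketch(outputs):
    loss_dict = {k: v.mean() for k, v in outputs["losses"].items()}
    loss_dict["loss"] = sum(loss_dict.values())  # total used for backward()
    return loss_dict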
def train(self):
    # Save the configuration used for this run.
    with open(os.path.join(self.output_dir, "cfg.yaml"), 'w') as f:
        f.write(self.cfg.dump(indent=4))
    log.info(f'New training run with configuration:\n{self.cfg}\n\n')

    train_metrics = MetricLogger(delimiter=" ")
    summary_writer = SummaryWriter(log_dir=os.path.join(self.output_dir, "summary"))
    self.model.train()
    timers = create_new_timer()  # initialize wall-clock timers
    done = False
    while not done:
        for iteration, inputs in enumerate(self.train_loader, self.start_iteration):
            iter_time = time.time()
            data_time = iter_time - timers.batch
            inputs = to_cuda(inputs)
            out = self.model(inputs)
            loss_dict = out['loss_dict']
            loss = loss_dict["loss"]
            if torch.isnan(loss).any():
                raise Nan_Exception()
            train_metrics.update(**loss_dict)
            summary_writer.add_scalars("train_non_smooth",
                                       train_metrics.last_item, iteration)
            batch_time = iter_time - timers.batch
            timers.batch = iter_time
            train_metrics.update(time=batch_time, data=data_time)
            eta_seconds = timers.start + self.cfg.SOLVER.MAX_TIME_SECS - iter_time
            eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
            if iter_time - timers.log > self.cfg.SOLVER.PRINT_METRICS_TIME:
                timers.log = iter_time
                log.info(train_metrics.delimiter.join(
                    ["eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}",
                     "max mem: {memory:.0f}"]).format(
                        eta=eta_string,
                        iter=iteration,
                        meters=str(train_metrics),
                        lr=self.optimizer.param_groups[0]["lr"],
                        memory=proc_id.memory_info().rss / 1e9))
                summary_writer.add_scalars("train", train_metrics.mean, iteration)
            if iter_time - timers.checkpoint > self.cfg.SOLVER.CHECKPOINT_SECS:
                # formerly: iteration % checkpoint_period == 0
                timers.checkpoint = iter_time
                self.checkpointer.save("model_{:07d}".format(iteration))
            if (iter_time - timers.tensorboard > self.cfg.SOLVER.TENSORBOARD_SECS
                    or self.cfg.DEBUG):
                timers.tensorboard = iter_time
                summary_writer.add_scalars("train", train_metrics.mean, iteration)
            if iter_time - timers.start > self.cfg.SOLVER.MAX_TIME_SECS:
                log.info("finished training loop in {}".format(iter_time - timers.start))
                done = True
                break
            if iter_time - timers.validation > self.cfg.SOLVER.VALIDATION_SECS:
                err_dict = self.eval(iteration, summary_writer)
                timers.validation = time.time()
            loss.backward()
            self.optimizer.step()
            self.scheduler.step()
            self.optimizer.zero_grad()
        log.info("******* epoch done after {} *********".format(
            time.time() - timers.epoch))
        timers.epoch = time.time()
        self.start_iteration = iteration
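
# Both trainers rely on a Checkpointer exposing save(name) and, in the
# script-level train() above, a load() that returns the iteration to resume
# from. A minimal torch-only sketch under those assumptions; the class name
# and file layout are hypothetical, and the real class also takes a logger.
import glob
import os
import torch

class CheckpointerSketch:
    def __init__(self, model, optimizer, scheduler, output_dir):
        self.model, self.optimizer, self.scheduler = model, optimizer, scheduler
        self.output_dir = output_dir

    def save(self, name):
        torch.save({"model": self.model.state_dict(),
                    "optimizer": self.optimizer.state_dict(),
                    "scheduler": self.scheduler.state_dict()},
                   os.path.join(self.output_dir, name + ".pth"))

    def load(self):
        # Resume from the newest checkpoint, or start at iteration 0.
        ckpts = sorted(glob.glob(os.path.join(self.output_dir, "model_*.pth")))
        if not ckpts:
            return 0
        state = torch.load(ckpts[-1])
        self.model.load_state_dict(state["model"])
        self.optimizer.load_state_dict(state["optimizer"])
        self.scheduler.load_state_dict(state["scheduler"])
        return int(os.path.basename(ckpts[-1])[len("model_"):-len(".pth")])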
def eval(self, iteration, summary_writer):
    start = time.time()
    evals = []
    with torch.no_grad():
        self.model.eval()
        val_loss_logger = MetricLogger(delimiter=" ")
        for i, inputs in enumerate(self.val_loader, iteration):
            if torch.cuda.is_available():
                inputs = to_cuda(inputs)
            output = self.model(inputs, match=True)
            loss_dict = output["loss_dict"]
            is_possible = inputs['is_possible']
            magic_penalty = output['magic_penalty']
            # Record one entry per scene: whether it is physically possible
            # and the model's inverse likelihood ("magic penalty") for it.
            for j in range(len(magic_penalty)):
                evals.append({'is_possible': bool(is_possible[j]),
                              'inverse_likelihood': float(magic_penalty[j])})
            # An earlier variant scored scenes from per-axis location losses
            # instead of the magic penalty:
            # target = inputs['targets']
            # out = output['output']
            # existance = target['existance'][:, 1:]
            # loss_trans_x = torch.pow(out['location_x'].squeeze()
            #                          - target['location_x'][:, 1:], 2) * existance
            # loss_trans_y = torch.pow(out['location_y'].squeeze()
            #                          - target['location_y'][:, 1:], 2) * existance
            # loss_trans_z = torch.pow(out['location_z'].squeeze()
            #                          - target['location_z'][:, 1:], 2) * existance
            # loss = sum(l.mean(dim=2).mean(dim=1)
            #            for l in [loss_trans_x, loss_trans_y, loss_trans_z])
            # ...and logged the mean energy of possible vs. impossible scenes.
            val_loss_logger.update(**loss_dict)
        log.info(val_loss_logger.delimiter.join(["VALIDATION", "iter: {iter}", "{meters}"])
                 .format(iter=iteration, meters=str(val_loss_logger)))
        if summary_writer is not None:
            summary_writer.add_scalars("val", val_loss_logger.mean, iteration)
    with open("output.json", "w") as f:
        json.dump(evals, f)
    self.model.train()
    return None
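
# Downstream, the dumped evals can be scored by checking that impossible
# scenes receive a higher inverse likelihood than possible ones. A hedged
# consumer sketch (the function name and pairwise metric are illustrative;
# only the file name and record keys match the dump above):
import json

def relative_score_sketch(path="output.json"):
    with open(path) as f:
        evals = json.load(f)
    pos = [e["inverse_likelihood"] for e in evals if e["is_possible"]]
    neg = [e["inverse_likelihood"] for e in evals if not e["is_possible"]]
    # Fraction of (possible, impossible) pairs ranked correctly, i.e. where
    # the possible scene looks more likely than the impossible one.
    pairs = [(p, n) for p in pos for n in neg]
    return sum(p < n for p, n in pairs) / len(pairs)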