def eval_final():
    """Evaluate the model on the test set in an extended way.

    Runs inference over ``args.test_multisamp_n`` differently-seeded samplings
    of each test point cloud, averages the per-superpoint logits across the
    samplings, and stores the final hard predictions per file.

    Returns a tuple:
        (accuracy, overall_accuracy, average_IoU, per_class_iou dict,
         predictions dict {filename: argmax labels}, mean_class_accuracy,
         raw confusion matrix)

    Relies on module-level globals: model, args, dbinfo, ptnCloudEmbedder,
    create_dataset, filter_valid, meter_value.
    """
    model.eval()
    acc_meter = tnt.meter.ClassErrorMeter(accuracy=True)
    confusion_matrix = metrics.ConfusionMatrix(dbinfo['classes'])
    # collected: filename -> list of (logits, mode label, label histogram), one entry per sampling seed
    collected, predictions = defaultdict(list), {}
    # collect predictions over multiple sampling seeds
    for ss in range(args.test_multisamp_n):
        # create_dataset returns (train, test); index 1 is the test split for seed ss
        test_dataset_ss = create_dataset(args, ss)[1]
        loader = torch.utils.data.DataLoader(test_dataset_ss, batch_size=1,
                                             collate_fn=spg.eccpc_collate,
                                             num_workers=args.nworkers)
        if logging.getLogger().getEffectiveLevel() > logging.DEBUG:
            loader = tqdm(loader, ncols=100)
        # iterate over dataset in batches
        for bidx, (targets, GIs, clouds_data) in enumerate(loader):
            model.ecc.set_info(GIs, args.cuda)
            # targets columns: 0 = majority (mode) label, 1 = superpoint size, 2.. = per-class histogram
            label_mode_cpu, label_vec_cpu, segm_size_cpu = targets[:, 0], targets[:, 2:], targets[:, 1:].sum(1).float()
            embeddings = ptnCloudEmbedder.run(model, *clouds_data)
            outputs = model.ecc(embeddings)
            # strip the sampling-seed suffix after the last '.' to key by base filename
            fname = clouds_data[0][0][:clouds_data[0][0].rfind('.')]
            collected[fname].append(
                (outputs.data.cpu().numpy(), label_mode_cpu.numpy(), label_vec_cpu.numpy()))
    # aggregate predictions (mean of logits over samplings)
    for fname, lst in collected.items():
        o_cpu, t_cpu, tvec_cpu = list(zip(*lst))
        if args.test_multisamp_n > 1:
            o_cpu = np.mean(np.stack(o_cpu, 0), 0)
        else:
            o_cpu = o_cpu[0]
        # labels are identical across samplings; take the first copy
        t_cpu, tvec_cpu = t_cpu[0], tvec_cpu[0]
        predictions[fname] = np.argmax(o_cpu, 1)
        o_cpu, t_cpu, tvec_cpu = filter_valid(o_cpu, t_cpu, tvec_cpu)
        if t_cpu.size > 0:
            acc_meter.add(o_cpu, t_cpu)
            # Changed by Arthur:
            # *** WARNING: confusion matrix is commented for the ONERD because it doesn't have any label
            # confusion_matrix.count_predicted_batch(tvec_cpu, np.argmax(o_cpu,1))
    # NOTE(review): since count_predicted_batch is commented out above, the
    # confusion matrix is never filled — all CM-derived return values below
    # reflect an empty matrix; confirm this is intended for the ONERD dataset.
    per_class_iou = {}
    perclsiou = confusion_matrix.get_intersection_union_per_class()
    for c, name in dbinfo['inv_class_map'].items():
        per_class_iou[name] = perclsiou[c]
    return meter_value(acc_meter), confusion_matrix.get_overall_accuracy(), \
        confusion_matrix.get_average_intersection_union(), per_class_iou, predictions, \
        confusion_matrix.get_mean_class_accuracy(), confusion_matrix.confusion_matrix
def train():
    """Train the model for one epoch over train_dataset.

    Returns (accuracy, average_loss, overall_accuracy, average_IoU).

    Relies on module-level globals: model, args, train_dataset, optimizer,
    ptnCloudEmbedder, dbinfo, filter_valid.
    """
    model.train()
    loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size,
                                         collate_fn=spg.eccpc_collate,
                                         num_workers=args.nworkers, shuffle=True,
                                         drop_last=True)
    if logging.getLogger().getEffectiveLevel() > logging.DEBUG:
        loader = tqdm(loader, ncols=100)

    loss_meter = tnt.meter.AverageValueMeter()
    acc_meter = tnt.meter.ClassErrorMeter(accuracy=True)
    confusion_matrix = metrics.ConfusionMatrix(dbinfo['classes'])
    t0 = time.time()

    # iterate over dataset in batches
    for bidx, (targets, GIs, clouds_data) in enumerate(loader):
        t_loader = 1000 * (time.time() - t0)
        model.ecc.set_info(GIs, args.cuda)
        # targets columns: 0 = majority (mode) label, 1 = superpoint size, 2.. = per-class histogram
        label_mode_cpu, label_vec_cpu, segm_size_cpu = targets[:, 0], targets[:, 2:], targets[:, 1:].sum(1)
        if args.cuda:
            label_mode, label_vec, segm_size = label_mode_cpu.cuda(), label_vec_cpu.float().cuda(), segm_size_cpu.float().cuda()
        else:
            label_mode, label_vec, segm_size = label_mode_cpu, label_vec_cpu.float(), segm_size_cpu.float()

        optimizer.zero_grad()
        t0 = time.time()
        embeddings = ptnCloudEmbedder.run(model, *clouds_data)
        outputs = model.ecc(embeddings)
        loss = nn.functional.cross_entropy(outputs, Variable(label_mode))
        loss.backward()
        ptnCloudEmbedder.bw_hook()

        if args.grad_clip > 0:
            for p in model.parameters():
                # frozen/unused parameters have no gradient; skip them
                if p.grad is not None:
                    p.grad.data.clamp_(-args.grad_clip, args.grad_clip)
        optimizer.step()

        t_trainer = 1000 * (time.time() - t0)
        loss_meter.add(loss.item())  # pytorch 0.4 (was loss.data[0] on pytorch 0.3)
        o_cpu, t_cpu, tvec_cpu = filter_valid(outputs.data.cpu().numpy(),
                                              label_mode_cpu.numpy(), label_vec_cpu.numpy())
        acc_meter.add(o_cpu, t_cpu)
        confusion_matrix.count_predicted_batch(tvec_cpu, np.argmax(o_cpu, 1))
        # BUG FIX: was loss.data[0], which raises on 0-dim tensors in pytorch >= 0.4
        # (the loop above already uses loss.item()); made consistent.
        logging.debug('Batch loss %f, Loader time %f ms, Trainer time %f ms.',
                      loss.item(), t_loader, t_trainer)
        t0 = time.time()

    return acc_meter.value()[0], loss_meter.value()[0], \
        confusion_matrix.get_overall_accuracy(), confusion_matrix.get_average_intersection_union()
def eval(epoch):
    """Evaluate the two-stage (coarse ECC + fine point-level) model on the test set.

    Superpoints predicted as class 5 are treated as "mixed" nodes; after
    ``args.fine_seg_epo`` epochs their raw points are re-segmented by the
    fine-grained module (model.FineModule).

    Returns (coarse_accuracy, fine_accuracy, overall_accuracy, average_IoU,
    mean_class_accuracy, confusion_matrix, elapsed_seconds_str).

    Relies on module-level globals: model, args, test_dataset, ptnCloudEmbedder,
    filter_valid, meter_value, timer.
    """
    start = timer()
    model.eval()
    loader = torch.utils.data.DataLoader(test_dataset, batch_size=1,
                                         collate_fn=spg.eccpc_collate,
                                         num_workers=args.nworkers)
    if logging.getLogger().getEffectiveLevel() > logging.DEBUG:
        loader = tqdm(loader, ncols=100)
    acc_meter = tnt.meter.ClassErrorMeter(accuracy=True)   # coarse (superpoint) accuracy
    acc_meter2 = tnt.meter.ClassErrorMeter(accuracy=True)  # fine (point) accuracy
    confusion_matrix = metrics.ConfusionMatrix(5)

    # iterate over dataset in batches
    for bidx, (targets, GIs, clouds_data) in enumerate(loader):
        model.ecc.set_info(GIs, args.cuda)
        # targets columns: 0 = majority label, 1 = superpoint size, 2.. = per-class histogram
        label_mode_cpu, label_vec_cpu, segm_size_cpu = targets[:, 0], targets[:, 2:], targets[:, 1:].sum(1).float()
        embeddings = ptnCloudEmbedder.run(model, *clouds_data[0:4])
        outputs = model.ecc(embeddings)

        # class 5 marks a "mixed" superpoint that needs fine segmentation
        mixed_node = np.where(np.argmax(np.array(outputs.data), axis=1) == 5)
        Nomixed_node = np.where(np.argmax(np.array(outputs.data), axis=1) != 5)

        if epoch > args.fine_seg_epo and len(mixed_node[0]) > 0:
            for i in range(len(mixed_node[0])):
                # NOTE(review): hard-coded dataset path; should come from args (cf. train(), which uses args.TANKER_PATH)
                fname = '/home/data2/qc/large_scalepcss/learning/datasets/tanker/parsed/' \
                    + clouds_data[0][mixed_node[0][i]].split('.')[0] + '.h5'
                fname_G = clouds_data[0][mixed_node[0][i]].split('.')[1]
                # BUG FIX: h5py handle was never closed (leaked one descriptor per mixed node)
                with h5py.File(fname, 'r') as hf:
                    P = hf[fname_G]
                    # only refine superpoints with enough raw points
                    if np.shape(P)[0] > args.fine_seg_point_num:
                        tempt1 = np.array(P[:, :13]).T  # per-point features -> (13, n_points)
                        tempt = torch.from_numpy(tempt1.reshape((1, np.shape(tempt1)[0], np.shape(tempt1)[1])))
                        label_modebran = torch.from_numpy(P[:, -1]).long()  # last column: per-point label
                        if args.cuda:
                            tempt = tempt.cuda()
                            label_modebran = label_modebran.cuda()
                        outputsbran = model.FineModule(Variable(tempt.float(),
                                                                requires_grad=model.training,
                                                                volatile=not model.training))
                        _outputsbran, _label_modebran = filter_valid(outputsbran.data.cpu().numpy(),
                                                                     label_modebran.cpu().numpy())
                        acc_meter2.add(_outputsbran, _label_modebran)
                        confusion_matrix.count_predicted_batch_branch(_outputsbran, _label_modebran)

        o_cpu, t_cpu, tvec_cpu = filter_valid(outputs.data.cpu().numpy(),
                                              label_mode_cpu.numpy(), label_vec_cpu.numpy())
        if t_cpu.size > 0:
            # BUG FIX: was `epoch>10 & len(mixed_node[0])>0`; `&` binds tighter than `>`
            # so it parsed as `epoch > (10 & len(...)) > 0`, not the intended conjunction.
            if epoch > 10 and len(mixed_node[0]) > 0:
                # score only the non-mixed superpoints; mixed ones were handled by the fine module
                acc_meter.add(o_cpu[Nomixed_node[0], :], t_cpu[[Nomixed_node[0]]])
            else:
                acc_meter.add(o_cpu, t_cpu)
            confusion_matrix.count_predicted_batch(tvec_cpu[[Nomixed_node[0]]],
                                                   np.argmax(o_cpu[Nomixed_node[0], :], 1))
    end = timer()
    return meter_value(acc_meter), meter_value(acc_meter2), \
        confusion_matrix.get_overall_accuracy(), confusion_matrix.get_average_intersection_union(), \
        confusion_matrix.get_mean_class_accuracy(), confusion_matrix.get_confusion_matrix(), \
        str(end - start)
def learningPhase(args, model, loader, w, optimizer, metrics):
    """Train `model` for one epoch over `loader`.

    :param args: namespace with .cuda, .c (class list), .nodata (ignore index)
    :param model: network to train (set to train mode here)
    :param loader: data loader yielding (images, ground truth, filename)
    :param w: per-class weight tensor for the cross-entropy loss
    :param optimizer: optimizer to step
    :param metrics: bool — when True, also accumulate the confusion matrix
    :return: (confusion matrix, mean epoch loss)
    """
    model.train()  # model in training mode
    # initialize metric containers
    loss_meter = tnt.meter.AverageValueMeter()
    cm = met.ConfusionMatrix(len(args.c), args.c, args.nodata)
    # loop through batches given by the data reader; drop the file name (last tuple item)
    for batch_ndx, (imgs, gt, __) in enumerate(tqdm(loader)):
        optimizer.zero_grad()  # reset gradients
        # if GPU available, move the batch onto it
        if args.cuda:
            batch_tensor = imgs.cuda()
        else:
            batch_tensor = imgs
        # generate prediction, then move it to CPU for the metric computations
        prediction = model(batch_tensor)
        prediction = prediction.cpu()
        # get & save batch loss and do backward
        loss = nn.functional.cross_entropy(prediction, gt, weight=w, ignore_index=args.nodata)
        loss.backward()
        loss_meter.add(loss.item())
        # clamp gradients to avoid large weight updates.
        # BUG FIX: was p.grad.data.clamp(-1, 1) — clamp() is out-of-place and its
        # result was discarded, so no clipping ever happened; clamp_ works in place.
        for p in model.parameters():
            if p.grad is not None:  # frozen/unused parameters have no gradient
                p.grad.data.clamp_(-1, 1)
        optimizer.step()
        # accumulate confusion-matrix metrics only when requested
        if metrics:
            for i in range(prediction.size()[0]):
                pred = prediction[i].argmax(0).squeeze()
                cm.add_batch(gt[i].numpy(), pred.numpy())
        # free memory
        del imgs
        del gt
        del batch_tensor
        del prediction
    return cm, loss_meter.value()[0]
def eval():
    """ Evaluated model on test set """
    model.eval()
    data_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=1, collate_fn=spg.eccpc_collate_test,
        num_workers=args.nworkers, drop_last=False)
    accuracy = tnt.meter.ClassErrorMeter(accuracy=True)
    cm = metrics.ConfusionMatrix(dbinfo['classes'],
                                 ignore_label=args.metric_ignore_class)
    started = time.time()
    # one test cloud per batch
    for idx, batch in enumerate(data_loader):
        targets, graph_infos, clouds, clouds_orig, edges_for_ext, fnames = batch
        model.ecc.set_info(graph_infos, args.cuda)
        # label_mode_cpu, label_vec_cpu, segm_size_cpu = targets[:,0], targets[:,2:], targets[:,1:].sum(1).float()
        # targets columns: 0 = weak label, 1 = majority label, 2.. = per-class histogram
        weak_labels = targets[:, 0]
        labels_mode = targets[:, 1]
        labels_hist = targets[:, 2:]
        seg_sizes = targets[:, 2:].sum(1)
        emb = ptnCloudEmbedder.run(model, *clouds)
        logits, _ = model.ecc(emb)
        valid_logits, valid_labels, valid_hist = filter_valid(
            logits.data.cpu().numpy(), labels_mode.numpy(), labels_hist.numpy())
        if valid_labels.size > 0:
            accuracy.add(valid_logits, valid_labels)
            cm.count_predicted_batch(valid_hist, np.argmax(valid_logits, 1))
        print('{}/{}-{} outputs: {}, acc: {}, macc: {}'.format(
            idx, len(data_loader), fnames[0], logits.shape,
            cm.get_overall_accuracy(), cm.get_mean_class_accuracy()))
    return (meter_value(accuracy), cm.get_overall_accuracy(),
            cm.get_mean_class_accuracy(), cm.get_average_intersection_union(),
            time.time() - started)
def evaluate(test_dataset):
    """Score the emotion classifier over test_dataset and return a classification report."""
    cm = metrics.ConfusionMatrix(model_config.n_classes)
    for _, (speaker, utterance, emotion) in enumerate(test_dataset):
        # drop the singleton middle axis -> (batch_size, dial_len)
        speaker = tf.squeeze(speaker)
        emotion = tf.squeeze(emotion)
        # padding mask over tokens: 1.0 where a token is present
        pad_mask = tf.cast(tf.math.not_equal(utterance, 0), dtype=tf.float32)
        encoded = encode_utterance(utterance)
        # logits shape: (batch_size, dial_len, n_classes); training=False
        logits = model(encoded, False, pad_mask)
        # weight an utterance only if it has at least one non-pad token
        has_tokens = tf.math.not_equal(tf.math.reduce_sum(pad_mask, axis=2), 0)
        weights = tf.cast(has_tokens, dtype=tf.float32)
        predicted = tf.math.argmax(logits, axis=2)
        cm(emotion, predicted, sample_weight=weights)
    return metrics.classification_report(cm)
def eval():
    """ Evaluated model on test set """
    model.eval()
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=1, collate_fn=spg.eccpc_collate,
        num_workers=args.nworkers)
    if logging.getLogger().getEffectiveLevel() > logging.DEBUG:
        test_loader = tqdm(test_loader, ncols=100)
    accuracy = tnt.meter.ClassErrorMeter(accuracy=True)
    cm = metrics.ConfusionMatrix(dbinfo['classes'])
    # one test cloud per batch
    for _, (targets, graph_infos, clouds) in enumerate(test_loader):
        model.ecc.set_info(graph_infos, args.cuda)
        # targets columns: 0 = majority label, 1 = superpoint size, 2.. = per-class histogram
        labels_mode = targets[:, 0]
        labels_hist = targets[:, 2:]
        seg_sizes = targets[:, 1:].sum(1).float()
        emb = ptnCloudEmbedder.run(model, *clouds)
        logits = model.ecc(emb)
        valid_logits, valid_labels, valid_hist = filter_valid(
            logits.data.cpu().numpy(), labels_mode.numpy(), labels_hist.numpy())
        if valid_labels.size > 0:
            accuracy.add(valid_logits, valid_labels)
            cm.count_predicted_batch(valid_hist, np.argmax(valid_logits, 1))
    return (meter_value(accuracy), cm.get_overall_accuracy(),
            cm.get_average_intersection_union(), cm.get_mean_class_accuracy())
def evalutionPhase(args, model, loader, w):
    """Run one validation pass of `model` over `loader`.

    :param args: namespace with .cuda, .c (class list), .nodata (ignore index)
    :param model: network to evaluate (set to eval mode here)
    :param loader: data loader yielding (images, ground truth, filename)
    :param w: per-class weight tensor for the cross-entropy loss
    :return: (confusion matrix, mean epoch loss)
    """
    model.eval()  # inference mode (no dropout / running batch-norm stats)
    # metric containers
    avg_loss = tnt.meter.AverageValueMeter()
    conf_mat = met.ConfusionMatrix(len(args.c), args.c, args.nodata)
    # loop through batches; the file name (last tuple item) is unused
    for imgs, gt, __ in tqdm(loader):
        # move the batch to the GPU when available
        batch_tensor = imgs.cuda() if args.cuda else imgs
        # forward pass, then bring the prediction back to the CPU
        prediction = model(batch_tensor).cpu()
        # accumulate the batch loss
        loss = nn.functional.cross_entropy(prediction, gt, weight=w,
                                           ignore_index=args.nodata)
        avg_loss.add(loss.item())
        # per-image confusion-matrix update
        for sample_idx in range(prediction.size()[0]):
            hard_pred = prediction[sample_idx].argmax(0).squeeze()
            conf_mat.add_batch(gt[sample_idx].numpy(), hard_pred.numpy())
        # free memory
        del imgs
        del gt
        del batch_tensor
        del prediction
    return conf_mat, avg_loss.value()[0]
def train(config, model_dir, writer):
    """
    Function train and evaluate a part segmentation model for the Shapenet
    dataset. The training parameters are specified in the config file
    (for more details see config/config.py).

    :param config: Dictionary with configuration parameters
    :param model_dir: Checkpoint save directory
    :param writer: Tensorboard SummaryWriter object
    """
    phases = ['train', 'test']
    # phases = ['test', 'train']
    datasets, dataloaders, num_classes = ds.get_s3dis_dataloaders(
        root_dir=config['root_dir'],
        phases=phases,
        batch_size=config['batch_size'],
        category=config['category'],
        augment=config['augment'])
    # add number of classes to config
    config['num_classes'] = num_classes
    # we now set GPU training parameters
    # if the given index is not available then we use index 0
    # also when using multi gpu we should specify index 0
    if config['gpu_index'] + 1 > torch.cuda.device_count() or config['multi_gpu']:
        config['gpu_index'] = 0
    logging.info('Using GPU cuda:{}, script PID {}'.format(config['gpu_index'], os.getpid()))
    if config['multi_gpu']:
        logging.info('Training on multi-GPU mode with {} devices'.format(
            torch.cuda.device_count()))
    device = torch.device('cuda:{}'.format(config['gpu_index']))
    # we load the model defined in the config file
    # todo: now the code is IO bound. No matter which network we use, it is similar speed.
    model = res.sfc_resnet_8(in_channels=config['in_channels'],
                             num_classes=config['num_classes'],
                             kernel_size=config['kernel_size'],
                             channels=config['channels'],
                             use_tnet=config['use_tnet'],
                             n_points=config['n_points']).to(device)
    logging.info('the number of params is {: .2f} M'.format(
        utl.count_model_params(model) / (1e6)))
    # if use multi_gpu then convert the model to DataParallel
    if config['multi_gpu']:
        model = nn.DataParallel(model)
    # create optimizer, loss function, and lr scheduler
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=config['lr'],
                                 weight_decay=1e-4)
    criterion = nn.CrossEntropyLoss().to(device)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=config['lr_decay'],
        patience=config['lr_patience'],
        verbose=True)  # verbose. recommended to use.
    logging.info('Config {}'.format(config))
    logging.info('TB logs and checkpoint will be saved in {}'.format(model_dir))
    utl.dump_config_details_to_tensorboard(writer, config)
    # create metric trackers: we track loss and the confusion matrix (for mIoU)
    trackers = {
        x: {
            'loss': metrics.LossMean(),
            'cm': metrics.ConfusionMatrix(num_classes=int(config['num_classes']))
        }
        for x in phases
    }
    # create initial best state object
    best_state = {
        'config': config,
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'scheduler': scheduler.state_dict() if scheduler else None,
        'train_loss': float('inf'),
        'test_loss': float('inf'),
        'train_mIoU': 0.0,
        'test_mIoU': 0.0,
        'convergence_epoch': 0,
        'num_epochs_since_best_acc': 0
    }
    # now we train!
    for epoch in range(config['max_epochs']):
        for phase in phases:
            if phase == 'train':
                model.train()
            else:
                model.eval()
            # reset metrics
            trackers[phase]['loss'].reset()
            trackers[phase]['cm'].reset()
            # use tqdm to show progress and print message
            # this is for loading our new data format
            for step_number, batchdata in enumerate(
                    tqdm(dataloaders[phase],
                         desc='[{}/{}] {} '.format(epoch + 1,
                                                   config['max_epochs'],
                                                   phase))):
                # concatenate point positions and features, then move channels
                # first: (B, N, C) -> (B, C, N)
                data = torch.cat((batchdata.pos, batchdata.x),
                                 dim=2).transpose(1, 2).to(device,
                                                           dtype=torch.float)
                label = batchdata.y.to(device, dtype=torch.long)
                # should we release the memory?
                # todo: add data augmentation
                # compute gradients on train only
                with torch.set_grad_enabled(phase == 'train'):
                    out = model(data)
                    loss = criterion(out, label)
                    if phase == 'train':
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()
                # now we update metrics
                trackers[phase]['loss'].update(average_loss=loss,
                                               batch_size=data.size(0))
                trackers[phase]['cm'].update(y_true=label, y_logits=out)
            # compare with my metrics
            epoch_loss = trackers[phase]['loss'].result()
            epoch_iou = trackers[phase]['cm'].result(metric='iou').mean()
            # we update our learning rate scheduler if loss does not improve
            # NOTE(review): steps on the *train* loss (the other trainer in this
            # file steps on test loss) — confirm which is intended.
            if phase == 'train' and scheduler:
                scheduler.step(epoch_loss)
                writer.add_scalar('params/lr', optimizer.param_groups[0]['lr'],
                                  epoch + 1)
            # log current results and dump in Tensorboard
            logging.info(
                '[{}/{}] {} Loss: {:.2e}. mIOU {:.4f} \t best testing mIOU {:.4f}'
                .format(epoch + 1, config['max_epochs'], phase, epoch_loss,
                        epoch_iou, best_state['test_mIoU']))
            writer.add_scalar('loss/epoch_{}'.format(phase), epoch_loss,
                              epoch + 1)
            writer.add_scalar('mIoU/epoch_{}'.format(phase), epoch_iou,
                              epoch + 1)
        # after each epoch we update best state values as needed
        # first we save our state when we get better test accuracy
        test_iou = trackers['test']['cm'].result(metric='iou').mean()
        if best_state['test_mIoU'] > test_iou:
            best_state['num_epochs_since_best_acc'] += 1
        else:
            logging.info('Got a new best model with iou {:.4f}'.format(test_iou))
            best_state = {
                'config': config,
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict() if scheduler else None,
                'train_loss': trackers['train']['loss'].result(),
                'test_loss': trackers['test']['loss'].result(),
                'train_mIoU': trackers['train']['cm'].result(metric='iou').mean(),
                'test_mIoU': test_iou,
                'convergence_epoch': epoch + 1,
                'num_epochs_since_best_acc': 0
            }
            file_name = os.path.join(model_dir, 'best_state.pth')
            torch.save(best_state, file_name)
            logging.info('saved checkpoint in {}'.format(file_name))
        # we check for early stopping when we have trained a min number of epochs
        if epoch >= config['min_epochs'] and best_state[
                'num_epochs_since_best_acc'] >= config['early_stopping']:
            logging.info('Accuracy did not improve for {} iterations!'.format(
                config['early_stopping']))
            logging.info('[Early stopping]')
            break
    utl.dump_best_model_metrics_to_tensorboard(writer, phases, best_state)
    logging.info('************************** DONE **************************')
# pred.shape == (batch_size, dial_len, n_classes) # mask.shape == (batch_size, dial_len, sent_len) sample_weight = tf.gather(train_config.loss_weights, real) # (batch_size, dial_len) loss = loss_object(real, pred, sample_weight=sample_weight) # (batch_size, dial_len) mask = tf.cast(tf.math.not_equal(tf.math.reduce_sum(mask, -1), 0), dtype=loss.dtype) # (batch_size, dial_len) loss *= mask return tf.math.reduce_sum(loss) / tf.math.reduce_sum(mask) train_loss = tf.keras.metrics.Mean(name='train_loss') # train_accuracy = tf.keras.metrics.Accuracy(name='train_accuracy') train_confusion_matrix = metrics.ConfusionMatrix(model_config.n_classes) def train_step(speaker, utterance, emotion): # speaker.shape == (batch_size, 1, dial_len) # emotion.shape == (batch_size, 1, dial_len) # utterance.shape == (batch_size, dial_len, sent_len) speaker = tf.squeeze(speaker) # (batch_size, dial_len) emotion = tf.squeeze(emotion) # (batch_size, dial_len) mask = tf.cast(tf.math.not_equal(utterance, 0), dtype=tf.float32) with tf.GradientTape() as tape: predictions = model(utterance, True, mask) # (batch_size, dial_len, n_classes)
if __name__ == '__main__':
    # Load the segmentation config and distributed-training environment variables.
    cfg.update_from_file("testunet.yaml")
    cfg.TRAINER_ID = int(os.getenv("PADDLE_TRAINER_ID", 0))
    cfg.NUM_TRAINERS = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
    cfg.check_and_infer()
    # print(pprint.pformat(cfg))
    dataset = myDataset.SegDataset(file_list=cfg.DATASET.TRAIN_FILE_LIST,
                                   shuffle=True,
                                   mode=ModelPhase.TRAIN,
                                   data_dir=cfg.DATASET.DATA_DIR)
    conf_mat = metrics.ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=False)
    i = 0
    # Iterate a few samples (~21) to exercise the metric pipeline, then stop.
    for img, grt, ignore, imgssrc in dataset.generator():
        # NOTE(review): ground truth is passed as both prediction and label, so
        # IoU/accuracy should be ~1.0 apart from `ignore`-masked pixels — this
        # looks like a sanity check of ConfusionMatrix, not a real evaluation;
        # confirm intent. Arrays reshaped to NHWC with singleton N and C axes.
        conf_mat.calculate(grt[np.newaxis, :, :, np.newaxis],
                           grt[np.newaxis, :, :, np.newaxis],
                           ignore[np.newaxis, :, :, np.newaxis])
        _, iou = conf_mat.mean_iou()
        _, acc = conf_mat.accuracy()
        print(iou, acc)
        if i > 20:
            break
        i += 1

#%%
# NOTE(review): notebook-style cell left in the script — bare expression,
# relies on `ignore` from the loop above; has no effect when run as a script.
ignore.shape
def train(dataset, model_dir, writer):
    """Train and evaluate a segmentation model (unet or deeplab) on `dataset`.

    :param dataset: dataset wrapper exposing .get_dataloaders() and .config
                    (gpu_index, multi_gpu, model, num_feats, num_classes,
                    kernel_size, backbone, sigma, lr, lr_decay, lr_patience,
                    max_epochs, min_epochs, early_stopping)
    :param model_dir: checkpoint save directory
    :param writer: Tensorboard SummaryWriter object
    """
    dataloaders = dataset.get_dataloaders()
    # we now set GPU training parameters
    # if the given index is not available then we use index 0
    # also when using multi gpu we should specify index 0
    if dataset.config.gpu_index + 1 > torch.cuda.device_count() or dataset.config.multi_gpu:
        dataset.config.gpu_index = 0
    logging.info('Using GPU cuda:{}, script PID {}'.format(
        dataset.config.gpu_index, os.getpid()))
    if dataset.config.multi_gpu:
        logging.info('Training on multi-GPU mode with {} devices'.format(
            torch.cuda.device_count()))
    device = torch.device('cuda:{}'.format(dataset.config.gpu_index))
    # instantiate the network selected in the config
    if dataset.config.model == 'unet':
        model = unet(input_size=dataset.config.num_feats,
                     num_classes=dataset.config.num_classes,
                     kernel_size=dataset.config.kernel_size).to(device)
    else:
        model = deeplab(backbone=dataset.config.backbone,
                        input_size=dataset.config.num_feats,
                        num_classes=dataset.config.num_classes,
                        kernel_size=dataset.config.kernel_size,
                        sigma=dataset.config.sigma).to(device)
    # if use multi_gpu then convert the model to DataParallel
    if dataset.config.multi_gpu:
        model = nn.DataParallel(model)
    # create optimizer, loss function, and lr scheduler
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=dataset.config.lr,
                                 weight_decay=1e-4)
    criterion = nn.CrossEntropyLoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=dataset.config.lr_decay,
        patience=dataset.config.lr_patience,
        verbose=True)
    logging.info('Config {}'.format(dataset.config))
    logging.info('TB logs and checkpoint will be saved in {}'.format(model_dir))
    phases = ['train', 'test']
    # create metric trackers: we track loss, class accuracy, and overall accuracy
    trackers = {
        x: {
            'loss': metrics.LossMean(),
            'acc': metrics.Accuracy(),
            'iou': None,
            'cm': metrics.ConfusionMatrix(
                num_classes=int(dataset.config.num_classes))
        }
        for x in phases
    }
    # create initial best state object
    best_state = {
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'scheduler': scheduler.state_dict() if scheduler else None,
        'train_loss': float('inf'),
        'test_loss': float('inf'),
        'train_acc': 0.0,
        'test_acc': 0.0,
        'train_mIoU': 0.0,
        'test_mIoU': 0.0,
        'convergence_epoch': 0,
        'num_epochs_since_best_acc': 0
    }
    # now we train!
    for epoch in range(dataset.config.max_epochs):
        for phase in phases:
            if phase == 'train':
                model.train()
            else:
                model.eval()
            # reset metrics
            trackers[phase]['loss'].reset()
            trackers[phase]['cm'].reset()
            for step_number, inputs in enumerate(
                    tqdm(dataloaders[phase],
                         desc='[{}/{}] {} '.format(epoch + 1,
                                                   dataset.config.max_epochs,
                                                   phase))):
                # inputs: (features, coordinates, labels); (B, N, C) -> (B, C, N)
                data = inputs[0].to(device, dtype=torch.float).permute(0, 2, 1)
                coords = inputs[1].to(device, dtype=torch.float).permute(0, 2, 1)
                label = inputs[2].to(device, dtype=torch.long)
                # compute gradients on train only
                with torch.set_grad_enabled(phase == 'train'):
                    out = model(data, coords)
                    loss = criterion(out, label)
                    if phase == 'train':
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()
                # now we update metrics
                trackers[phase]['loss'].update(average_loss=loss,
                                               batch_size=data.size(0))
                trackers[phase]['cm'].update(y_true=label, y_logits=out)
            logging.info('Computing accuracy...')
            # compare with my metrics
            epoch_loss = trackers[phase]['loss'].result()
            epoch_overall_acc = trackers[phase]['cm'].result(metric='accuracy')
            epoch_iou = trackers[phase]['cm'].result(metric='iou')
            epoch_miou = epoch_iou.mean()
            logging.info(
                '--------------------------------------------------------------------------------'
            )
            logging.info(
                '[{}/{}] {} Loss: {:.2e}. Overall Acc: {:.4f}. mIoU {:.4f}'.
                format(epoch + 1, dataset.config.max_epochs, phase, epoch_loss,
                       epoch_overall_acc, epoch_miou))
            iou_per_class_str = ' '.join(
                ['{:.4f}'.format(s) for s in epoch_iou])
            logging.info('IoU per class: {}'.format(iou_per_class_str))
            logging.info(
                '--------------------------------------------------------------------------------'
            )
            # we update our learning rate scheduler if loss does not improve
            if phase == 'test' and scheduler:
                scheduler.step(epoch_loss)
                writer.add_scalar('params/lr', optimizer.param_groups[0]['lr'],
                                  epoch + 1)
            writer.add_scalar('loss/epoch_{}'.format(phase), epoch_loss,
                              epoch + 1)
            writer.add_scalar('miou/epoch_{}'.format(phase), epoch_miou,
                              epoch + 1)
            writer.add_scalar('acc_all/epoch_{}'.format(phase),
                              epoch_overall_acc, epoch + 1)
        # after each epoch we update best state values as needed
        # first we save our state when we get better test accuracy
        if best_state['test_mIoU'] > trackers['test']['cm'].result(
                metric='iou').mean():
            best_state['num_epochs_since_best_acc'] += 1
        else:
            logging.info('Got a new best model with mIoU {:.4f}'.format(
                trackers['test']['cm'].result(metric='iou').mean()))
            best_state = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict() if scheduler else None,
                'train_loss': trackers['train']['loss'].result(),
                'test_loss': trackers['test']['loss'].result(),
                'train_acc': trackers['train']['cm'].result(metric='accuracy'),
                'test_acc': trackers['test']['cm'].result(metric='accuracy'),
                'train_mIoU': trackers['train']['cm'].result(metric='iou').mean(),
                'test_mIoU': trackers['test']['cm'].result(metric='iou').mean(),
                'convergence_epoch': epoch + 1,
                'num_epochs_since_best_acc': 0
            }
            file_name = os.path.join(model_dir, 'best_state.pth')
            torch.save(best_state, file_name)
            logging.info('saved checkpoint in {}'.format(file_name))
        # we check for early stopping when we have trained a min number of epochs
        if epoch >= dataset.config.min_epochs and best_state[
                'num_epochs_since_best_acc'] >= dataset.config.early_stopping:
            logging.info('Accuracy did not improve for {} iterations!'.format(
                dataset.config.early_stopping))
            logging.info('[Early stopping]')
            break
    utl.dump_best_model_metrics_to_tensorboard(writer, phases, best_state)
    logging.info('************************** DONE **************************')
def eval_final():
    """Evaluate the two-stage model on the test set in an extended way.

    Computes estimates over multiple samplings of the point clouds and stores
    predictions. Superpoints predicted as class 5 ("mixed" nodes) are sent to
    the fine-grained module (model.FineModule) and scored separately
    (acc_meter2); the remaining superpoints are aggregated across samplings
    and scored as usual.

    Returns (coarse_accuracy, fine_accuracy, overall_accuracy, average_IoU,
    per_class_iou (currently empty), predictions, mean_class_accuracy,
    confusion_matrix, elapsed_seconds_str).

    Relies on module-level globals: model, args, ptnCloudEmbedder,
    create_dataset, filter_valid, meter_value, timer.
    """
    start = timer()
    model.eval()
    acc_meter2 = tnt.meter.ClassErrorMeter(accuracy=True)  # fine (point) accuracy
    acc_meter = tnt.meter.ClassErrorMeter(accuracy=True)   # coarse (superpoint) accuracy
    confusion_matrix = metrics.ConfusionMatrix(5)
    # collected: filename -> list of (logits, mode label, label histogram), one per sampling seed
    collected, predictions = defaultdict(list), {}
    # collect predictions over multiple sampling seeds
    for ss in range(args.test_multisamp_n):
        test_dataset_ss = create_dataset(args, ss)[1]
        loader = torch.utils.data.DataLoader(test_dataset_ss, batch_size=1,
                                             collate_fn=spg.eccpc_collate,
                                             num_workers=args.nworkers)
        if logging.getLogger().getEffectiveLevel() > logging.DEBUG:
            loader = tqdm(loader, ncols=100)
        # iterate over dataset in batches
        for bidx, (targets, GIs, clouds_data) in enumerate(loader):
            model.ecc.set_info(GIs, args.cuda)
            # targets columns: 0 = majority label, 1 = superpoint size, 2.. = per-class histogram
            label_mode_cpu, label_vec_cpu, segm_size_cpu = targets[:,0], targets[:,2:], targets[:,1:].sum(1).float()
            # base filename (sampling suffix after last '.' stripped)
            fname = clouds_data[0][0][:clouds_data[0][0].rfind('.')]
            embeddings = ptnCloudEmbedder.run(model, *clouds_data[0:4])
            outputs = model.ecc(embeddings)
            # C: indices of "mixed" superpoints (predicted class 5); Cother: the rest
            C = np.where(np.argmax(np.array(outputs.data), axis=1) == 5)
            Cother = np.where(np.argmax(np.array(outputs.data), axis=1) != 5)
            for i in range(len(C[0])):
                fname_G = clouds_data[0][C[0][i]].split('.')[1]
                # NOTE(review): hard-coded dataset path; the h5py handle is
                # never closed (leaks one descriptor per mixed node) — the
                # train()/eval() variants in this file have the same pattern.
                hf = h5py.File('/home/data2/qc/large_scalepcss/learning/datasets/tanker/parsed/' + fname + '.h5', 'r')
                P = hf[fname_G]
                # only refine superpoints with more than 2000 raw points
                # NOTE(review): 2000 is hard-coded here but args.fine_seg_point_num elsewhere — confirm.
                if np.shape(P)[0] > 2000:
                    tempt1 = np.array(P[:,:13]).T  # per-point features -> (13, n_points)
                    tempt = torch.from_numpy(tempt1.reshape((1, np.shape(tempt1)[0], np.shape(tempt1)[1])))
                    label_modebran = torch.from_numpy(P[:,-1]).long()  # last column: per-point label
                    if args.cuda:
                        tempt = tempt.cuda()
                        label_modebran = label_modebran.cuda()
                    outputsbran = model.FineModule(Variable(tempt.float(),
                                                            requires_grad=model.training,
                                                            volatile=not model.training))
                    _outputsbran, _label_modebran = filter_valid(outputsbran.data.cpu().numpy(),
                                                                 label_modebran.cpu().numpy())
                    acc_meter2.add(_outputsbran, _label_modebran)
                    confusion_matrix.count_predicted_batch_branch(_outputsbran, _label_modebran)
            # keep only non-mixed superpoints for the coarse aggregation
            if len(C[0]) > 0:
                collected[fname].append((outputs.data.cpu().numpy()[Cother[0],:],
                                         label_mode_cpu.numpy()[[Cother[0]]],
                                         label_vec_cpu.numpy()[Cother[0],:]))
            else:
                collected[fname].append((outputs.data.cpu().numpy(),
                                         label_mode_cpu.numpy(),
                                         label_vec_cpu.numpy()))
    # aggregate predictions (mean of logits over samplings)
    for fname, lst in collected.items():
        o_cpu, t_cpu, tvec_cpu = list(zip(*lst))
        if args.test_multisamp_n > 1:
            o_cpu = np.mean(np.stack(o_cpu,0),0)
        else:
            o_cpu = o_cpu[0]
        t_cpu, tvec_cpu = t_cpu[0], tvec_cpu[0]
        predictions[fname] = np.argmax(o_cpu,1)
        o_cpu, t_cpu, tvec_cpu = filter_valid(o_cpu, t_cpu, tvec_cpu)
        if t_cpu.size > 0:
            acc_meter.add(o_cpu, t_cpu)
            confusion_matrix.count_predicted_batch(tvec_cpu, np.argmax(o_cpu,1))
    # per-class IoU reporting is disabled; returns an empty dict
    per_class_iou = {}
    #perclsiou = confusion_matrix.get_intersection_union_per_class()  # needs to be restored ("xuyaohuifu")
    #for c, name in dbinfo['inv_class_map'].items():
    #    per_class_iou[name] = perclsiou[c]
    end = timer()
    return meter_value(acc_meter), meter_value(acc_meter2), \
        confusion_matrix.get_overall_accuracy(), \
        confusion_matrix.get_average_intersection_union(), per_class_iou, predictions, \
        confusion_matrix.get_mean_class_accuracy(), \
        confusion_matrix.get_confusion_matrix(), str(end - start)
def train(epoch):
    """Train the coarse (ECC) and fine (point-level) modules for one epoch.

    After ``args.fine_seg_epo`` epochs, superpoints predicted as class 5
    ("mixed" nodes) are excluded from the coarse loss and their raw points
    are trained through the fine-grained module (model.FineModule) instead.

    :param epoch: current epoch index (drives the fine-segmentation switch)
    :return: (coarse_accuracy, fine_accuracy, coarse_avg_loss, fine_avg_loss,
              overall_accuracy, average_IoU)

    Relies on module-level globals: model, args, train_dataset, optimizer,
    ptnCloudEmbedder, filter_valid, meter_value.
    """
    model.train()
    loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size,
                                         collate_fn=spg.eccpc_collate,
                                         num_workers=args.nworkers, shuffle=True,
                                         drop_last=True)
    if logging.getLogger().getEffectiveLevel() > logging.DEBUG:
        loader = tqdm(loader, ncols=100)

    loss_meter = tnt.meter.AverageValueMeter()   # Coarse-grained Module loss
    loss2_meter = tnt.meter.AverageValueMeter()  # Fine-grained Module loss
    acc_meter = tnt.meter.ClassErrorMeter(accuracy=True)
    acc_meter2 = tnt.meter.ClassErrorMeter(accuracy=True)
    confusion_matrix = metrics.ConfusionMatrix(5)
    t0 = time.time()

    # iterate over dataset in batches
    for bidx, (targets, GIs, clouds_data) in enumerate(loader):
        t_loader = 1000 * (time.time() - t0)
        model.ecc.set_info(GIs, args.cuda)  # feed edge information to the ECC module
        # targets columns: 0 = majority label, 1 = superpoint size, 2.. = per-class histogram
        label_mode_cpu, label_vec_cpu, segm_size_cpu = targets[:, 0], targets[:, 2:], targets[:, 1:].sum(1)
        if args.cuda:
            label_mode, label_vec, segm_size = label_mode_cpu.cuda(), label_vec_cpu.float().cuda(), segm_size_cpu.float().cuda()
        else:
            label_mode, label_vec, segm_size = label_mode_cpu, label_vec_cpu.float(), segm_size_cpu.float()

        optimizer.zero_grad()
        t0 = time.time()
        embeddings = ptnCloudEmbedder.run(model, *clouds_data[0:4])
        outputs = model.ecc(embeddings)
        loss = nn.functional.cross_entropy(outputs, Variable(label_mode))

        # Node determination: class 5 marks a "mixed" superpoint
        mixed_node = np.where(np.argmax(np.array(outputs.data), axis=1) == 5)
        Nomixed_node = np.where(np.argmax(np.array(outputs.data), axis=1) != 5)

        if epoch > args.fine_seg_epo and len(mixed_node[0]) > 0:
            # restrict the coarse loss to non-mixed superpoints
            loss = nn.functional.cross_entropy(outputs[Nomixed_node[0], :],
                                               Variable(label_mode[[Nomixed_node[0]]]))
            # (simplified: the original built Sum_slice as an np.hstack copy of mixed_node[0])
            for i in range(len(mixed_node[0])):
                fname = args.TANKER_PATH + '/parsed/' + clouds_data[0][mixed_node[0][i]].split('.')[0] + '.h5'
                fname_G = clouds_data[0][mixed_node[0][i]].split('.')[1]
                # BUG FIX: h5py handle was never closed (leaked one descriptor per mixed node)
                with h5py.File(fname, 'r') as hf:
                    P = hf[fname_G]
                    # only refine superpoints with enough raw points
                    if np.shape(P)[0] > args.fine_seg_point_num:
                        tempt1 = np.array(P[:, :13]).T  # per-point features -> (13, n_points)
                        tempt = torch.from_numpy(tempt1.reshape((1, np.shape(tempt1)[0], np.shape(tempt1)[1])))
                        label_modebran = torch.from_numpy(P[:, -1]).long()  # last column: per-point label
                        if args.cuda:
                            tempt = tempt.cuda()
                            label_modebran = label_modebran.cuda()
                        outputsbran = model.FineModule(Variable(tempt.float(),
                                                                requires_grad=model.training,
                                                                volatile=not model.training))
                        loss2 = nn.functional.cross_entropy(outputsbran, Variable(label_modebran))
                        _outputsbran, _label_modebran = filter_valid(outputsbran.data.cpu().numpy(),
                                                                     label_modebran.cpu().numpy())
                        loss2.backward()
                        loss2_meter.add(loss2.data[0])  # pytorch 0.3 style (file uses volatile Variables)
                        acc_meter2.add(_outputsbran, _label_modebran)
                        confusion_matrix.count_predicted_batch_branch(_outputsbran, _label_modebran)

        loss.backward()
        ptnCloudEmbedder.bw_hook()
        if args.grad_clip > 0:
            for p in model.parameters():
                if p.grad is not None:
                    p.grad.data.clamp_(-args.grad_clip, args.grad_clip)
        optimizer.step()

        t_trainer = 1000 * (time.time() - t0)
        loss_meter.add(loss.data[0])  # pytorch 0.3 style
        o_cpu, t_cpu, tvec_cpu = filter_valid(outputs.data.cpu().numpy(),
                                              label_mode_cpu.numpy(), label_vec_cpu.numpy())
        # BUG FIX: was `epoch>args.fine_seg_epo & len(mixed_node[0])>0`; `&` binds
        # tighter than `>`, so it parsed as `epoch > (args.fine_seg_epo & len(...)) > 0`
        # instead of the intended conjunction (cf. the `and` form used above).
        if epoch > args.fine_seg_epo and len(mixed_node[0]) > 0:
            acc_meter.add(o_cpu[Nomixed_node[0], :], t_cpu[[Nomixed_node[0]]])
        else:
            acc_meter.add(o_cpu, t_cpu)
        confusion_matrix.count_predicted_batch(tvec_cpu[[Nomixed_node[0]]],
                                               np.argmax(o_cpu[Nomixed_node[0], :], 1))
        logging.debug('Batch loss %f, Loader time %f ms, Trainer time %f ms.',
                      loss.data[0], t_loader, t_trainer)
        t0 = time.time()

    return acc_meter.value()[0], meter_value(acc_meter2), loss_meter.value()[0], \
        loss2_meter.value()[0], confusion_matrix.get_overall_accuracy(), \
        confusion_matrix.get_average_intersection_union()
def train(config, model_dir, writer):
    """Train and evaluate a classification model on the ModelNet40 dataset.

    The training parameters are specified in the config file (for more
    details see config/config.py).

    :param config: Dictionary with configuration parameters
    :param model_dir: Checkpoint save directory
    :param writer: Tensorboard SummaryWriter object
    """
    phases = ['train', 'test']
    datasets, dataloaders = ds.get_modelnet40_dataloaders(
        root_dir=args.root_dir,
        phases=phases,
        batch_size=config['batch_size'],
        augment=config['augment'])
    # add number of classes to config (ModelNet40 has 40 categories)
    config['num_classes'] = 40
    # we now set GPU training parameters
    # if the given index is not available then we use index 0
    # also when using multi gpu we should specify index 0
    if config['gpu_index'] + 1 > torch.cuda.device_count(
    ) or config['multi_gpu']:
        config['gpu_index'] = 0

    logging.info('Using GPU cuda:{}, script PID {}'.format(
        config['gpu_index'], os.getpid()))
    if config['multi_gpu']:
        logging.info('Training on multi-GPU mode with {} devices'.format(
            torch.cuda.device_count()))
    device = torch.device('cuda:{}'.format(config['gpu_index']))

    # we load the model defined in the config file
    model = res.resnet101(in_channels=config['in_channels'],
                          num_classes=config['num_classes'],
                          kernel_size=config['kernel_size']).to(device)
    # if use multi_gpu then convert the model to DataParallel
    if config['multi_gpu']:
        model = nn.DataParallel(model)

    # create optimizer, loss function, and lr scheduler
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=config['lr'],
                                 weight_decay=1e-4)
    criterion = nn.CrossEntropyLoss()
    # plateau scheduler: decays lr when the test loss stops improving
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=config['lr_decay'],
        patience=config['lr_patience'],
        verbose=True)

    logging.info('Config {}'.format(config))
    logging.info(
        'TB logs and checkpoint will be saved in {}'.format(model_dir))
    utl.dump_config_details_to_tensorboard(writer, config)

    # create metric trackers: we track loss, class accuracy, and overall
    # accuracy (one tracker set per phase; 'iou' is unused here)
    trackers = {
        x: {
            'loss': metrics.LossMean(),
            'acc': metrics.Accuracy(),
            'iou': None,
            'cm': metrics.ConfusionMatrix(num_classes=int(config['num_classes']))
        }
        for x in phases
    }

    # create initial best state object (test_acc=0 guarantees the first
    # epoch with any accuracy becomes the new best)
    best_state = {
        'config': config,
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'scheduler': scheduler.state_dict() if scheduler else None,
        'train_loss': float('inf'),
        'test_loss': float('inf'),
        'train_acc': 0.0,
        'test_acc': 0.0,
        'train_class_acc': 0.0,
        'test_class_acc': 0.0,
        'convergence_epoch': 0,
        'num_epochs_since_best_acc': 0
    }

    # now we train!
    for epoch in range(config['max_epochs']):
        for phase in phases:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            # reset metrics at the start of each epoch/phase
            trackers[phase]['loss'].reset()
            trackers[phase]['cm'].reset()

            for step_number, (data, label) in enumerate(
                    tqdm(dataloaders[phase],
                         desc='[{}/{}] {} '.format(epoch + 1,
                                                   config['max_epochs'],
                                                   phase))):
                # permute to channels-first (N, C, P) layout expected by the model
                data = data.to(device, dtype=torch.float).permute(0, 2, 1)
                # NOTE(review): .squeeze() collapses ALL singleton dims — with a
                # last batch of size 1 this yields a 0-d label tensor; confirm
                # drop_last/batch sizes make that impossible here.
                label = label.to(device, dtype=torch.long).squeeze()

                # compute gradients on train only
                with torch.set_grad_enabled(phase == 'train'):
                    out = model(data)
                    loss = criterion(out, label)
                    if phase == 'train':
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                # now we update metrics
                trackers[phase]['loss'].update(average_loss=loss,
                                               batch_size=data.size(0))
                trackers[phase]['cm'].update(y_true=label, y_logits=out)
                # logging.info('Computing accuracy...')  # compare with my metrics

            # per-phase epoch metrics from the accumulated trackers
            epoch_loss = trackers[phase]['loss'].result()
            epoch_overall_acc = trackers[phase]['cm'].result(metric='accuracy')
            epoch_class_acc = trackers[phase]['cm'].result(
                metric='class_accuracy').mean()

            # we update our learning rate scheduler if loss does not improve
            if phase == 'test' and scheduler:
                scheduler.step(epoch_loss)
                writer.add_scalar('params/lr', optimizer.param_groups[0]['lr'],
                                  epoch + 1)

            # log current results and dump in Tensorboard
            logging.info(
                '[{}/{}] {} Loss: {:.2e}. Overall Acc: {:.4f}. Class Acc {:.4f}'
                .format(epoch + 1, config['max_epochs'], phase, epoch_loss,
                        epoch_overall_acc, epoch_class_acc))
            writer.add_scalar('loss/epoch_{}'.format(phase), epoch_loss,
                              epoch + 1)
            writer.add_scalar('acc_class/epoch_{}'.format(phase),
                              epoch_class_acc, epoch + 1)
            writer.add_scalar('acc_all/epoch_{}'.format(phase),
                              epoch_overall_acc, epoch + 1)

        # after each epoch we update best state values as needed
        # first we save our state when we get better test accuracy
        # (ties count as an improvement and reset the patience counter)
        if best_state['test_acc'] > trackers['test']['cm'].result(
                metric='accuracy'):
            best_state['num_epochs_since_best_acc'] += 1
        else:
            logging.info('Got a new best model with accuracy {:.4f}'.format(
                trackers['test']['cm'].result(metric='accuracy')))
            best_state = {
                'config': config,
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict() if scheduler else None,
                'train_loss': trackers['train']['loss'].result(),
                'test_loss': trackers['test']['loss'].result(),
                'train_acc': trackers['train']['cm'].result(metric='accuracy'),
                'test_acc': trackers['test']['cm'].result(metric='accuracy'),
                'train_class_acc':
                trackers['train']['cm'].result(metric='class_accuracy').mean(),
                'test_class_acc':
                trackers['test']['cm'].result(metric='class_accuracy').mean(),
                'convergence_epoch': epoch + 1,
                'num_epochs_since_best_acc': 0
            }

            file_name = os.path.join(model_dir, 'best_state.pth')
            torch.save(best_state, file_name)
            logging.info('saved checkpoint in {}'.format(file_name))

        # we check for early stopping when we have trained a min number of epochs
        if epoch >= config['min_epochs'] and best_state[
                'num_epochs_since_best_acc'] >= config['early_stopping']:
            logging.info('Accuracy did not improve for {} iterations!'.format(
                config['early_stopping']))
            logging.info('[Early stopping]')
            break

    utl.dump_best_model_metrics_to_tensorboard(writer, phases, best_state)

    logging.info('************************** DONE **************************')
    def validate(self, epoch):
        """Evaluate the model on the validation set.

        Duplicates each image M times to Monte-Carlo-average the glimpse
        policy, accumulates a hybrid loss (action NLL + baseline MSE +
        REINFORCE term), and returns
        (mean loss, mean accuracy, confusion matrix, AUC tracker).
        """
        losses = AverageMeter()
        accs = AverageMeter()
        cmat = metrics.ConfusionMatrix(self.num_classes)
        auc = metrics.AUC(self.num_classes)

        for i, (x, y) in enumerate(self.valid_loader):
            y = y.squeeze()
            if self.use_gpu:
                x, y = x.cuda(), y.cuda()
            x, y = Variable(x), Variable(y)

            # duplicate M times (Monte Carlo sampling of glimpse trajectories)
            x = x.repeat(self.M, 1, 1, 1)

            # initialize location vector and hidden state
            self.batch_size = x.shape[0]
            h_t, l_t = self.reset()

            # extract the glimpses
            log_pi = []
            baselines = []
            for t in range(self.num_glimpses - 1):
                # forward pass through model
                h_t, l_t, b_t, p = self.model(x, l_t, h_t)

                # store baseline value and log-prob of the chosen location
                baselines.append(b_t)
                log_pi.append(p)

            # last iteration: also returns class log-probabilities
            h_t, l_t, b_t, log_probas, p = self.model(x, l_t, h_t, last=True)
            log_pi.append(p)
            baselines.append(b_t)

            # convert list to tensors and reshape
            # (stacking over glimpses then swapping dims 0 and 2)
            baselines = torch.stack(baselines).transpose(0, 2)
            log_pi = torch.stack(log_pi).transpose(0, 2)

            # average over the M duplicates
            log_probas = log_probas.view(self.M, -1, log_probas.shape[-1])
            log_probas = torch.mean(log_probas, dim=0)

            baselines = baselines.contiguous().view(self.M, -1,
                                                    self.num_stacks,
                                                    baselines.shape[-1])
            baselines = torch.mean(baselines, dim=0)

            log_pi = log_pi.contiguous().view(self.M, -1, self.num_stacks,
                                              log_pi.shape[-1])
            log_pi = torch.mean(log_pi, dim=0)

            # calculate reward: 1 for a correct prediction, 0 otherwise,
            # broadcast over stacks and glimpses
            predicted = torch.max(log_probas, 1)[1]
            R = (predicted.detach() == y).float()
            R = R.view(R.size(0), 1, 1).repeat(1, self.num_stacks,
                                               self.num_glimpses)

            # compute losses for differentiable modules
            loss_action = F.nll_loss(log_probas, y)
            loss_baseline = F.mse_loss(baselines, R)

            # compute reinforce loss (advantage = reward - detached baseline)
            adjusted_reward = R - baselines.detach()
            loss_reinforce = torch.mean(-log_pi * adjusted_reward)

            # sum up into a hybrid loss
            loss = loss_action + loss_baseline + loss_reinforce

            # compute accuracy
            correct = (predicted == y).float()
            acc = 100 * (correct.sum() / len(y))

            # store batch-weighted running averages
            # (.data[0] indexing: this code targets PyTorch <= 0.3)
            losses.update(loss.data[0], x.size()[0])
            accs.update(acc.data[0], x.size()[0])
            cmat.add(predicted, y)
            auc.add(y, log_probas.exp())

            # log to tensorboard
            if self.use_tensorboard:
                iteration = epoch * len(self.valid_loader) + i
                log_value('valid_loss', losses.avg, iteration)
                log_value('valid_acc', accs.avg, iteration)

        return losses.avg, accs.avg, cmat, auc
def inference(ind, model, loader, args):
    """Produce inference data for one data loader.

    ind    -- loop position of this loader (selects train/val/test)
    model  -- network used for prediction
    loader -- data loader of the current split
    args   -- parsed command-line arguments (see parser)

    Returns 0 on completion.
    """
    split_names = ["train", "val", "test"]  # must match the dataloader order
    out_dirs = ["", "", [], []]  # [inference, difference, per-class proba, probaH5]

    # When a train/val/test split is used, create one sub-folder per split;
    # otherwise write straight into the flat output folders.
    if args.inf:
        out_dirs[0] = (ost.createDir(args.out + "/inf/" + split_names[ind])
                       if args.train_set else args.out + "/inf")
    if args.dif:
        out_dirs[1] = (ost.createDir(args.out + "/dif/" + split_names[ind])
                       if args.train_set else args.out + "/dif")
    if args.proba:
        per_class_dirs = [ost.createDir(args.out + "/proba/" + c) for c in args.c]
        out_dirs[2] = ([ost.createDir(p + "/" + split_names[ind]) for p in per_class_dirs]
                       if args.train_set else per_class_dirs)
    if args.probaH5:
        out_dirs[3] = (ost.createDir(args.out + "/probaH5/" + split_names[ind])
                       if args.train_set else args.out + "/probaH5")

    # Build the color table {class_value: (4-tuple)} when one was provided.
    if args.color is not False:
        table = np.loadtxt(args.color, delimiter=",", dtype=int)
        colors = {int(row[0]): tuple(row[1:5]) for row in table}
    else:
        colors = None

    # Metric container for the whole split.
    if args.metric:
        cm = m.ConfusionMatrix(len(args.c), args.c, args.nodata)

    # Walk the batches produced by the data reader, unfolding each tuple.
    for imgs, gt, names in tqdm(loader):
        # Move the batch to the GPU when requested.
        batch_tensor = imgs.cuda() if args.cuda else imgs

        # Generate the prediction and bring it back to the CPU.
        prediction = model(batch_tensor).cpu()

        # One work item per tile of the batch for the multiprocess loop;
        # ground truth is omitted when running without labels.
        mp_args = [(prediction[i].detach(),
                    None if args.noGT else gt[i],
                    names[i], out_dirs, args, colors)
                   for i in range(prediction.shape[0])]

        # Multi-process loop creating the inference products of this batch.
        with multiprocessing.Pool() as pool:
            pool.map(multiprocessing_func, mp_args)

        # Fold the batch into the confusion matrix.
        if args.metric:
            for i in range(prediction.size()[0]):
                pred = prediction[i].argmax(0).squeeze()
                cm.add_batch(gt[i].numpy(), pred.numpy())

        # Release memory eagerly between batches.
        del imgs, gt, batch_tensor, prediction

    # Dump the metrics of this dataset to the metric output folder.
    if args.metric:
        perf_dir = (ost.createDir(args.out + "/" + split_names[ind] + "_perf_inf")
                    if args.train_set
                    else ost.createDir(args.out + "/perf_inf"))
        cm.printPerf(perf_dir)
    return 0
def train(epoch):
    """Train for one epoch with weak labels plus label extension.

    Besides the standard cross-entropy on weakly labeled superpoints, this
    periodically (every ``args.ext_epoch_gap`` epochs) extends pseudo-labels
    to unlabeled superpoints, drops extension points far from per-class
    cluster centers, trains coupled attention heads on labeled vs. extended
    features, and persists the retained extensions to per-cloud text files.

    Relies on module-level globals: ``model``, ``train_dataset``, ``args``,
    ``optimizer``, ``ptnCloudEmbedder``, ``dbinfo``, ``filter_valid``,
    ``extension_accum2``, ``get_lr``, ``AverageMeter``.

    Returns a tuple of accuracy/oAcc/mAcc/mIoU/loss/epoch time plus the
    extension confusion-matrix metrics for this epoch.
    """
    args.ext_epoch = epoch
    model.train()
    loader = torch.utils.data.DataLoader(train_dataset,
                                         batch_size=args.batch_size,
                                         collate_fn=spg.eccpc_collate,
                                         num_workers=args.nworkers,
                                         shuffle=True,
                                         drop_last=True)
    loss_meter = tnt.meter.AverageValueMeter()
    loss_ext_meter = tnt.meter.AverageValueMeter()
    loss_att_meter = tnt.meter.AverageValueMeter()
    acc_meter = tnt.meter.ClassErrorMeter(accuracy=True)
    # three confusion matrices: labeled points, retained extensions, and
    # extensions produced in the current epoch
    confusion_matrix = metrics.ConfusionMatrix(
        dbinfo['classes'], ignore_label=args.metric_ignore_class)
    confusion_matrix_ext = metrics.ConfusionMatrix(
        dbinfo['classes'], ignore_label=args.metric_ignore_class)
    confusion_matrix_ext_epoch = metrics.ConfusionMatrix(
        dbinfo['classes'], ignore_label=args.metric_ignore_class)
    t0 = time.time()
    epoch_time = time.time()
    batch_time = AverageMeter()
    end = time.time()

    # iterate over dataset in batches
    for bidx, (targets, GIs, clouds_data, clouds_orig, edges_for_ext, fnames,
               ext_data, num_sp_list) in enumerate(loader):
        print('fnames: {}'.format(fnames))
        t_loader = time.time() - t0
        model.ecc.set_info(GIs, args.cuda)
        # column 0: weak label, column 1: full label, columns 2+: label histogram
        weak_label_mode_cpu, label_mode_cpu, label_vec_cpu, segm_size_cpu = \
            targets[:, 0], targets[:, 1], targets[:, 2:], targets[:, 2:].sum(1)
        ext_mask, extension_sub_list, extension_full_list = ext_data
        if args.cuda:
            label_mode, label_vec, segm_size, weak_label_mode = \
                label_mode_cpu.cuda(), label_vec_cpu.float().cuda(), \
                segm_size_cpu.float().cuda(), weak_label_mode_cpu.cuda()
        else:
            label_mode, label_vec, segm_size, weak_label_mode = \
                label_mode_cpu, label_vec_cpu.float(), \
                segm_size_cpu.float(), weak_label_mode_cpu

        optimizer.zero_grad()
        t0 = time.time()
        print('num_weak_label/num_sp_all: {}/{}'.format(
            torch.sum(weak_label_mode < args.n_labels + 1),
            weak_label_mode.shape[0]))
        embeddings = ptnCloudEmbedder.run(model, *clouds_data)
        outputs, rnn_fea = model.ecc(embeddings)
        o_cpu, t_cpu, tvec_cpu = filter_valid(outputs.data.cpu().numpy(),
                                              label_mode_cpu.numpy(),
                                              label_vec_cpu.numpy())

        # ---- extension: fill previously-extended positions with current predictions
        input = clouds_orig.cuda()
        edges_for_ext = edges_for_ext.cuda()
        ext_weak_label_cuda = torch.argmax(outputs, dim=1, keepdim=False)
        # NOTE(review): this is an alias, not a copy — the in-place write below
        # also mutates weak_label_mode, which is reused for mask1 afterwards.
        # Confirm this aliasing is intentional.
        weak_label_cat = weak_label_mode
        weak_label_cat[ext_mask > 0] = ext_weak_label_cuda[ext_mask > 0]
        if (epoch > 0) and (epoch % args.ext_epoch_gap == 0):
            # propagate labels along edges; returns labeled/extended splits
            output1, weak_label1, output2, weak_label2, extend_idx, _ = \
                extension_accum2(input, outputs, embeddings, weak_label_cat,
                                 edges_for_ext, th=args.extension_th,
                                 ext_max=args.single_ext_max)
            print('{}/{} ~ {:.2f} points with labels.'.format(
                outputs.shape[0], output1.shape[0],
                output1.shape[0] / outputs.shape[0] * 100))
            print('extension points: {}'.format(extend_idx.shape))
        else:
            extend_idx = torch.Tensor([])
            output2 = torch.Tensor([])
            weak_label2 = torch.Tensor([])

        # ---- loss on weakly labeled superpoints
        mask1 = weak_label_mode != args.ignore_label
        outputs_valid1 = outputs[mask1, :]
        # NOTE(review): weak_label1 from extension_accum2 is overwritten here;
        # only weak_label2/extend_idx from that call are used later.
        weak_label1 = weak_label_mode[mask1]
        loss1_cro = nn.functional.cross_entropy(
            outputs_valid1, weak_label1, ignore_index=args.ignore_label)

        # ###### extension dropout
        # labeled points: outputs_valid1, weak_label1
        labeled_idx = torch.nonzero(mask1).squeeze().long()  # Nsp_label
        # previous extension points (from earlier epochs, via ext_mask):
        if torch.sum(ext_mask > 0) > 1:
            pre_ext_outputs = outputs[ext_mask > 0, :]  # Nsp_pre_ext
            pre_ext_fea = rnn_fea[ext_mask > 0, :]
            pre_ext_pseudo_label = torch.argmax(pre_ext_outputs,
                                                dim=1,
                                                keepdim=False)
            pre_ext_idx = torch.nonzero(ext_mask > 0).squeeze().long()
        # current extension points (produced this epoch):
        if extend_idx.shape[0] > 1:
            cur_ext_outputs = output2
            cur_ext_pred_label = weak_label2
            cur_ext_idx = extend_idx
            cur_ext_fea = rnn_fea[extend_idx, :]
        # extension concat: merge previous and current extension sets
        if (torch.sum(ext_mask > 0) > 1) & (extend_idx.shape[0] > 1):
            ext_outputs_cat = torch.cat((pre_ext_outputs, cur_ext_outputs), 0)
            ext_label_cat = torch.cat((pre_ext_pseudo_label.unsqueeze(-1),
                                       cur_ext_pred_label.unsqueeze(-1)),
                                      0).squeeze(-1)
            ext_idx_cat = torch.cat(
                (pre_ext_idx.unsqueeze(-1), cur_ext_idx.unsqueeze(-1)),
                0).squeeze(-1)
            ext_fea_cat = torch.cat((pre_ext_fea, cur_ext_fea), 0)
        elif extend_idx.shape[0] > 1:
            # only current extension points
            ext_outputs_cat = cur_ext_outputs
            ext_label_cat = cur_ext_pred_label
            ext_idx_cat = cur_ext_idx
            ext_fea_cat = cur_ext_fea
        elif torch.sum(ext_mask > 0) > 1:
            # only previous extension points
            ext_outputs_cat = pre_ext_outputs
            ext_label_cat = pre_ext_pseudo_label
            ext_idx_cat = pre_ext_idx
            ext_fea_cat = pre_ext_fea
        else:
            ext_outputs_cat = None
            ext_label_cat = None
            ext_idx_cat = None
            ext_fea_cat = None

        # compute the cluster center and the distances and then dropout with ratio:
        # per class, keep only the args.ext_drop fraction of extension points
        # closest to the labeled+extended cluster center
        if (ext_idx_cat is not None) and (ext_idx_cat.shape[0] > 20):
            ext_idxs_sample = [
            ]  # sampled id of each extension point within all extension points
            unique_classes = torch.unique(weak_label1)
            ext_idx_idx = torch.Tensor(
                list(range(ext_idx_cat.shape[0]))
            ).cuda()  # id of each extension point within all extension points
            for i in range(unique_classes.shape[0]):
                sp = unique_classes[i]
                fea_label = outputs_valid1[weak_label1 ==
                                           sp]  # Nsp_label_class*13
                fea_ext = ext_outputs_cat[ext_label_cat ==
                                          sp]  # Nsp_ext_class*13
                ext_idxs = ext_idx_idx[ext_label_cat == sp]  # Nsp_ext_class
                if (fea_ext.shape[0] > 5) & (fea_label.shape[0] > 0):
                    num_retain = math.floor(fea_ext.shape[0] * args.ext_drop)
                    # labeled points weigh 1.0, extension points 0.5 in the center
                    cluster_center = torch.sum(
                        fea_label, dim=0, keepdim=True) + 0.5 * torch.sum(
                            fea_ext, dim=0, keepdim=True)
                    cluster_center = cluster_center / (
                        fea_label.shape[0] + fea_ext.shape[0])  # 1*13
                    dis = fea_ext - cluster_center  # Nsp_ext*13
                    dis = torch.norm(dis, dim=1)  # Nsp
                    _, idxs = torch.sort(dis, dim=0, descending=False)
                    ext_idxs_sample.append(
                        ext_idxs[idxs[:num_retain]].unsqueeze(-1))
                elif len(ext_idxs) > 0:
                    # too few points of this class to rank: keep them all
                    ext_idxs_sample.append(ext_idxs.unsqueeze(-1))
            ext_idxs_sample = torch.cat(ext_idxs_sample, 0).squeeze(-1).long()
            ext_idxs_retain = ext_idx_cat[ext_idxs_sample]
            ext_output_retain = ext_outputs_cat[ext_idxs_sample, :]
            ext_fea_retain = ext_fea_cat[ext_idxs_sample, :]
            ext_label_retain = ext_label_cat[ext_idxs_sample]
        else:
            ext_idxs_retain = ext_idx_cat
            ext_output_retain = ext_outputs_cat
            ext_fea_retain = ext_fea_cat
            ext_label_retain = ext_label_cat

        if (ext_idxs_retain is not None) and (ext_idxs_retain.shape[0] > 2):
            lab_fea = rnn_fea[
                labeled_idx, :]  # M*352, including the previous extension points
            lab_lab = weak_label1
            # subsample labeled features for the attention loss if too many
            if lab_fea.shape[0] > args.max_labeled_att:
                ii = random.sample(range(lab_fea.shape[0]),
                                   k=args.max_labeled_att)
                lab_fea = lab_fea[ii, :]
                lab_lab = lab_lab[ii]
                lab_idxs_sample = ii
            # subsample extension points for the attention loss if too many
            if ext_idxs_retain.shape[0] > args.max_ext_att_loss:
                ii = random.sample(range(ext_idxs_retain.shape[0]),
                                   k=args.max_ext_att_loss)
                # ext_idxs_retain_sample = ext_idxs_retain[ii]
                ext_idxs_retain_sample = ii
            else:
                # ext_idxs_retain_sample = ext_idxs_retain
                ext_idxs_retain_sample = list(
                    range(ext_idxs_retain.shape[0]))
            # NOTE(review): ext_idxs_retain_sample holds positions WITHIN the
            # retained set, but here it indexes rnn_fea directly (superpoint
            # space) — confirm this indexing is intended.
            ext_fea = rnn_fea[ext_idxs_retain_sample, :]  # N*352
            # coupled attention: each set attends over the other
            outputs_att_lab = model.att_lab(lab_fea, ext_fea)
            outputs_att_ext = model.att_ext(ext_fea, lab_fea)
            loss_att_lab_cro = nn.functional.cross_entropy(
                outputs_att_lab, lab_lab)
            loss_att_meter.add(loss_att_lab_cro.item())
            loss_att_ext_cro = nn.functional.cross_entropy(
                outputs_att_ext, ext_label_retain[ext_idxs_retain_sample])
            loss_ext_meter.add(loss_att_ext_cro.item())
            # total loss: weighted weak-label CE + weighted attention losses
            loss = args.loss_w1 * loss1_cro + args.loss_w2 * (
                loss_att_lab_cro + loss_att_ext_cro)

            confusion_matrix_ext.count_predicted_batch(
                tvec_cpu[ext_idxs_retain.data.cpu().numpy(), :],
                np.argmax(outputs[ext_idxs_retain, :].data.cpu().numpy(), 1))
            print('{} point extend. acc: {:3f}, macc: {:3f}'.format(
                ext_idxs_retain.shape[0],
                confusion_matrix_ext.get_overall_accuracy(),
                confusion_matrix_ext.get_mean_class_accuracy()))
            if extend_idx.shape[0] > 1:
                confusion_matrix_ext_epoch.count_predicted_batch(
                    tvec_cpu[extend_idx.data.cpu().numpy(), :],
                    np.argmax(outputs[extend_idx, :].data.cpu().numpy(), 1))
                print(
                    '{} point extend current epoch. acc: {:3f}, macc: {:3f}'
                    .format(
                        extend_idx.shape[0],
                        confusion_matrix_ext_epoch.get_overall_accuracy(),
                        confusion_matrix_ext_epoch.get_mean_class_accuracy(
                        )))

            # update the extension: write retained extensions back to the
            # per-cloud extension files for the current extension round
            extend_idx = ext_idxs_retain.data.cpu().numpy().astype(
                np.int32)
            weak_label2 = ext_label_retain.data.cpu().numpy().astype(
                np.int32)
            # superpoints of all clouds are concatenated; cumsum gives the
            # per-cloud index ranges in the batch
            num_sp_array = np.cumsum(np.array(num_sp_list))
            for b in range(num_sp_array.shape[0]):
                if b == 0:
                    sp_start = 0
                else:
                    sp_start = num_sp_array[b - 1]
                sp_end = num_sp_array[b]
                mask = (extend_idx >= sp_start) & (extend_idx < sp_end)
                if np.sum(mask) > 0:
                    # shift batch indices back to per-cloud indices
                    extend_idx_batch = extend_idx[mask] - sp_start
                    extend_label_batch = weak_label2[mask]
                    extension_sub_batch = extension_sub_list[b].astype(
                        np.int32)  # Nsp*2
                    extension_full_batch = extension_full_list[b].astype(
                        np.int32)  # N*2
                    extension_full_batch[extension_sub_batch[
                        extend_idx_batch, 0]] = extend_label_batch
                    current_fname = fnames[b]
                    np.savetxt(os.path.join(
                        args.extension_dir, 'epoch_{:d}'.format(
                            int(epoch // args.ext_epoch_gap)),
                        '{}.txt'.format(current_fname)),
                               extension_full_batch,
                               fmt='%d')
        else:
            # not enough extension points: fall back to the weak-label loss only
            loss = loss1_cro

        loss.backward()
        ptnCloudEmbedder.bw_hook()
        if args.grad_clip > 0:
            for p in model.parameters():
                if p.grad is not None:
                    p.grad.data.clamp_(-args.grad_clip, args.grad_clip)
        optimizer.step()
        t_trainer = time.time() - t0
        # loss_meter.add(loss.data[0]) # pytorch 0.3
        loss_meter.add(loss.item())  # pytorch 0.4
        acc_meter.add(o_cpu, t_cpu)
        confusion_matrix.count_predicted_batch(tvec_cpu, np.argmax(o_cpu, 1))
        batch_time.update(time.time() - end)
        end = time.time()
        print(
            'Batch {}/{} - loss {:.3f}/{:.3f}, acc {:.3f}, lr {:.3f}, Loader time {:.3f}, Trainer time {:.3f}, Batch time {:.3f}/{:.3f}.'
            .format(bidx + 1, len(loader), loss.item(),
                    loss_meter.value()[0],
                    confusion_matrix.get_overall_accuracy(),
                    get_lr(optimizer), t_loader, t_trainer, batch_time.val,
                    batch_time.avg))
        t0 = time.time()

    # at the end of an extension round, seed the next round's folder with a
    # copy of this round's extension files
    if args.ext_epoch % args.ext_epoch_gap == (
            args.ext_epoch_gap - 1):  # a new extension folder need to be built
        shutil.copytree(
            os.path.join(
                args.extension_dir, 'epoch_{}'.format(
                    int(args.ext_epoch // args.ext_epoch_gap))),
            os.path.join(
                args.extension_dir, 'epoch_{}'.format(
                    int(args.ext_epoch // args.ext_epoch_gap) + 1)))

    return acc_meter.value()[0], confusion_matrix.get_overall_accuracy(), confusion_matrix.get_mean_class_accuracy(), confusion_matrix.get_average_intersection_union(), loss_meter.value()[0], time.time()-epoch_time, \
        confusion_matrix_ext_epoch.get_overall_accuracy(), confusion_matrix_ext_epoch.get_mean_class_accuracy(), confusion_matrix_ext_epoch.get_average_intersection_union()