def execute(gpu, exp_batch, exp_alias, dataset_name, suppress_output):
    latest = None
    try:
        # We set the visible cuda devices
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu

        # At this point the log file with the correct naming is created.
        merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
        # The validation dataset is always fully loaded, so we fix a very high number of hours.
        g_conf.NUMBER_OF_HOURS = 10000
        set_type_of_process('validation', dataset_name)

        if not os.path.exists('_output_logs'):
            os.mkdir('_output_logs')

        if suppress_output:
            sys.stdout = open(os.path.join(
                '_output_logs', exp_alias + '_' + g_conf.PROCESS_NAME + '_'
                + str(os.getpid()) + ".out"), "a", buffering=1)
            sys.stderr = open(os.path.join(
                '_output_logs', exp_alias + '_err_' + g_conf.PROCESS_NAME + '_'
                + str(os.getpid()) + ".out"), "a", buffering=1)

        # Define the dataset. This structure has __getitem__ redefined so that the
        # HDF5 file positions can be accessed from the root directory as a vector.
        full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], dataset_name)
        augmenter = Augmenter(None)
        # Definition of the dataset to be used. The preload name is just the validation data name.
        dataset = CoILDataset(full_dataset, transform=augmenter,
                              preload_name=dataset_name)

        # The data loader is the multi-threaded module from pytorch that releases
        # a number of workers to fetch all the data.
        data_loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=g_conf.BATCH_SIZE,
            shuffle=False,
            num_workers=g_conf.NUMBER_OF_LOADING_WORKERS,
            pin_memory=True)

        model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION)

        # Set up ERFNet for segmentation.
        model_erf = ERFNet(20)
        model_erf = torch.nn.DataParallel(model_erf)
        model_erf = model_erf.cuda()

        print("LOAD ERFNet - validate")

        def load_my_state_dict(model, state_dict):
            # Custom loader for when not all checkpoint entries match the model.
            own_state = model.state_dict()
            for name, param in state_dict.items():
                if name not in own_state:
                    continue
                own_state[name].copy_(param)
            return model

        model_erf = load_my_state_dict(
            model_erf,
            torch.load(os.path.join('trained_models/erfnet_pretrained.pth')))
        model_erf.eval()
        print("ERFNet and weights LOADED successfully")

        # The window used to keep track of the validation loss.
        l1_window = []
        latest = get_latest_evaluated_checkpoint()
        # When a checkpoint was already evaluated, recover the previous loss window.
        if latest is not None:
            l1_window = coil_logger.recover_loss_window(dataset_name, None)

        model.cuda()

        best_mse = 1000
        best_error = 1000
        best_mse_iter = 0
        best_error_iter = 0

        while not maximun_checkpoint_reach(latest, g_conf.TEST_SCHEDULE):
            if is_next_checkpoint_ready(g_conf.TEST_SCHEDULE):

                latest = get_next_checkpoint(g_conf.TEST_SCHEDULE)

                checkpoint = torch.load(
                    os.path.join('_logs', exp_batch, exp_alias, 'checkpoints',
                                 str(latest) + '.pth'))
                checkpoint_iteration = checkpoint['iteration']
                print("Validation loaded ", checkpoint_iteration)

                model.load_state_dict(checkpoint['state_dict'])
                model.eval()

                accumulated_mse = 0
                accumulated_error = 0
                iteration_on_checkpoint = 0
                for data in data_loader:
                    # Compute the forward pass on a batch from the validation dataset.
                    controls = data['directions']

                    # Segment the RGB batch and build a two-channel road / not-road mask.
                    rgbs = data['rgb']
                    with torch.no_grad():
                        outputs = model_erf(rgbs)
                    labels = outputs.max(1)[1].byte().cpu().data

                    seg_road = (labels == 0)
                    seg_not_road = (labels != 0)
                    seg = torch.stack((seg_road, seg_not_road), 1).float()

                    output = model.forward_branch(
                        torch.squeeze(seg).cuda(),
                        dataset.extract_inputs(data).cuda(), controls)
                    # output = model.forward_branch(torch.squeeze(rgbs).cuda(),
                    #                               dataset.extract_inputs(data).cuda(),
                    #                               controls)

                    # The output could be either waypoints or direct control.
                    if 'waypoint1_angle' in g_conf.TARGETS:
                        write_waypoints_output(checkpoint_iteration, output)
                    else:
                        write_regular_output(checkpoint_iteration, output)

                    mse = torch.mean(
                        (output - dataset.extract_targets(data).cuda())**2
                    ).data.tolist()
                    mean_error = torch.mean(
                        torch.abs(output - dataset.extract_targets(data).cuda())
                    ).data.tolist()

                    accumulated_error += mean_error
                    accumulated_mse += mse
                    error = torch.abs(output - dataset.extract_targets(data).cuda())

                    # Log a random position.
                    position = random.randint(0, len(output.data.tolist()) - 1)

                    coil_logger.add_message(
                        'Iterating', {
                            'Checkpoint': latest,
                            'Iteration': (str(iteration_on_checkpoint * 120) + '/'
                                          + str(len(dataset))),
                            'MeanError': mean_error,
                            'MSE': mse,
                            'Output': output[position].data.tolist(),
                            'GroundTruth': dataset.extract_targets(data)[position].data.tolist(),
                            'Error': error[position].data.tolist(),
                            'Inputs': dataset.extract_inputs(data)[position].data.tolist()
                        }, latest)
                    iteration_on_checkpoint += 1
                    print("Iteration %d on Checkpoint %d : Error %f" %
                          (iteration_on_checkpoint, checkpoint_iteration, mean_error))

                """
                    ######## Finish a round of validation, write results, wait for the next ########
                """

                checkpoint_average_mse = accumulated_mse / len(data_loader)
                checkpoint_average_error = accumulated_error / len(data_loader)
                coil_logger.add_scalar('Loss', checkpoint_average_mse, latest, True)
                coil_logger.add_scalar('Error', checkpoint_average_error, latest, True)

                if checkpoint_average_mse < best_mse:
                    best_mse = checkpoint_average_mse
                    best_mse_iter = latest

                if checkpoint_average_error < best_error:
                    best_error = checkpoint_average_error
                    best_error_iter = latest

                coil_logger.add_message(
                    'Iterating', {
                        'Summary': {
                            'Error': checkpoint_average_error,
                            'Loss': checkpoint_average_mse,
                            'BestError': best_error,
                            'BestMSE': best_mse,
                            'BestMSECheckpoint': best_mse_iter,
                            'BestErrorCheckpoint': best_error_iter
                        },
                        'Checkpoint': latest
                    }, latest)

                l1_window.append(checkpoint_average_error)
                coil_logger.write_on_error_csv(dataset_name, checkpoint_average_error)

                # If we finish when validation goes stale, check the current loss window.
                if g_conf.FINISH_ON_VALIDATION_STALE is not None:
                    if dlib.count_steps_without_decrease(l1_window) > 3 and \
                            dlib.count_steps_without_decrease_robust(l1_window) > 3:
                        coil_logger.write_stop(dataset_name, latest)
                        break

            else:
                latest = get_latest_evaluated_checkpoint()
                time.sleep(1)

                coil_logger.add_message('Loading', {'Message': 'Waiting Checkpoint'})
                print("Waiting for the next Validation")

        coil_logger.add_message('Finished', {})

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})
        # We erase the output that was left unfinished when the process stopped.
        if latest is not None:
            coil_logger.erase_csv(latest)

    except RuntimeError as e:
        if latest is not None:
            coil_logger.erase_csv(latest)
        coil_logger.add_message('Error', {'Message': str(e)})

    except:
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something Happened'})
        # We erase the output that was left unfinished when the process stopped.
        if latest is not None:
            coil_logger.erase_csv(latest)
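
# A minimal, self-contained sketch of the road / not-road mask construction used
# in the validation loop above, with dummy tensors standing in for real ERFNet
# outputs. The shapes (batch of 2, 20 classes, 8x8 images) and the helper name
# build_road_mask are illustrative assumptions, not part of the codebase.
import torch

def build_road_mask(seg_logits):
    # seg_logits: (batch, num_classes, H, W); class 0 is assumed to be "road",
    # matching the (labels == 0) test in the loop above.
    labels = seg_logits.max(1)[1]            # argmax over the class dimension
    seg_road = (labels == 0)
    seg_not_road = (labels != 0)
    # Stack into a two-channel float mask of shape (batch, 2, H, W).
    return torch.stack((seg_road, seg_not_road), 1).float()

# Example:
#   dummy_logits = torch.randn(2, 20, 8, 8)
#   mask = build_road_mask(dummy_logits)   # -> shape (2, 2, 8, 8)
#   (mask.sum(1) == 1.0).all()             # each pixel is road xor not-road
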
def execute(gpu, exp_batch, exp_alias, json_file_path, suppress_output,
            encoder_params=None, plot_attentions=False):
    latest = None
    try:
        # We set the visible cuda devices
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu

        if json_file_path is not None:
            json_file_name = json_file_path.split('/')[-1].split('.')[-2]
        else:
            raise RuntimeError("You need to define the validation json file path")

        # At this point the log file with the correct naming is created.
        merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'),
                        encoder_params)

        if plot_attentions:
            set_type_of_process('validation', json_file_name + '_plotAttention')
        else:
            set_type_of_process('validation', json_file_name)

        if not os.path.exists('_output_logs'):
            os.mkdir('_output_logs')

        if suppress_output:
            sys.stdout = open(os.path.join(
                '_output_logs', exp_alias + '_' + g_conf.PROCESS_NAME + '_'
                + str(os.getpid()) + ".out"), "a", buffering=1)
            sys.stderr = open(os.path.join(
                '_output_logs', exp_alias + '_err_' + g_conf.PROCESS_NAME + '_'
                + str(os.getpid()) + ".out"), "a", buffering=1)

        # We create a file for saving the validation results.
        summary_file = os.path.join('_logs', exp_batch, g_conf.EXPERIMENT_NAME,
                                    g_conf.PROCESS_NAME + '_csv',
                                    'valid_summary_1camera.csv')
        g_conf.immutable(False)
        g_conf.DATA_USED = 'central'
        g_conf.immutable(True)
        if not os.path.exists(summary_file):
            csv_outfile = open(summary_file, 'w')
            csv_outfile.write(
                "%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s\n" %
                ('step', 'accumulated_pedestrian_TP', 'accumulated_pedestrian_FP',
                 'accumulated_pedestrian_FN', 'accumulated_pedestrian_TN',
                 'accumulated_vehicle_stop_TP', 'accumulated_vehicle_stop_FP',
                 'accumulated_vehicle_stop_FN', 'accumulated_vehicle_stop_TN',
                 'accumulated_red_tl_TP', 'accumulated_red_tl_FP',
                 'accumulated_red_tl_FN', 'accumulated_red_tl_TN',
                 'MAE_relative_angle'))
            csv_outfile.close()

        latest = get_latest_evaluated_checkpoint_2(summary_file)

        # Define the dataset. This structure has __getitem__ redefined so that the
        # HDF5 file positions can be accessed from the root directory as a vector.
        # full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], dataset_name)
        augmenter = Augmenter(None)
        # Definition of the dataset to be used. The preload name is just the validation data name.
        dataset = CoILDataset(transform=augmenter,
                              preload_name=g_conf.PROCESS_NAME + '_' + g_conf.DATA_USED,
                              process_type='validation',
                              vd_json_file_path=json_file_path)
        print("Loaded Validation dataset")

        # The data loader is the multi-threaded module from pytorch that releases
        # a number of workers to fetch all the data.
        data_loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=g_conf.BATCH_SIZE,
            shuffle=False,
            num_workers=g_conf.NUMBER_OF_LOADING_WORKERS,
            pin_memory=True)

        if g_conf.MODEL_TYPE in ['one-step-affordances']:
            # One-step training: no need to retrain the FC layers, we just take the
            # output of the encoder model as the prediction.
            model = EncoderModel(g_conf.ENCODER_MODEL_TYPE,
                                 g_conf.ENCODER_MODEL_CONFIGURATION)
            model.cuda()

        elif g_conf.MODEL_TYPE in ['separate-affordances']:
            model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION,
                              g_conf.ENCODER_MODEL_CONFIGURATION)
            model.cuda()

            encoder_model = EncoderModel(g_conf.ENCODER_MODEL_TYPE,
                                         g_conf.ENCODER_MODEL_CONFIGURATION)
            encoder_model.cuda()
            encoder_model.eval()

            # Here we load the pre-trained encoder (not fine-tuned).
            if g_conf.FREEZE_ENCODER:
                if encoder_params is not None:
                    encoder_checkpoint = torch.load(
                        os.path.join('_logs', encoder_params['encoder_folder'],
                                     encoder_params['encoder_exp'], 'checkpoints',
                                     str(encoder_params['encoder_checkpoint']) + '.pth'))
                    print("Encoder model ",
                          str(encoder_params['encoder_checkpoint']),
                          "loaded from ",
                          os.path.join('_logs', encoder_params['encoder_folder'],
                                       encoder_params['encoder_exp'], 'checkpoints'))
                    encoder_model.load_state_dict(encoder_checkpoint['state_dict'])
                    encoder_model.eval()
                    for param_ in encoder_model.parameters():
                        param_.requires_grad = False

        while not maximun_checkpoint_reach(latest, g_conf.TEST_SCHEDULE):
            latest = get_next_checkpoint_2(g_conf.TEST_SCHEDULE, summary_file)
            if os.path.exists(os.path.join('_logs', exp_batch, g_conf.EXPERIMENT_NAME,
                                           'checkpoints', str(latest) + '.pth')):
                checkpoint = torch.load(
                    os.path.join('_logs', exp_batch, g_conf.EXPERIMENT_NAME,
                                 'checkpoints', str(latest) + '.pth'))
                checkpoint_iteration = checkpoint['iteration']
                model.load_state_dict(checkpoint['state_dict'])
                print("Validation checkpoint ", checkpoint_iteration)
                model.eval()
                for param_ in model.parameters():
                    param_.requires_grad = False

                # Here we load the fine-tuned encoder.
                if not g_conf.FREEZE_ENCODER and \
                        g_conf.MODEL_TYPE not in ['one-step-affordances']:
                    encoder_checkpoint = torch.load(
                        os.path.join('_logs', exp_batch, g_conf.EXPERIMENT_NAME,
                                     'checkpoints', str(latest) + '_encoder.pth'))
                    print("FINE-TUNED encoder model ",
                          str(latest) + '_encoder.pth',
                          "loaded from ",
                          os.path.join('_logs', exp_batch, g_conf.EXPERIMENT_NAME,
                                       'checkpoints'))
                    encoder_model.load_state_dict(encoder_checkpoint['state_dict'])
                    encoder_model.eval()
                    for param_ in encoder_model.parameters():
                        param_.requires_grad = False

                accumulated_mae_ra = 0
                accumulated_pedestrian_TP = 0
                accumulated_pedestrian_TN = 0
                accumulated_pedestrian_FN = 0
                accumulated_pedestrian_FP = 0

                accumulated_red_tl_TP = 0
                accumulated_red_tl_TN = 0
                accumulated_red_tl_FP = 0
                accumulated_red_tl_FN = 0

                accumulated_vehicle_stop_TP = 0
                accumulated_vehicle_stop_TN = 0
                accumulated_vehicle_stop_FP = 0
                accumulated_vehicle_stop_FN = 0

                iteration_on_checkpoint = 0

                for data in data_loader:
                    if g_conf.MODEL_TYPE in ['one-step-affordances']:
                        c_output, r_output, layers = model.forward_outputs(
                            torch.squeeze(data['rgb'].cuda()),
                            dataset.extract_inputs(data).cuda(),
                            dataset.extract_commands(data).cuda())

                    elif g_conf.MODEL_TYPE in ['separate-affordances']:
                        if g_conf.ENCODER_MODEL_TYPE in ['action_prediction', 'stdim',
                                                         'ETEDIM', 'FIMBC',
                                                         'one-step-affordances']:
                            e, layers = encoder_model.forward_encoder(
                                torch.squeeze(data['rgb'].cuda()),
                                dataset.extract_inputs(data).cuda(),
                                torch.squeeze(dataset.extract_commands(data).cuda()))
                            c_output, r_output = model.forward_test(e)

                        elif g_conf.ENCODER_MODEL_TYPE in ['ETE', 'ETE_inverse_model',
                                                           'forward', 'ETE_stdim']:
                            e, layers = encoder_model.forward_encoder(
                                torch.squeeze(data['rgb'].cuda()),
                                dataset.extract_inputs(data).cuda(),
                                torch.squeeze(dataset.extract_commands(data).cuda()))
                            c_output, r_output = model.forward_test(e)

                    if plot_attentions:
                        attentions_path = os.path.join(
                            '_logs', exp_batch, g_conf.EXPERIMENT_NAME,
                            g_conf.PROCESS_NAME + '_attentions_' + str(latest))
                        write_attentions(torch.squeeze(data['rgb']), layers,
                                         iteration_on_checkpoint, attentions_path)

                    # Accuracy = (TP+TN)/(TP+TN+FP+FN)
                    # F1-score = 2*TP / (2*TP + FN + FP)
                    classification_gt = dataset.extract_affordances_targets(
                        data, 'classification')
                    regression_gt = dataset.extract_affordances_targets(
                        data, 'regression')

                    # Pedestrian affordance (ground-truth column 0, output head 0).
                    TP = 0
                    FN = 0
                    FP = 0
                    TN = 0
                    for i in range(classification_gt.shape[0]):
                        if classification_gt[i, 0] == (
                                c_output[0][i, 0] < c_output[0][i, 1]).type(
                                    torch.FloatTensor) == 1:
                            TP += 1
                        elif classification_gt[i, 0] == 1 and classification_gt[i, 0] != (
                                c_output[0][i, 0] < c_output[0][i, 1]).type(torch.FloatTensor):
                            FN += 1
                        elif classification_gt[i, 0] == 0 and classification_gt[i, 0] != (
                                c_output[0][i, 0] < c_output[0][i, 1]).type(torch.FloatTensor):
                            FP += 1
                        if classification_gt[i, 0] == (
                                c_output[0][i, 0] < c_output[0][i, 1]).type(
                                    torch.FloatTensor) == 0:
                            TN += 1

                    accumulated_pedestrian_TP += TP
                    accumulated_pedestrian_TN += TN
                    accumulated_pedestrian_FP += FP
                    accumulated_pedestrian_FN += FN

                    # Red traffic light affordance (ground-truth column 1, output head 1).
                    TP = 0
                    FN = 0
                    FP = 0
                    TN = 0
                    for i in range(classification_gt.shape[0]):
                        if classification_gt[i, 1] == (
                                c_output[1][i, 0] < c_output[1][i, 1]).type(
                                    torch.FloatTensor) == 1:
                            TP += 1
                        elif classification_gt[i, 1] == 1 and classification_gt[i, 1] != (
                                c_output[1][i, 0] < c_output[1][i, 1]).type(torch.FloatTensor):
                            FN += 1
                        elif classification_gt[i, 1] == 0 and classification_gt[i, 1] != (
                                c_output[1][i, 0] < c_output[1][i, 1]).type(torch.FloatTensor):
                            FP += 1
                        if classification_gt[i, 1] == (
                                c_output[1][i, 0] < c_output[1][i, 1]).type(
                                    torch.FloatTensor) == 0:
                            TN += 1

                    accumulated_red_tl_TP += TP
                    accumulated_red_tl_TN += TN
                    accumulated_red_tl_FP += FP
                    accumulated_red_tl_FN += FN

                    # Vehicle-stop affordance (ground-truth column 2, output head 2).
                    TP = 0
                    FN = 0
                    FP = 0
                    TN = 0
                    for i in range(classification_gt.shape[0]):
                        if classification_gt[i, 2] == (
                                c_output[2][i, 0] < c_output[2][i, 1]).type(
                                    torch.FloatTensor) == 1:
                            TP += 1
                        elif classification_gt[i, 2] == 1 and classification_gt[i, 2] != \
                                (c_output[2][i, 0] < c_output[2][i, 1]).type(torch.FloatTensor):
                            FN += 1
                        elif classification_gt[i, 2] == 0 and classification_gt[i, 2] != \
                                (c_output[2][i, 0] < c_output[2][i, 1]).type(torch.FloatTensor):
                            FP += 1
                        if classification_gt[i, 2] == (
                                c_output[2][i, 0] < c_output[2][i, 1]).type(
                                    torch.FloatTensor) == 0:
                            TN += 1

                    accumulated_vehicle_stop_TP += TP
                    accumulated_vehicle_stop_TN += TN
                    accumulated_vehicle_stop_FP += FP
                    accumulated_vehicle_stop_FN += FN

                    # If the data was normalized during training, we need to transform
                    # it back to its original units.
                    write_regular_output(checkpoint_iteration,
                                         torch.squeeze(r_output[0]),
                                         regression_gt[:, 0])
                    mae_ra = torch.abs(
                        regression_gt[:, 0] -
                        torch.squeeze(r_output[0]).type(torch.FloatTensor)).numpy()
                    accumulated_mae_ra += np.sum(mae_ra)

                    if iteration_on_checkpoint % 100 == 0:
                        print("Validation iteration: %d [%d/%d] on Checkpoint %d " %
                              (iteration_on_checkpoint, iteration_on_checkpoint,
                               len(data_loader), checkpoint_iteration))

                    iteration_on_checkpoint += 1

                # A better analysis is also needed here.
                # TODO: divide into curves and other cases.
                MAE_relative_angle = accumulated_mae_ra / len(dataset)

                csv_outfile = open(summary_file, 'a')
                csv_outfile.write(
                    "%s, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f" %
                    (checkpoint_iteration,
                     accumulated_pedestrian_TP, accumulated_pedestrian_FP,
                     accumulated_pedestrian_FN, accumulated_pedestrian_TN,
                     accumulated_vehicle_stop_TP, accumulated_vehicle_stop_FP,
                     accumulated_vehicle_stop_FN, accumulated_vehicle_stop_TN,
                     accumulated_red_tl_TP, accumulated_red_tl_FP,
                     accumulated_red_tl_FN, accumulated_red_tl_TN,
                     MAE_relative_angle))
                csv_outfile.write("\n")
                csv_outfile.close()

            else:
                print('The checkpoint you want to validate is not yet ready ',
                      str(latest))

        coil_logger.add_message('Finished', {})
        print('VALIDATION FINISHED !!')
        print('  Validation results saved in ==> ', summary_file)

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})
        # We erase the output that was left unfinished when the process stopped.
        if latest is not None:
            coil_logger.erase_csv(latest)

    except RuntimeError as e:
        if latest is not None:
            coil_logger.erase_csv(latest)
        coil_logger.add_message('Error', {'Message': str(e)})

    except:
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something Happened'})
        # We erase the output that was left unfinished when the process stopped.
        if latest is not None:
            coil_logger.erase_csv(latest)
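
# The three nearly identical TP/FP/FN/TN loops above differ only in the
# ground-truth column and output head; below is a hedged refactoring sketch of
# that logic as a single helper. binary_confusion is a name introduced here
# for illustration, not a function of the codebase; like the loops above, it
# assumes the positive class is predicted whenever logit 1 exceeds logit 0 and
# that the ground truth is 0/1-valued.
import torch

def binary_confusion(gt_column, head_logits):
    # gt_column: (batch,) float tensor of 0/1 ground truth for one affordance.
    # head_logits: (batch, 2) tensor; prediction is 1 iff logits[:, 0] < logits[:, 1].
    pred = (head_logits[:, 0] < head_logits[:, 1]).float()
    tp = int(((gt_column == 1) & (pred == 1)).sum())
    fp = int(((gt_column == 0) & (pred == 1)).sum())
    fn = int(((gt_column == 1) & (pred == 0)).sum())
    tn = int(((gt_column == 0) & (pred == 0)).sum())
    return tp, fp, fn, tn

# Example, with the metrics from the comments above:
# Accuracy = (TP+TN)/(TP+TN+FP+FN), F1 = 2*TP / (2*TP + FN + FP).
#   gt = torch.tensor([1., 1., 0., 0.])
#   logits = torch.tensor([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7], [0.9, 0.1]])
#   binary_confusion(gt, logits)   # -> (1, 1, 1, 1)
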
def execute(gpu, exp_batch, exp_alias, dataset_name, suppress_output):
    latest = None
    try:
        # We set the visible cuda devices
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu

        # At this point the log file with the correct naming is created.
        merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
        # The validation dataset is always fully loaded, so we fix a very high number of hours.
        g_conf.NUMBER_OF_HOURS = 10000
        set_type_of_process('validation', dataset_name)

        if not os.path.exists('_output_logs'):
            os.mkdir('_output_logs')

        if suppress_output:
            sys.stdout = open(os.path.join(
                '_output_logs', exp_alias + '_' + g_conf.PROCESS_NAME + '_'
                + str(os.getpid()) + ".out"), "a", buffering=1)
            sys.stderr = open(os.path.join(
                '_output_logs', exp_alias + '_err_' + g_conf.PROCESS_NAME + '_'
                + str(os.getpid()) + ".out"), "a", buffering=1)

        # Define the dataset.
        full_dataset = [os.path.join(os.environ["COIL_DATASET_PATH"], dataset_name)]
        augmenter = Augmenter(None)
        # Definition of the dataset to be used. The preload name is just the validation data name.
        dataset = CoILDataset(full_dataset, transform=augmenter,
                              preload_names=[dataset_name])

        # The data loader is the multi-threaded module from pytorch that releases
        # a number of workers to fetch all the data.
        data_loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=g_conf.BATCH_SIZE,
            shuffle=False,
            num_workers=g_conf.NUMBER_OF_LOADING_WORKERS,
            pin_memory=True)

        # Create the model.
        model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION)

        # The window used to keep track of the validation loss.
        l1_window = []
        # If we have already evaluated a checkpoint, recover the validation losses of
        # all previously evaluated checkpoints (the validation loss is used for
        # early stopping).
        latest = get_latest_evaluated_checkpoint()
        if latest is not None:
            l1_window = coil_logger.recover_loss_window(dataset_name, None)

        model.cuda()

        best_mse = 1000
        best_error = 1000
        best_mse_iter = 0
        best_error_iter = 0

        # Loop that validates all checkpoints as they are saved during training.
        while not maximun_checkpoint_reach(latest, g_conf.TEST_SCHEDULE):
            if is_next_checkpoint_ready(g_conf.TEST_SCHEDULE):
                with torch.no_grad():
                    # Get and load the latest checkpoint.
                    latest = get_next_checkpoint(g_conf.TEST_SCHEDULE)

                    checkpoint = torch.load(
                        os.path.join('_logs', exp_batch, exp_alias, 'checkpoints',
                                     str(latest) + '.pth'))
                    checkpoint_iteration = checkpoint['iteration']
                    print("Validation loaded ", checkpoint_iteration)

                    model.load_state_dict(checkpoint['state_dict'])
                    model.eval()

                    accumulated_mse = 0
                    accumulated_error = 0
                    iteration_on_checkpoint = 0
                    if g_conf.USE_REPRESENTATION_LOSS:
                        accumulated_perception_rep_mse = 0
                        accumulated_speed_rep_mse = 0
                        accumulated_intentions_rep_mse = 0
                        accumulated_rep_mse = 0
                        accumulated_perception_rep_error = 0
                        accumulated_speed_rep_error = 0
                        accumulated_intentions_rep_error = 0
                        accumulated_rep_error = 0

                    # Validation loop.
                    for data in data_loader:
                        # Compute the forward pass on a batch from the validation dataset.
                        controls = data['directions']

                        # Run the model forward and get the outputs. The first case
                        # corresponds to the squeeze network, the second to the driving
                        # model without mimicking losses, the last to the mimic network.
                        if "seg" in g_conf.SENSORS.keys():
                            output = model.forward_branch(
                                data, dataset.extract_inputs(data).cuda(), controls,
                                dataset.extract_intentions(data).cuda())
                        elif not g_conf.USE_REPRESENTATION_LOSS:
                            output = model.forward_branch(
                                data, dataset.extract_inputs(data).cuda(), controls)
                        else:
                            output, intermediate_reps = model.forward_branch(
                                data, dataset.extract_inputs(data).cuda(), controls)

                        write_regular_output(checkpoint_iteration, output)

                        # Compute the control loss on the current validation batch
                        # and accumulate it.
                        targets_to_use = dataset.extract_targets(data)
                        mse = torch.mean(
                            (output - targets_to_use.cuda())**2).data.tolist()
                        mean_error = torch.mean(
                            torch.abs(output - targets_to_use.cuda())).data.tolist()

                        accumulated_error += mean_error
                        accumulated_mse += mse

                        error = torch.abs(output - targets_to_use.cuda())

                        # Compute the mimicking losses on the current validation batch
                        # and accumulate them.
                        if g_conf.USE_REPRESENTATION_LOSS:
                            expert_reps = dataset.extract_representations(data)
                            # First the L1 losses (seg mask, speed, and intention
                            # mimicking losses).
                            if g_conf.USE_PERCEPTION_REP_LOSS:
                                perception_rep_loss = torch.sum(
                                    torch.abs(intermediate_reps[0] - expert_reps[0].cuda())
                                ).data.tolist() / (3 * output.shape[0])
                            else:
                                perception_rep_loss = 0
                            if g_conf.USE_SPEED_REP_LOSS:
                                speed_rep_loss = torch.sum(
                                    torch.abs(intermediate_reps[1] - expert_reps[1].cuda())
                                ).data.tolist() / (3 * output.shape[0])
                            else:
                                speed_rep_loss = 0
                            if g_conf.USE_INTENTION_REP_LOSS:
                                intentions_rep_loss = torch.sum(
                                    torch.abs(intermediate_reps[2] - expert_reps[2].cuda())
                                ).data.tolist() / (3 * output.shape[0])
                            else:
                                intentions_rep_loss = 0
                            rep_error = g_conf.REP_LOSS_WEIGHT * (
                                perception_rep_loss + speed_rep_loss + intentions_rep_loss)
                            accumulated_perception_rep_error += perception_rep_loss
                            accumulated_speed_rep_error += speed_rep_loss
                            accumulated_intentions_rep_error += intentions_rep_loss
                            accumulated_rep_error += rep_error

                            # Now the L2 losses.
                            if g_conf.USE_PERCEPTION_REP_LOSS:
                                perception_rep_loss = torch.sum(
                                    (intermediate_reps[0] - expert_reps[0].cuda())**2
                                ).data.tolist() / (3 * output.shape[0])
                            else:
                                perception_rep_loss = 0
                            if g_conf.USE_SPEED_REP_LOSS:
                                speed_rep_loss = torch.sum(
                                    (intermediate_reps[1] - expert_reps[1].cuda())**2
                                ).data.tolist() / (3 * output.shape[0])
                            else:
                                speed_rep_loss = 0
                            if g_conf.USE_INTENTION_REP_LOSS:
                                intentions_rep_loss = torch.sum(
                                    (intermediate_reps[2] - expert_reps[2].cuda())**2
                                ).data.tolist() / (3 * output.shape[0])
                            else:
                                intentions_rep_loss = 0
                            rep_mse = g_conf.REP_LOSS_WEIGHT * (
                                perception_rep_loss + speed_rep_loss + intentions_rep_loss)
                            accumulated_perception_rep_mse += perception_rep_loss
                            accumulated_speed_rep_mse += speed_rep_loss
                            accumulated_intentions_rep_mse += intentions_rep_loss
                            accumulated_rep_mse += rep_mse

                        # Log a random position.
                        position = random.randint(0, len(output.data.tolist()) - 1)

                        # Logging.
                        if g_conf.USE_REPRESENTATION_LOSS:
                            total_mse = mse + rep_mse
                            total_error = mean_error + rep_error
                            coil_logger.add_message(
                                'Iterating', {
                                    'Checkpoint': latest,
                                    'Iteration': (str(iteration_on_checkpoint * 120) + '/'
                                                  + str(len(dataset))),
                                    'MeanError': mean_error,
                                    'MSE': mse,
                                    'RepMeanError': rep_error,
                                    'RepMSE': rep_mse,
                                    'MeanTotalError': total_error,
                                    'TotalMSE': total_mse,
                                    'Output': output[position].data.tolist(),
                                    'GroundTruth': targets_to_use[position].data.tolist(),
                                    'Error': error[position].data.tolist(),
                                    'Inputs': dataset.extract_inputs(data)[position].data.tolist()
                                }, latest)
                        else:
                            coil_logger.add_message(
                                'Iterating', {
                                    'Checkpoint': latest,
                                    'Iteration': (str(iteration_on_checkpoint * 120) + '/'
                                                  + str(len(dataset))),
                                    'MeanError': mean_error,
                                    'MSE': mse,
                                    'Output': output[position].data.tolist(),
                                    'GroundTruth': targets_to_use[position].data.tolist(),
                                    'Error': error[position].data.tolist(),
                                    'Inputs': dataset.extract_inputs(data)[position].data.tolist()
                                }, latest)
                        iteration_on_checkpoint += 1

                        if g_conf.USE_REPRESENTATION_LOSS:
                            print("Iteration %d on Checkpoint %d : Error %f" %
                                  (iteration_on_checkpoint, checkpoint_iteration,
                                   total_error))
                        else:
                            print("Iteration %d on Checkpoint %d : Error %f" %
                                  (iteration_on_checkpoint, checkpoint_iteration,
                                   mean_error))

                    """
                        ######## Finish a round of validation, write results, wait for the next ########
                    """

                    # Compute the average L1 and L2 losses over the whole round of
                    # validation and log them.
                    checkpoint_average_mse = accumulated_mse / len(data_loader)
                    checkpoint_average_error = accumulated_error / len(data_loader)
                    coil_logger.add_scalar('L2 Loss', checkpoint_average_mse,
                                           latest, True)
                    coil_logger.add_scalar('Loss', checkpoint_average_error,
                                           latest, True)

                    if g_conf.USE_REPRESENTATION_LOSS:
                        checkpoint_average_perception_rep_mse = \
                            accumulated_perception_rep_mse / len(data_loader)
                        checkpoint_average_speed_rep_mse = \
                            accumulated_speed_rep_mse / len(data_loader)
                        checkpoint_average_intentions_rep_mse = \
                            accumulated_intentions_rep_mse / len(data_loader)
                        checkpoint_average_rep_mse = \
                            accumulated_rep_mse / len(data_loader)
                        checkpoint_average_total_mse = \
                            checkpoint_average_mse + checkpoint_average_rep_mse

                        checkpoint_average_perception_rep_error = \
                            accumulated_perception_rep_error / len(data_loader)
                        checkpoint_average_speed_rep_error = \
                            accumulated_speed_rep_error / len(data_loader)
                        checkpoint_average_intentions_rep_error = \
                            accumulated_intentions_rep_error / len(data_loader)
                        checkpoint_average_rep_error = \
                            accumulated_rep_error / len(data_loader)
                        checkpoint_average_total_error = \
                            checkpoint_average_error + checkpoint_average_rep_error

                        # Log the L1/L2 loss terms.
                        coil_logger.add_scalar(
                            'Perception Rep Loss',
                            checkpoint_average_perception_rep_mse, latest, True)
                        coil_logger.add_scalar(
                            'Speed Rep Loss',
                            checkpoint_average_speed_rep_mse, latest, True)
                        coil_logger.add_scalar(
                            'Intentions Rep Loss',
                            checkpoint_average_intentions_rep_mse, latest, True)
                        coil_logger.add_scalar(
                            'Overall Rep Loss',
                            checkpoint_average_rep_mse, latest, True)
                        coil_logger.add_scalar(
                            'Total L2 Loss',
                            checkpoint_average_total_mse, latest, True)
                        coil_logger.add_scalar(
                            'Perception Rep Error',
                            checkpoint_average_perception_rep_error, latest, True)
                        coil_logger.add_scalar(
                            'Speed Rep Error',
                            checkpoint_average_speed_rep_error, latest, True)
                        coil_logger.add_scalar(
                            'Intentions Rep Error',
                            checkpoint_average_intentions_rep_error, latest, True)
                        coil_logger.add_scalar(
                            'Total Rep Error',
                            checkpoint_average_rep_error, latest, True)
                        coil_logger.add_scalar(
                            'Total Loss',
                            checkpoint_average_total_error, latest, True)
                    else:
                        checkpoint_average_total_mse = checkpoint_average_mse
                        checkpoint_average_total_error = checkpoint_average_error

                    if checkpoint_average_total_mse < best_mse:
                        best_mse = checkpoint_average_total_mse
                        best_mse_iter = latest

                    if checkpoint_average_total_error < best_error:
                        best_error = checkpoint_average_total_error
                        best_error_iter = latest

                    # Write the validation results to the log / terminal.
                    if g_conf.USE_REPRESENTATION_LOSS:
                        coil_logger.add_message(
                            'Iterating', {
                                'Summary': {
                                    'Control Error': checkpoint_average_error,
                                    'Control Loss': checkpoint_average_mse,
                                    'Rep Error': checkpoint_average_rep_error,
                                    'Rep Loss': checkpoint_average_rep_mse,
                                    'Error': checkpoint_average_total_error,
                                    'Loss': checkpoint_average_total_mse,
                                    'BestError': best_error,
                                    'BestMSE': best_mse,
                                    'BestMSECheckpoint': best_mse_iter,
                                    'BestErrorCheckpoint': best_error_iter
                                },
                                'Checkpoint': latest
                            }, latest)
                    else:
                        coil_logger.add_message(
                            'Iterating', {
                                'Summary': {
                                    'Error': checkpoint_average_error,
                                    'Loss': checkpoint_average_mse,
                                    'BestError': best_error,
                                    'BestMSE': best_mse,
                                    'BestMSECheckpoint': best_mse_iter,
                                    'BestErrorCheckpoint': best_error_iter
                                },
                                'Checkpoint': latest
                            }, latest)

                    # Save the validation loss history (used for early stopping).
                    l1_window.append(checkpoint_average_total_error)
                    coil_logger.write_on_error_csv(dataset_name,
                                                   checkpoint_average_total_error)

                    # Early stopping.
                    if g_conf.FINISH_ON_VALIDATION_STALE is not None:
                        if dlib.count_steps_without_decrease(l1_window) > 3 and \
                                dlib.count_steps_without_decrease_robust(l1_window) > 3:
                            coil_logger.write_stop(dataset_name, latest)
                            break

            else:
                latest = get_latest_evaluated_checkpoint()
                time.sleep(1)

                coil_logger.add_message('Loading', {'Message': 'Waiting Checkpoint'})
                print("Waiting for the next Validation")

        coil_logger.add_message('Finished', {})

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})
        # We erase the output that was left unfinished when the process stopped.
        if latest is not None:
            coil_logger.erase_csv(latest)

    except RuntimeError as e:
        if latest is not None:
            coil_logger.erase_csv(latest)
        coil_logger.add_message('Error', {'Message': str(e)})

    except:
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something Happened'})
        # We erase the output that was left unfinished due to the process stop.
        if latest is not None:
            coil_logger.erase_csv(latest)
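
# A sketch of the early-stopping test used in the loops above, factored into a
# helper so it can be tried on a toy loss window. dlib's
# count_steps_without_decrease estimates, for a noisy time series, how many of
# the most recent steps show no real decrease; the _robust variant first drops
# the largest values (by default the top 10%) as outliers. The patience of 3
# matches the threshold in the code above; the toy numbers are made up for
# illustration.
import dlib

def validation_is_stale(loss_window, patience=3):
    return (dlib.count_steps_without_decrease(loss_window) > patience and
            dlib.count_steps_without_decrease_robust(loss_window) > patience)

# Example:
#   validation_is_stale([1.00, 0.80, 0.75, 0.74, 0.76, 0.75, 0.77, 0.76])
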
def execute(gpu, exp_batch, exp_alias, drive_conditions, params):
    """
    Main loop function. Executes driving benchmarks for the specified checkpoints.

    Args:
        gpu: the gpu to run the benchmark on
        exp_batch: the experiment batch (configuration folder) name
        exp_alias: the experiment alias (yaml file) name
        drive_conditions: the driving conditions, formatted as '<ExperimentSet>_<Town>'
        params: dictionary of driving parameters

    Returns:

    """
    try:
        print("Running ", __file__, " On GPU ", gpu, "of experiment name ", exp_alias)

        os.environ["CUDA_VISIBLE_DEVICES"] = gpu

        if not os.path.exists('_output_logs'):
            os.mkdir('_output_logs')

        merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))

        exp_set_name, town_name = drive_conditions.split('_')

        experiment_suite_module = __import__(
            'drive.suites.' + camelcase_to_snakecase(exp_set_name) + '_suite',
            fromlist=[exp_set_name])
        experiment_suite_module = getattr(experiment_suite_module, exp_set_name)
        experiment_set = experiment_suite_module()

        set_type_of_process('drive', drive_conditions)

        if params['suppress_output']:
            sys.stdout = open(os.path.join(
                '_output_logs', g_conf.PROCESS_NAME + '_'
                + str(os.getpid()) + ".out"), "a", buffering=1)
            sys.stderr = open(os.path.join(
                '_output_logs', exp_alias + '_err_' + g_conf.PROCESS_NAME + '_'
                + str(os.getpid()) + ".out"), "a", buffering=1)

        coil_logger.add_message(
            'Loading', {'Poses': experiment_set.build_experiments()[0].poses})

        if g_conf.USE_ORACLE:
            control_filename = 'control_output_auto'
        else:
            control_filename = 'control_output'

        """
            ##### Preparing the output files that will contain the driving summary #####
        """
        experiment_list = experiment_set.build_experiments()
        # Get all the uniquely named tasks.
        task_list = unique([experiment.task_name for experiment in experiment_list])

        # Now actually run the driving benchmark.
        latest = get_latest_evaluated_checkpoint(control_filename + '_' + task_list[0])

        if latest is None:
            # When nothing has been tested yet, get_latest returns None; we fix that here.
            latest = 0

        # The used tasks are hardcoded; this needs to be improved.
        file_base = os.path.join('_logs', exp_batch, exp_alias,
                                 g_conf.PROCESS_NAME + '_csv', control_filename)

        for i in range(len(task_list)):
            # Write the header of the summary file used for the final conclusion.
            write_header_control_summary(file_base, task_list[i])

        """
            ###### Run a single driving benchmark specified by the checkpoint where validation is stale ######
        """

        if g_conf.FINISH_ON_VALIDATION_STALE is not None:

            while validation_stale_point(g_conf.FINISH_ON_VALIDATION_STALE) is None:
                time.sleep(0.1)

            validation_state_iteration = validation_stale_point(
                g_conf.FINISH_ON_VALIDATION_STALE)

            driving_benchmark(validation_state_iteration, gpu, town_name,
                              experiment_set, exp_batch, exp_alias, params,
                              control_filename, task_list)

        else:
            """
                ##### Main Loop: run a benchmark for each specified checkpoint on the "Test Configuration" #####
            """
            while not maximun_checkpoint_reach(latest, g_conf.TEST_SCHEDULE):
                # Get the correct checkpoint.
                # We check readiness for one task name; all of them are ready at
                # the same time.
                if is_next_checkpoint_ready(g_conf.TEST_SCHEDULE,
                                            control_filename + '_' + task_list[0]):

                    latest = get_next_checkpoint(g_conf.TEST_SCHEDULE,
                                                 control_filename + '_' + task_list[0])

                    driving_benchmark(latest, gpu, town_name, experiment_set,
                                      exp_batch, exp_alias, params,
                                      control_filename, task_list)

                else:
                    time.sleep(0.1)

        coil_logger.add_message('Finished', {})

    except KeyboardInterrupt:
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Killed By User'})

    except:
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something happened'})
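
# A minimal sketch of the dynamic suite import performed above, using importlib
# (the modern equivalent of the __import__ call in the function). It assumes,
# as the code above does, that a suite class named e.g. 'CorlTraining' (a
# hypothetical name here) lives in drive/suites/corl_training_suite.py and that
# camelcase_to_snakecase (imported by this module) performs the name mapping.
import importlib

def load_experiment_suite(exp_set_name):
    module = importlib.import_module(
        'drive.suites.' + camelcase_to_snakecase(exp_set_name) + '_suite')
    return getattr(module, exp_set_name)

# Example:
#   experiment_set = load_experiment_suite('CorlTraining')()
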
def execute(gpu, exp_batch, exp_alias, dataset_name, validation_set=False):
    latest = None
    # We set the visible cuda devices
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu

    g_conf.immutable(False)
    # At this point the log file with the correct naming is created.
    merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
    # If using the validation dataset, fix a very high number of hours.
    if validation_set:
        g_conf.NUMBER_OF_HOURS = 10000
    g_conf.immutable(True)

    # Define the dataset.
    full_dataset = [os.path.join(os.environ["COIL_DATASET_PATH"], dataset_name)]
    augmenter = Augmenter(None)

    if validation_set:
        # Definition of the dataset to be used. The preload name is just the validation data name.
        dataset = CoILDataset(full_dataset, transform=augmenter,
                              preload_names=[dataset_name])
    else:
        dataset = CoILDataset(full_dataset, transform=augmenter,
                              preload_names=[str(g_conf.NUMBER_OF_HOURS)
                                             + 'hours_' + dataset_name],
                              train_dataset=True)

    # The data loader is the multi-threaded module from pytorch that releases
    # a number of workers to fetch all the data.
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=g_conf.BATCH_SIZE,
        shuffle=False,
        num_workers=g_conf.NUMBER_OF_LOADING_WORKERS,
        pin_memory=True)

    # Define the model.
    model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION)

    """
        ###### Run a single driving benchmark specified by the checkpoint where validation is stale ######
    """

    if g_conf.FINISH_ON_VALIDATION_STALE is not None:

        while validation_stale_point(g_conf.FINISH_ON_VALIDATION_STALE) is None:
            time.sleep(0.1)

        validation_state_iteration = validation_stale_point(
            g_conf.FINISH_ON_VALIDATION_STALE)

        checkpoint = torch.load(
            os.path.join('_logs', exp_batch, exp_alias, 'checkpoints',
                         str(validation_state_iteration) + '.pth'))
        print("Validation loaded ", validation_state_iteration)
    else:
        """
            ##### Main Loop: run a benchmark for each specified checkpoint on the "Test Configuration" #####
        """
        while not maximun_checkpoint_reach(latest, g_conf.TEST_SCHEDULE):
            # Get the correct checkpoint.
            # We check readiness for one task name; all of them are ready at the
            # same time.
            # NOTE: control_filename and task_list are assumed to be defined at
            # module scope; they are not created inside this function.
            if is_next_checkpoint_ready(g_conf.TEST_SCHEDULE,
                                        control_filename + '_' + task_list[0]):

                latest = get_next_checkpoint(g_conf.TEST_SCHEDULE,
                                             control_filename + '_' + task_list[0])

                checkpoint = torch.load(
                    os.path.join('_logs', exp_batch, exp_alias, 'checkpoints',
                                 str(latest) + '.pth'))
                print("Validation loaded ", latest)
            else:
                time.sleep(0.1)

    # Load the model and set it up for evaluation.
    model.load_state_dict(checkpoint['state_dict'])
    model.cuda()
    model.eval()

    first_iter = True
    for data in data_loader:

        # Compute the forward pass on a batch from the dataset and get the
        # intermediate representations of the squeeze network.
        if "seg" in g_conf.SENSORS.keys():
            perception_rep, speed_rep, intentions_rep = \
                model.get_intermediate_representations(
                    data,
                    dataset.extract_inputs(data).cuda(),
                    dataset.extract_intentions(data).cuda())
            perception_rep = perception_rep.data.cpu()
            speed_rep = speed_rep.data.cpu()
            intentions_rep = intentions_rep.data.cpu()

            if first_iter:
                perception_rep_all = perception_rep
                speed_rep_all = speed_rep
                intentions_rep_all = intentions_rep
            else:
                perception_rep_all = torch.cat(
                    [perception_rep_all, perception_rep], 0)
                speed_rep_all = torch.cat([speed_rep_all, speed_rep], 0)
                intentions_rep_all = torch.cat(
                    [intentions_rep_all, intentions_rep], 0)
            first_iter = False

    # Save the intermediate representations.
    perception_rep_all = perception_rep_all.tolist()
    speed_rep_all = speed_rep_all.tolist()
    intentions_rep_all = intentions_rep_all.tolist()
    np.save(
        os.path.join(
            '_preloads',
            exp_batch + '_' + exp_alias + '_' + dataset_name + '_representations'),
        [perception_rep_all, speed_rep_all, intentions_rep_all])
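
# A hedged sketch of reading back the representations saved above. np.save
# appends '.npy' to a path given without an extension, and because the payload
# is a Python list of differently shaped lists it is stored as a pickled
# object array, so np.load needs allow_pickle=True. The path pieces mirror the
# save call above; load_representations is a name introduced here for
# illustration, not a function of the codebase.
import os
import numpy as np

def load_representations(exp_batch, exp_alias, dataset_name):
    path = os.path.join(
        '_preloads',
        exp_batch + '_' + exp_alias + '_' + dataset_name + '_representations.npy')
    perception, speed, intentions = np.load(path, allow_pickle=True)
    return perception, speed, intentions
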