def execute(gpu, exp_batch, exp_alias):
    # We set the visible cuda devices
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu

    # At this point the log file with the correct naming is created.
    merge_with_yaml(os.path.join(exp_batch, exp_alias + '.yaml'))
    set_type_of_process('train')

    sys.stdout = open(str(os.getpid()) + ".out", "a", buffering=1)

    if monitorer.get_status(exp_batch, exp_alias, g_conf.PROCESS_NAME)[0] == "Finished":
        # TODO: print some cool summary or not ?
        return

    # Define the dataset. This structure has __getitem__ redefined so that the HDF5 file
    # positions can be accessed from the root directory as a vector.
    full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.DATASET_NAME)
    dataset = CoILDataset(full_dataset, transform=transforms.Compose([transforms.ToTensor()]))

    # Create the sampler; this part is responsible for managing the keys. It divides
    # all keys depending on the measurements and produces a set of keys for each batch.
    sampler = CoILSampler(splitter.control_steer_split(dataset.measurements, dataset.meta_data))

    # The data loader is the multi-threaded module from PyTorch that spawns a number of
    # workers to fetch all the data.
    # TODO: batch size and number of workers should go to some configuration file
    data_loader = torch.utils.data.DataLoader(dataset, sampler=sampler, batch_size=120,
                                              shuffle=False, num_workers=12, pin_memory=True)

    # By instantiating the augmenter we get a callable that augments images and transforms
    # them into tensors.
    augmenter = iag.Augmenter(g_conf.AUGMENTATION_SUITE)

    # TODO: here there is clearly a possibility to make a cool "conditioning" system.
    model = CoILModel(g_conf.MODEL_NAME)
    model.cuda()
    print(model)

    criterion = Loss()

    # TODO: DATASET SIZE SEEMS WEIRD
    optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)

    checkpoint_file = get_latest_saved_checkpoint()
    if checkpoint_file is not None:
        checkpoint = torch.load(os.path.join('_logs', exp_batch, exp_alias,
                                             'checkpoints', str(get_latest_saved_checkpoint())))
        iteration = checkpoint['iteration']
    else:
        iteration = 0

    # TODO: The checkpoint will continue, so the logs should either restart or continue where they were.

    print(dataset.meta_data)
    print(model)

    for data in data_loader:
        input_data, labels = data

        # TODO: we have to divide the input with other data.
        # TODO: ADD ITERATION SCHEDULE
        input_rgb_data = augmenter(0, input_data['rgb'])

        # Get the control commands from the labels, size = [120, 1]
        controls = labels[:, 24, :]

        # The output (branches) is a list of 5 branch results; each branch has size [120, 3]
        model.zero_grad()
        branches = model(input_rgb_data, labels[:, 10, :].cuda())
        # print("len ", len(branches))

        # Get the steer, gas and brake ground truth from the labels
        steer_gt = labels[:, 0, :]
        gas_gt = labels[:, 1, :]
        brake_gt = labels[:, 2, :]
        speed_gt = labels[:, 10, :]

        targets = torch.cat([steer_gt, gas_gt, brake_gt], 1)

        loss = criterion.MSELoss(branches, targets.cuda(), controls.cuda(), speed_gt.cuda())
        loss.backward()
        optimizer.step()

        # TODO: save also the optimizer state dictionary
        if is_ready_to_save(iteration):
            state = {
                'iteration': iteration,
                'state_dict': model.state_dict()
            }
            # TODO: maybe already summarize the best model ???
            torch.save(state, os.path.join('_logs', exp_batch, exp_alias,
                                           'checkpoints', str(iteration) + '.pth'))

        iteration += 1
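# The loop above checkpoints only the iteration and the model weights; its TODO notes that the
# optimizer state is missing, so momentum buffers are lost on resume. Below is a minimal sketch
# of a fuller save/resume pair. The helper names and the save_dir layout are illustrative
# assumptions, not functions from this repository.
import os
import torch


def save_checkpoint(model, optimizer, iteration, save_dir):
    # Store everything needed to continue training from this exact point.
    state = {
        'iteration': iteration,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
    }
    torch.save(state, os.path.join(save_dir, str(iteration) + '.pth'))


def resume_checkpoint(model, optimizer, checkpoint_path):
    # Restore weights and optimizer buffers, and return the iteration to continue from.
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return checkpoint['iteration']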
def execute(gpu, exp_batch, exp_alias, suppress_output=True, number_of_workers=12, encoder_params=None): """ The main training function. This functions loads the latest checkpoint for a given, exp_batch (folder) and exp_alias (experiment configuration). With this checkpoint it starts from the beginning or continue some training. Args: gpu: The GPU number exp_batch: the folder with the experiments exp_alias: the alias, experiment name suppress_output: if the output are going to be saved on a file number_of_workers: the number of threads used for data loading Returns: None """ try: # We set the visible cuda devices to select the GPU os.environ["CUDA_VISIBLE_DEVICES"] = gpu g_conf.VARIABLE_WEIGHT = {} # At this point the log file with the correct naming is created. # You merge the yaml file with the global configuration structure. merge_with_yaml( os.path.join('configs', exp_batch, exp_alias + '.yaml'), encoder_params) set_type_of_process('train') # Set the process into loading status. coil_logger.add_message('Loading', {'GPU': os.environ["CUDA_VISIBLE_DEVICES"]}) seed_everything(seed=g_conf.MAGICAL_SEED) # Put the output to a separate file if it is the case if suppress_output: if not os.path.exists('_output_logs'): os.mkdir('_output_logs') sys.stdout = open(os.path.join( '_output_logs', exp_alias + '_' + g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"), "a", buffering=1) sys.stderr = open(os.path.join( '_output_logs', exp_alias + '_err_' + g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"), "a", buffering=1) if coil_logger.check_finish('train'): coil_logger.add_message('Finished', {}) return # Preload option print(" GOING TO LOAD") if g_conf.PRELOAD_MODEL_ALIAS is not None: print(" LOADING A PRELOAD") checkpoint = torch.load( os.path.join('_logs', g_conf.PRELOAD_MODEL_BATCH, g_conf.PRELOAD_MODEL_ALIAS, 'checkpoints', str(g_conf.PRELOAD_MODEL_CHECKPOINT) + '.pth')) else: # Get the latest checkpoint to be loaded # returns none if there are no checkpoints saved for this model checkpoint_file = get_latest_saved_checkpoint() if checkpoint_file is not None: print('loading previous checkpoint ', checkpoint_file) checkpoint = torch.load( os.path.join('_logs', g_conf.EXPERIMENT_BATCH_NAME, g_conf.EXPERIMENT_NAME, 'checkpoints', str(get_latest_saved_checkpoint()))) iteration = checkpoint['iteration'] best_loss = checkpoint['best_loss'] best_loss_iter = checkpoint['best_loss_iter'] else: iteration = 0 best_loss = 100000000.0 best_loss_iter = 0 # Define the dataset. This structure is has the __get_item__ redefined in a way # that you can access the positions from the root directory as a in a vector. #full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.TRAIN_DATASET_NAME) # By instantiating the augmenter we get a callable that augment images and transform them # into tensors. 
augmenter = Augmenter(g_conf.AUGMENTATION) # We can save preload dataset depends on the json file name, then no need to load dataset for each time with the same dataset if len(g_conf.EXPERIENCE_FILE) == 1: json_file_name = str( g_conf.EXPERIENCE_FILE[0]).split('/')[-1].split('.')[-2] else: json_file_name = str(g_conf.EXPERIENCE_FILE[0]).split( '/')[-1].split('.')[-2] + '_' + str( g_conf.EXPERIENCE_FILE[1]).split('/')[-1].split('.')[-2] dataset = CoILDataset(transform=augmenter, preload_name=g_conf.PROCESS_NAME + '_' + json_file_name + '_' + g_conf.DATA_USED) #dataset = CoILDataset(transform=augmenter, preload_name=str(g_conf.NUMBER_OF_HOURS)+ 'hours_' + g_conf.TRAIN_DATASET_NAME) print("Loaded Training dataset") data_loader = select_balancing_strategy(dataset, iteration, number_of_workers) if g_conf.MODEL_TYPE in ['separate-affordances']: model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION, g_conf.ENCODER_MODEL_CONFIGURATION) model.cuda() optimizer = optim.Adam(model.parameters(), lr=g_conf.LEARNING_RATE) print(model) # we use the pre-trained encoder model to extract bottleneck Z and train the E-t-E model if g_conf.MODEL_TYPE in ['separate-affordances']: encoder_model = EncoderModel(g_conf.ENCODER_MODEL_TYPE, g_conf.ENCODER_MODEL_CONFIGURATION) encoder_model.cuda() encoder_model.eval() # To freeze the pre-trained encoder model if g_conf.FREEZE_ENCODER: for param_ in encoder_model.parameters(): param_.requires_grad = False if encoder_params is not None: encoder_checkpoint = torch.load( os.path.join( '_logs', encoder_params['encoder_folder'], encoder_params['encoder_exp'], 'checkpoints', str(encoder_params['encoder_checkpoint']) + '.pth')) print( "Encoder model ", str(encoder_params['encoder_checkpoint']), "loaded from ", os.path.join('_logs', encoder_params['encoder_folder'], encoder_params['encoder_exp'], 'checkpoints')) encoder_model.load_state_dict(encoder_checkpoint['state_dict']) if g_conf.FREEZE_ENCODER: encoder_model.eval() # To freeze the pre-trained encoder model for param_ in encoder_model.parameters(): param_.requires_grad = False else: optimizer = optim.Adam(list(model.parameters()) + list(encoder_model.parameters()), lr=g_conf.LEARNING_RATE) for name_encoder, param_encoder in encoder_model.named_parameters( ): if param_encoder.requires_grad: print(' Unfrozen layers', name_encoder) else: print(' Frozen layers', name_encoder) if checkpoint_file is not None or g_conf.PRELOAD_MODEL_ALIAS is not None: model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) accumulated_time = checkpoint['total_time'] loss_window = coil_logger.recover_loss_window('train', iteration) else: # We accumulate iteration time and keep the average speed accumulated_time = 0 loss_window = [] for name, param in model.named_parameters(): if param.requires_grad: print(' Unfrozen layers', name) else: print(' Frozen layers', name) print("Before the loss") # Loss time series window for data in data_loader: # Basically in this mode of execution, we validate every X Steps, if it goes up 3 times, # add a stop on the _logs folder that is going to be read by this process if g_conf.FINISH_ON_VALIDATION_STALE is not None and \ check_loss_validation_stopped(iteration, g_conf.FINISH_ON_VALIDATION_STALE): break """ #################################### Main optimization loop #################################### """ if iteration % 1000 == 0: adjust_learning_rate_auto(optimizer, loss_window) model.zero_grad() if not g_conf.FREEZE_ENCODER: encoder_model.zero_grad() if 
g_conf.LABELS_SUPERVISED: inputs_data = torch.cat( (data['rgb'], torch.zeros(g_conf.BATCH_SIZE, 1, 88, 200)), dim=1).cuda() else: inputs_data = torch.squeeze(data['rgb'].cuda()) if g_conf.MODEL_TYPE in ['separate-affordances']: #TODO: for this two encoder models training, we haven't put speed as input to train yet if g_conf.ENCODER_MODEL_TYPE in [ 'action_prediction', 'stdim', 'forward', 'one-step-affordances' ]: e, inter = encoder_model.forward_encoder( inputs_data, dataset.extract_inputs(data).cuda(), # We also add measurements and commands torch.squeeze(dataset.extract_commands(data).cuda())) elif g_conf.ENCODER_MODEL_TYPE in ['ETE']: e, inter = encoder_model.forward_encoder( inputs_data, dataset.extract_inputs(data).cuda(), torch.squeeze(dataset.extract_commands(data).cuda())) loss_function_params = { 'classification_gt': dataset.extract_affordances_targets( data, 'classification').cuda(), # harzard stop, red_light.... 'class_weights': g_conf.AFFORDANCES_CLASS_WEIGHT, 'regression_gt': dataset.extract_affordances_targets(data, 'regression').cuda(), 'variable_weights': g_conf.AFFORDANCES_VARIABLE_WEIGHT } loss = model(e, loss_function_params) loss.backward() optimizer.step() else: raise RuntimeError( 'Not implement yet, this branch is only work for g_conf.MODEL_TYPE in [separate-affordances]' ) """ #################################### Saving the model if necessary #################################### """ if is_ready_to_save(iteration): state = { 'iteration': iteration, 'state_dict': model.state_dict(), 'best_loss': best_loss, 'total_time': accumulated_time, 'optimizer': optimizer.state_dict(), 'best_loss_iter': best_loss_iter } torch.save( state, os.path.join('_logs', g_conf.EXPERIMENT_BATCH_NAME, g_conf.EXPERIMENT_NAME, 'checkpoints', str(iteration) + '.pth')) if not g_conf.FREEZE_ENCODER: encoder_state = { 'iteration': iteration, 'state_dict': encoder_model.state_dict(), 'best_loss': best_loss, 'total_time': accumulated_time, 'optimizer': optimizer.state_dict(), 'best_loss_iter': best_loss_iter } torch.save( encoder_state, os.path.join('_logs', g_conf.EXPERIMENT_BATCH_NAME, g_conf.EXPERIMENT_NAME, 'checkpoints', str(iteration) + '_encoder.pth')) iteration += 1 """ ################################################ Adding tensorboard logs. Making calculations for logging purposes. These logs are monitored by the printer module. ################################################# """ coil_logger.add_scalar('Loss', loss.data, iteration) coil_logger.add_image('Image', torch.squeeze(data['rgb']), iteration) if loss.data < best_loss: best_loss = loss.data.tolist() best_loss_iter = iteration if iteration % 100 == 0: print('Train Iteration: {} [{}/{} ({:.0f}%)] \t Loss: {:.6f}'. format(iteration, iteration, g_conf.NUMBER_ITERATIONS, 100. * iteration / g_conf.NUMBER_ITERATIONS, loss.data)) coil_logger.add_message('Finished', {}) except KeyboardInterrupt: coil_logger.add_message('Error', {'Message': 'Killed By User'}) except RuntimeError as e: coil_logger.add_message('Error', {'Message': str(e)}) except: traceback.print_exc() coil_logger.add_message('Error', {'Message': 'Something Happened'})
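# The separate-affordances variant above either freezes the pre-trained encoder
# (g_conf.FREEZE_ENCODER) or trains it jointly with the affordance head under one optimizer.
# A minimal sketch of those two regimes, assuming generic nn.Module objects rather than this
# repository's EncoderModel/CoILModel classes:
import torch.nn as nn
import torch.optim as optim


def build_optimizer(encoder: nn.Module, head: nn.Module, freeze_encoder: bool, lr: float):
    if freeze_encoder:
        # eval() fixes batch-norm/dropout behaviour; requires_grad=False stops gradient flow.
        encoder.eval()
        for param in encoder.parameters():
            param.requires_grad = False
        return optim.Adam(head.parameters(), lr=lr)
    # Otherwise both modules are updated by a single optimizer.
    return optim.Adam(list(head.parameters()) + list(encoder.parameters()), lr=lr)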
def execute(gpu, exp_batch, exp_alias, state_dict, suppress_output=True, number_of_workers=12): """ The main training function. This functions loads the latest checkpoint for a given, exp_batch (folder) and exp_alias (experiment configuration). With this checkpoint it starts from the beginning or continue some training. Args: gpu: The GPU number exp_batch: the folder with the experiments exp_alias: the alias, experiment name suppress_output: if the output are going to be saved on a file number_of_workers: the number of threads used for data loading Returns: None """ try: # We set the visible cuda devices to select the GPU os.environ["CUDA_VISIBLE_DEVICES"] = gpu g_conf.VARIABLE_WEIGHT = {} # At this point the log file with the correct naming is created. # You merge the yaml file with the global configuration structure. merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml')) set_type_of_process('train') # Set the process into loading status. coil_logger.add_message('Loading', {'GPU': gpu}) # Put the output to a separate file if it is the case if suppress_output: if not os.path.exists('_output_logs'): os.mkdir('_output_logs') sys.stdout = open(os.path.join( '_output_logs', exp_alias + '_' + g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"), "a", buffering=1) sys.stderr = open(os.path.join( '_output_logs', exp_alias + '_err_' + g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"), "a", buffering=1) if coil_logger.check_finish('train'): coil_logger.add_message('Finished', {}) return # Preload option if g_conf.PRELOAD_MODEL_ALIAS is not None: checkpoint = torch.load( os.path.join('_logs', g_conf.PRELOAD_MODEL_BATCH, g_conf.PRELOAD_MODEL_ALIAS, 'checkpoints', str(g_conf.PRELOAD_MODEL_CHECKPOINT) + '.pth')) # Get the latest checkpoint to be loaded # returns none if there are no checkpoints saved for this model checkpoint_file = get_latest_saved_checkpoint() if checkpoint_file is not None: checkpoint = torch.load( os.path.join('_logs', exp_batch, exp_alias, 'checkpoints', str(get_latest_saved_checkpoint()))) iteration = checkpoint['iteration'] best_loss = checkpoint['best_loss'] best_loss_iter = checkpoint['best_loss_iter'] else: iteration = 0 best_loss = 10000.0 best_loss_iter = 0 # Define the dataset. This structure is has the __get_item__ redefined in a way # that you can access the positions from the root directory as a in a vector. full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.TRAIN_DATASET_NAME) # By instantiating the augmenter we get a callable that augment images and transform them # into tensors. augmenter = Augmenter(g_conf.AUGMENTATION) # Instantiate the class used to read a dataset. 
The coil dataset generator # can be found dataset = CoILDataset(full_dataset, transform=augmenter, preload_name=str(g_conf.NUMBER_OF_HOURS) + 'hours_' + g_conf.TRAIN_DATASET_NAME) print("Loaded dataset") data_loader = select_balancing_strategy(dataset, iteration, number_of_workers) model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION) model.cuda() if state_dict != '': seg_model = ERFNet_Fast(2) seg_model = load_my_state_dict(seg_model, torch.load(state_dict)) seg_model.cuda() optimizer = optim.Adam(model.parameters(), lr=g_conf.LEARNING_RATE) if checkpoint_file is not None or g_conf.PRELOAD_MODEL_ALIAS is not None: model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) accumulated_time = checkpoint['total_time'] loss_window = coil_logger.recover_loss_window('train', iteration) else: # We accumulate iteration time and keep the average speed accumulated_time = 0 loss_window = [] print("Before the loss") criterion = Loss(g_conf.LOSS_FUNCTION) color_transforms = Colorizes(2) board = Dashboard(8097) # Loss time series window for data in data_loader: # Basically in this mode of execution, we validate every X Steps, if it goes up 3 times, # add a stop on the _logs folder that is going to be read by this process if g_conf.FINISH_ON_VALIDATION_STALE is not None and \ check_loss_validation_stopped(iteration, g_conf.FINISH_ON_VALIDATION_STALE): break """ #################################### Main optimization loop #################################### """ iteration += 1 if iteration % 1000 == 0: adjust_learning_rate_auto(optimizer, loss_window) # get the control commands from float_data, size = [120,1] capture_time = time.time() controls = data['directions'] # The output(branches) is a list of 5 branches results, each branch is with size [120,3] model.zero_grad() if state_dict != '': with torch.no_grad(): repre = seg_model(torch.squeeze(data['rgb'].cuda()), only_encode=False) inputs = repre imgs = color_transforms(inputs) inputs = inputs.float().cuda() else: inputs = torch.squeeze(data['rgb'].cuda()) # vis board.image( torch.squeeze(data['rgb'])[0].cpu().data, '(train) input iter: ' + str(iteration)) board.image(imgs[0].cpu().data, '(train) output iter: ' + str(iteration)) branches = model(inputs, dataset.extract_inputs(data).cuda()) loss_function_params = { 'branches': branches, 'targets': dataset.extract_targets(data).cuda(), 'controls': controls.cuda(), 'inputs': dataset.extract_inputs(data).cuda(), 'branch_weights': g_conf.BRANCH_LOSS_WEIGHT, 'variable_weights': g_conf.VARIABLE_WEIGHT } loss, _ = criterion(loss_function_params) loss.backward() optimizer.step() """ #################################### Saving the model if necessary #################################### """ if is_ready_to_save(iteration): state = { 'iteration': iteration, 'state_dict': model.state_dict(), 'best_loss': best_loss, 'total_time': accumulated_time, 'optimizer': optimizer.state_dict(), 'best_loss_iter': best_loss_iter } torch.save( state, os.path.join('_logs', exp_batch, exp_alias, 'checkpoints', str(iteration) + '.pth')) """ ################################################ Adding tensorboard logs. Making calculations for logging purposes. These logs are monitored by the printer module. 
################################################# """ coil_logger.add_scalar('Loss', loss.data, iteration) coil_logger.add_image('Image', torch.squeeze(data['rgb']), iteration) if loss.data < best_loss: best_loss = loss.data.tolist() best_loss_iter = iteration # Log a random position position = random.randint(0, len(data) - 1) output = model.extract_branch(torch.stack(branches[0:4]), controls) error = torch.abs(output - dataset.extract_targets(data).cuda()) accumulated_time += time.time() - capture_time coil_logger.add_message( 'Iterating', { 'Iteration': iteration, 'Loss': loss.data.tolist(), 'Images/s': (iteration * g_conf.BATCH_SIZE) / accumulated_time, 'BestLoss': best_loss, 'BestLossIteration': best_loss_iter, 'Output': output[position].data.tolist(), 'GroundTruth': dataset.extract_targets(data)[position].data.tolist(), 'Error': error[position].data.tolist(), 'Inputs': dataset.extract_inputs(data)[position].data.tolist() }, iteration) loss_window.append(loss.data.tolist()) coil_logger.write_on_error_csv('train', loss.data) print("Iteration: %d Loss: %f" % (iteration, loss.data)) coil_logger.add_message('Finished', {}) except KeyboardInterrupt: coil_logger.add_message('Error', {'Message': 'Killed By User'}) except RuntimeError as e: coil_logger.add_message('Error', {'Message': str(e)}) except: traceback.print_exc() coil_logger.add_message('Error', {'Message': 'Something Happened'})
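# The variant above optionally runs each RGB batch through a frozen ERFNet_Fast segmentation
# network under torch.no_grad() and feeds that representation to the driving model instead of
# raw pixels. A rough sketch of the pattern with placeholder module and argument names (the
# colorization/visdom logging from the loop is omitted):
import torch


def forward_with_optional_segmentation(driving_model, seg_model, rgb_batch, measurements):
    if seg_model is not None:
        with torch.no_grad():
            # No gradients are kept; the segmentation net acts as a fixed feature extractor.
            representation = seg_model(rgb_batch.cuda())
        inputs = representation.float().cuda()
    else:
        inputs = rgb_batch.cuda()
    return driving_model(inputs, measurements.cuda())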
def execute(gpu, exp_batch, exp_alias, suppress_output=True, number_of_workers=12): """ The main training function. This functions loads the latest checkpoint for a given, exp_batch (folder) and exp_alias (experiment configuration). With this checkpoint it starts from the beginning or continue some training. Args: gpu: gpus ids for training exp_batch: the folder with the experiments exp_alias: the alias, experiment name suppress_output: if the output are going to be saved on a file number_of_workers: the number of threads used for data loading Returns: None """ try: os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(gpu) g_conf.VARIABLE_WEIGHT = {} # At this point the log file with the correct naming is created. # You merge the yaml file with the global configuration structure. merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml')) set_type_of_process('train') # Set the process into loading status. coil_logger.add_message('Loading', {'GPU': gpu}) # Put the output to a separate file if it is the case if suppress_output: if not os.path.exists('_output_logs'): os.mkdir('_output_logs') sys.stdout = open(os.path.join( '_output_logs', exp_alias + '_' + g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"), "a", buffering=1) sys.stderr = open(os.path.join( '_output_logs', exp_alias + '_err_' + g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"), "a", buffering=1) if coil_logger.check_finish('train'): coil_logger.add_message('Finished', {}) return # Preload option if g_conf.PRELOAD_MODEL_ALIAS is not None: checkpoint = torch.load( os.path.join('_logs', g_conf.PRELOAD_MODEL_BATCH, g_conf.PRELOAD_MODEL_ALIAS, 'checkpoints', str(g_conf.PRELOAD_MODEL_CHECKPOINT) + '.pth')) # Get the latest checkpoint to be loaded # returns none if there are no checkpoints saved for this model checkpoint_file = get_latest_saved_checkpoint() if checkpoint_file is not None: checkpoint = torch.load( os.path.join('_logs', exp_batch, exp_alias, 'checkpoints', str(get_latest_saved_checkpoint()))) iteration = checkpoint['iteration'] best_loss = checkpoint['best_loss'] best_loss_iter = checkpoint['best_loss_iter'] print('iteration: ', iteration, 'best_loss: ', best_loss) else: iteration = 0 best_loss = 10000.0 best_loss_iter = 0 # Define the dataset. This structure is has the __get_item__ redefined in a way # that you can access the positions from the root directory as a in a vector. full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.TRAIN_DATASET_NAME) # By instantiating the augmenter we get a callable that augment images and transform them into tensors. augmenter = Augmenter(g_conf.AUGMENTATION) # Instantiate the class used to read the dataset dataset = CoILDataset(full_dataset, transform=augmenter, preload_name=str(g_conf.NUMBER_OF_HOURS) + 'hours_' + g_conf.TRAIN_DATASET_NAME) print("Loaded dataset") # Creates the sampler, this part is responsible for managing the keys. It divides # all keys depending on the measurements and produces a set of keys for each bach. 
# define the sampling strategy for mini-batch, different samplers can be found in 'splitter.py' data_loader = select_balancing_strategy(dataset, iteration, number_of_workers) # Instatiate the network architecture model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION) model.cuda() optimizer = optim.Adam(model.parameters(), lr=g_conf.LEARNING_RATE ) # adabound and adamio can also be used here if checkpoint_file is not None or g_conf.PRELOAD_MODEL_ALIAS is not None: model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) accumulated_time = checkpoint['total_time'] loss_window = coil_logger.recover_loss_window('train', iteration) else: # We accumulate iteration time and keep the average speed accumulated_time = 0 loss_window = [] # freeze the perception module weights if required # for m in model.perception.parameters(): # m.requires_grad = False # total trainable parameters model_parameters = filter(lambda p: p.requires_grad, model.parameters()) total_params = sum([np.prod(p.size()) for p in model_parameters]) print('trainable parameters: ', total_params) # multi-gpu print('number of gpus: ', torch.cuda.device_count()) if torch.cuda.device_count() > 1: model = nn.DataParallel(model) criterion = Loss(g_conf.LOSS_FUNCTION) print('Start Training') st = time.time() for data in data_loader: # use this for early stopping if the validation loss is not coming down if g_conf.FINISH_ON_VALIDATION_STALE is not None and \ check_loss_validation_stopped(iteration, g_conf.FINISH_ON_VALIDATION_STALE): break """ #################################### Main optimization loop #################################### """ iteration += 1 if iteration % 1000 == 0: adjust_learning_rate_auto(optimizer, loss_window) # additional learning rate scheduler - cyclic cosine annealing (https://arxiv.org/pdf/1704.00109.pdf) # adjust_learning_rate_cosine_annealing(optimizer, loss_window, iteration) capture_time = time.time() controls = data['directions'] model.zero_grad() branches = model(torch.squeeze(data['rgb'].cuda()), dataset.extract_inputs(data).cuda()) loss_function_params = { 'branches': branches, 'targets': dataset.extract_targets(data).cuda(), 'controls': controls.cuda(), 'inputs': dataset.extract_inputs(data).cuda(), 'branch_weights': g_conf.BRANCH_LOSS_WEIGHT, 'variable_weights': g_conf.VARIABLE_WEIGHT } loss, _ = criterion(loss_function_params) loss.backward() optimizer.step() """ #################################### Saving the model if necessary #################################### """ if is_ready_to_save(iteration): if torch.cuda.device_count() > 1: state_dict_save = model.module.state_dict() else: state_dict_save = model.state_dict() state = { 'iteration': iteration, 'state_dict': state_dict_save, 'best_loss': best_loss, 'total_time': accumulated_time, 'optimizer': optimizer.state_dict(), 'best_loss_iter': best_loss_iter } torch.save( state, os.path.join('_logs', exp_batch, exp_alias, 'checkpoints', str(iteration) + '.pth')) """ ################################################ Adding tensorboard logs. Making calculations for logging purposes. These logs are monitored by the printer module. 
################################################# """ coil_logger.add_scalar('Loss', loss.data, iteration) coil_logger.add_image('Image', torch.squeeze(data['rgb']), iteration) if loss.data < best_loss: best_loss = loss.data.tolist() best_loss_iter = iteration # Log a random position position = random.randint(0, len(data) - 1) if torch.cuda.device_count() > 1: output = model.module.extract_branch( torch.stack(branches[0:4]), controls) else: output = model.extract_branch(torch.stack(branches[0:4]), controls) error = torch.abs(output - dataset.extract_targets(data).cuda()) accumulated_time += time.time() - capture_time coil_logger.add_message( 'Iterating', { 'Iteration': iteration, 'Loss': loss.data.tolist(), 'Images/s': (iteration * g_conf.BATCH_SIZE) / accumulated_time, 'BestLoss': best_loss, 'BestLossIteration': best_loss_iter, 'Output': output[position].data.tolist(), 'GroundTruth': dataset.extract_targets(data)[position].data.tolist(), 'Error': error[position].data.tolist(), 'Inputs': dataset.extract_inputs(data)[position].data.tolist() }, iteration) loss_window.append(loss.data.tolist()) coil_logger.write_on_error_csv('train', loss.data) print("Iteration: %d Loss: %f" % (iteration, loss.data)) st = time.time() coil_logger.add_message('Finished', {}) except KeyboardInterrupt: coil_logger.add_message('Error', {'Message': 'Killed By User'}) except RuntimeError as e: coil_logger.add_message('Error', {'Message': str(e)}) except: traceback.print_exc() coil_logger.add_message('Error', {'Message': 'Something Happened'})
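# The multi-GPU variant above wraps the network in nn.DataParallel and saves
# model.module.state_dict() so that checkpoint keys are not prefixed with "module." and the
# file stays loadable on a single GPU. A short sketch of that convention:
import torch
import torch.nn as nn


def wrap_for_multi_gpu(model: nn.Module) -> nn.Module:
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    return model


def plain_state_dict(model: nn.Module) -> dict:
    # Unwrap DataParallel before saving so the keys match the single-GPU module.
    if isinstance(model, nn.DataParallel):
        return model.module.state_dict()
    return model.state_dict()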
def execute(gpu, exp_batch, exp_alias, dataset_name): # We set the visible cuda devices os.environ["CUDA_VISIBLE_DEVICES"] = '0' # At this point the log file with the correct naming is created. merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml')) set_type_of_process('validation', dataset_name) if not os.path.exists('_output_logs'): os.mkdir('_output_logs') sys.stdout = open(os.path.join( '_output_logs', g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"), "a", buffering=1) if monitorer.get_status(exp_batch, exp_alias + '.yaml', g_conf.PROCESS_NAME)[0] == "Finished": # TODO: print some cool summary or not ? return #Define the dataset. This structure is has the __get_item__ redefined in a way #that you can access the HDFILES positions from the root directory as a in a vector. full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], dataset_name) print(full_dataset) dataset = CoILDataset(full_dataset, transform=transforms.Compose([transforms.ToTensor() ])) # Creates the sampler, this part is responsible for managing the keys. It divides # all keys depending on the measurements and produces a set of keys for each bach. # The data loader is the multi threaded module from pytorch that release a number of # workers to get all the data. # TODO: batch size an number of workers go to some configuration file data_loader = torch.utils.data.DataLoader(dataset, batch_size=120, shuffle=False, num_workers=12, pin_memory=True) # TODO: here there is clearly a posibility to make a cool "conditioning" system. model = CoILModel(g_conf.MODEL_NAME) model.cuda() model.eval() criterion = Loss() latest = get_latest_evaluated_checkpoint() if latest is None: # When nothing was tested, get latest returns none, we fix that. latest = 0 latest = 200000 best_loss = 1000.0 best_error = 1000.0 best_loss_iter = 0 best_error_iter = 0 print(dataset.meta_data[0][0]) for k in dataset.meta_data: k[0] = str(k[0], 'utf-8') print(dataset.meta_data[0][0]) cpts = glob.glob( '/home-local/rohitrishabh/coil_20-06/_logs/eccv/experiment_1/checkpoints/*.pth' ) # while not maximun_checkpoint_reach(latest, g_conf.TEST_SCHEDULE): for ckpt in cpts: # if is_next_checkpoint_ready(g_conf.TEST_SCHEDULE): # latest = get_next_checkpoint(g_conf.TEST_SCHEDULE) latest = int(ckpt[-10:-4]) # checkpoint = torch.load(os.path.join('_logs', exp_batch, exp_alias # , 'checkpoints', str(latest) + '.pth')) checkpoint = torch.load(ckpt) checkpoint_iteration = checkpoint['iteration'] print("Validation loaded ", checkpoint_iteration) accumulated_loss = 0.0 accumulated_error = 0.0 iteration_on_checkpoint = 0 for data in data_loader: input_data, float_data = data control_position = np.where( dataset.meta_data[:, 0] == 'control')[0][0] speed_position = np.where( dataset.meta_data[:, 0] == 'speed_module')[0][0] # print (torch.squeeze(input_data['rgb']).shape) # print (control_position) # print (speed_position) # Obs : Maybe we could also check for other branches ?? output = model.forward_branch( torch.squeeze(input_data['rgb']).cuda(), float_data[:, speed_position, :].cuda(), float_data[:, control_position, :].cuda()) for i in range(input_data['rgb'].shape[0]): coil_logger.write_on_csv( checkpoint_iteration, [output[i][0], output[i][1], output[i][2]]) # TODO: Change this a functional standard using the loss functions. 
loss = torch.mean( (output - dataset.extract_targets(float_data).cuda())**2).data.tolist() mean_error = torch.mean( torch.abs( output - dataset.extract_targets(float_data).cuda())).data.tolist() accumulated_error += mean_error accumulated_loss += loss error = torch.abs(output - dataset.extract_targets(float_data).cuda()) # Log a random position position = random.randint(0, len(float_data) - 1) #print (output[position].data.tolist()) coil_logger.add_message( 'Iterating in Validation', { 'Checkpoint': latest, 'Iteration': (str(iteration_on_checkpoint * 120) + '/' + str(len(dataset))), 'MeanError': mean_error, 'Loss': loss, 'Output': output[position].data.tolist(), 'GroundTruth': dataset.extract_targets(float_data) [position].data.tolist(), 'Error': error[position].data.tolist(), 'Inputs': dataset.extract_inputs(float_data)[position].data.tolist() }, latest) iteration_on_checkpoint += 1 checkpoint_average_loss = accumulated_loss / len(dataset) checkpoint_average_error = accumulated_error / len(dataset) coil_logger.add_scalar('Loss', checkpoint_average_loss, latest) coil_logger.add_scalar('Error', checkpoint_average_error, latest) print('Loss: ', checkpoint_average_loss, "----Error: ", checkpoint_average_error) if checkpoint_average_loss < best_loss: best_loss = checkpoint_average_loss best_loss_iter = latest state = { 'state_dict': model.state_dict(), 'best_loss': best_loss, 'best_loss_iter': best_loss_iter } # TODO : maybe already summarize the best model ??? torch.save( state, os.path.join('_logs', exp_batch, exp_alias, 'best_model_l2' + '.pth')) if checkpoint_average_error < best_error: best_error = checkpoint_average_error best_error_iter = latest state = { 'state_dict': model.state_dict(), 'best_error': best_error, 'best_error_iter': best_error_iter } # TODO : maybe already summarize the best model ??? torch.save( state, os.path.join('_logs', exp_batch, exp_alias, 'best_model_l1' + '.pth')) print('Best Loss: ', best_loss, "Checkpoint", best_loss_iter) print('Best Error: ', best_error, "Checkpoint", best_error_iter) coil_logger.add_message( 'Iterating in Validation', { 'Summary': { 'Error': checkpoint_average_error, 'Loss': checkpoint_average_loss, 'BestError': best_error, 'BestLoss': best_loss, 'BestLossCheckpoint': best_loss_iter, 'BestErrorCheckpoint': best_error_iter }, 'Checkpoint': latest })
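# The validation loop above accumulates an L2 loss and an L1 error over the whole dataset for
# each checkpoint and keeps the best checkpoint under each metric. A condensed sketch of that
# bookkeeping; the per-batch metric computation is assumed to have been done already.
def track_best_checkpoints(checkpoint_metrics):
    # checkpoint_metrics: iterable of (checkpoint_iteration, avg_l2_loss, avg_l1_error)
    best_loss, best_loss_iter = float('inf'), 0
    best_error, best_error_iter = float('inf'), 0
    for iteration, avg_loss, avg_error in checkpoint_metrics:
        if avg_loss < best_loss:
            best_loss, best_loss_iter = avg_loss, iteration
        if avg_error < best_error:
            best_error, best_error_iter = avg_error, iteration
    return (best_loss, best_loss_iter), (best_error, best_error_iter)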
def execute(gpu, exp_batch, exp_alias, suppress_output=True, number_of_workers=12): """ The main training function. This functions loads the latest checkpoint for a given, exp_batch (folder) and exp_alias (experiment configuration). With this checkpoint it starts from the beginning or continue some training. Args: gpu: The GPU number exp_batch: the folder with the experiments exp_alias: the alias, experiment name suppress_output: if the output are going to be saved on a file number_of_workers: the number of threads used for data loading Returns: None """ try: # We set the visible cuda devices to select the GPU os.environ["CUDA_VISIBLE_DEVICES"] = gpu g_conf.VARIABLE_WEIGHT = {} # At this point the log file with the correct naming is created. # You merge the yaml file with the global configuration structure. merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml')) set_type_of_process('train') # Set the process into loading status. coil_logger.add_message('Loading', {'GPU': gpu}) # Put the output to a separate file if it is the case if suppress_output: if not os.path.exists('_output_logs'): os.mkdir('_output_logs') sys.stdout = open(os.path.join('_output_logs', exp_alias + '_' + g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"), "a", buffering=1) sys.stderr = open(os.path.join('_output_logs', exp_alias + '_err_'+g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"), "a", buffering=1) if coil_logger.check_finish('train'): coil_logger.add_message('Finished', {}) return # Preload option if g_conf.PRELOAD_MODEL_ALIAS is not None: checkpoint = torch.load(os.path.join('_logs', g_conf.PRELOAD_MODEL_BATCH, g_conf.PRELOAD_MODEL_ALIAS, 'checkpoints', str(g_conf.PRELOAD_MODEL_CHECKPOINT)+'.pth')) # Get the latest checkpoint to be loaded # returns none if there are no checkpoints saved for this model checkpoint_file = get_latest_saved_checkpoint() if checkpoint_file is not None: checkpoint = torch.load(os.path.join('_logs', exp_batch, exp_alias, 'checkpoints', str(get_latest_saved_checkpoint()))) iteration = checkpoint['iteration'] best_loss = checkpoint['best_loss'] best_loss_iter = checkpoint['best_loss_iter'] else: iteration = 0 best_loss = 10000.0 best_loss_iter = 0 # Define the dataset. This structure is has the __get_item__ redefined in a way # that you can access the positions from the root directory as a in a vector. full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.TRAIN_DATASET_NAME) # By instantiating the augmenter we get a callable that augment images and transform them # into tensors. augmenter = Augmenter(g_conf.AUGMENTATION) # Instantiate the class used to read a dataset. 
The coil dataset generator # can be found dataset = CoILDataset(full_dataset, transform=augmenter, preload_name=str(g_conf.NUMBER_OF_HOURS) + 'hours_' + g_conf.TRAIN_DATASET_NAME) print ("Loaded dataset") data_loader = select_balancing_strategy(dataset, iteration, number_of_workers) model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION) model.cuda() optimizer = optim.Adam(model.parameters(), lr=g_conf.LEARNING_RATE) # Set ERFnet for segmentation model_erf = ERFNet(20) model_erf = torch.nn.DataParallel(model_erf) model_erf = model_erf.cuda() print("LOAD ERFNet") def load_my_state_dict(model, state_dict): #custom function to load model when not all dict elements own_state = model.state_dict() for name, param in state_dict.items(): if name not in own_state: continue own_state[name].copy_(param) return model model_erf = load_my_state_dict(model_erf, torch.load(os.path.join('trained_models/erfnet_pretrained.pth'))) model_erf.eval() print ("ERFNet and weights LOADED successfully") if checkpoint_file is not None or g_conf.PRELOAD_MODEL_ALIAS is not None: model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) accumulated_time = checkpoint['total_time'] loss_window = coil_logger.recover_loss_window('train', iteration) else: # We accumulate iteration time and keep the average speed accumulated_time = 0 loss_window = [] print ("Before the loss") criterion = Loss(g_conf.LOSS_FUNCTION) # Loss time series window for data in data_loader: # Basically in this mode of execution, we validate every X Steps, if it goes up 3 times, # add a stop on the _logs folder that is going to be read by this process if g_conf.FINISH_ON_VALIDATION_STALE is not None and \ check_loss_validation_stopped(iteration, g_conf.FINISH_ON_VALIDATION_STALE): break """ #################################### Main optimization loop #################################### """ iteration += 1 if iteration % 1000 == 0: adjust_learning_rate_auto(optimizer, loss_window) # get the control commands from float_data, size = [120,1] capture_time = time.time() controls = data['directions'] # The output(branches) is a list of 5 branches results, each branch is with size [120,3] model.zero_grad() # print("Segmentation") # use ERFNet to convert RGB to Segmentation rgbs = data['rgb'] filenames = data['rgb_name'] # # seg one by one # seg_road = [] # seg_not_road = [] # i = 0 # for inputs in rgbs: # inputs = inputs.unsqueeze(0) # # print("inputs ",inputs.shape) # with torch.no_grad(): # outputs = model_erf(inputs) # label = outputs[0].max(0)[1].byte().cpu().data # road = (label == 0) # not_road = (label != 0) # seg_road.append(road) # seg_not_road.append(not_road) # # # print("label ",label.shape) # # label_color = Colorize()(label.unsqueeze(0)) # # filename = filenames[i] # # filenameSave = "./save_color/" + filename.split("CoILTrain/")[1] # # os.makedirs(os.path.dirname(filenameSave), exist_ok=True) # # label_save = ToPILImage()(label_color) # # label_save.save(filenameSave) # # # print (i, filenameSave) # # i += 1 # seg_road = torch.stack(seg_road) # seg_not_road = torch.stack(seg_not_road) # seg = torch.stack([seg_road,seg_not_road]).transpose(0,1).float() # # print(seg.shape) # seg batch with torch.no_grad(): outputs = model_erf(rgbs) # print("outputs.shape ",outputs.shape) labels = outputs.max(1)[1].byte().cpu().data # print("labels.shape",labels.shape) # print(np.unique(labels[0])) seg_road = (labels==0) seg_not_road = (labels!=0) seg = torch.stack((seg_road,seg_not_road),1).float() # save 1st batch's 
segmentation results if iteration == 1: for i in range(120): label = seg[i,0,:,:] label_color = Colorize()(label.unsqueeze(0)) filenameSave = "./save_color/batch_road_mask/%d.png"%(i) os.makedirs(os.path.dirname(filenameSave), exist_ok=True) label_save = ToPILImage()(label_color) label_save.save(filenameSave) label = labels[i,:,:] label_color = Colorize()(label.unsqueeze(0)) filenameSave = "./save_color/batch_road/%d.png"%(i) os.makedirs(os.path.dirname(filenameSave), exist_ok=True) label_save = ToPILImage()(label_color) label_save.save(filenameSave) branches = model(torch.squeeze(seg).cuda(), dataset.extract_inputs(data).cuda()) # branches = model(torch.squeeze(rgbs.cuda()), # dataset.extract_input(data).cuda()) loss_function_params = { 'branches': branches, 'targets': dataset.extract_targets(data).cuda(), 'controls': controls.cuda(), 'inputs': dataset.extract_inputs(data).cuda(), 'branch_weights': g_conf.BRANCH_LOSS_WEIGHT, 'variable_weights': g_conf.VARIABLE_WEIGHT } loss, _ = criterion(loss_function_params) loss.backward() optimizer.step() """ #################################### Saving the model if necessary #################################### """ if is_ready_to_save(iteration): state = { 'iteration': iteration, 'state_dict': model.state_dict(), 'best_loss': best_loss, 'total_time': accumulated_time, 'optimizer': optimizer.state_dict(), 'best_loss_iter': best_loss_iter } torch.save(state, os.path.join('_logs', exp_batch, exp_alias , 'checkpoints', str(iteration) + '.pth')) """ ################################################ Adding tensorboard logs. Making calculations for logging purposes. These logs are monitored by the printer module. ################################################# """ coil_logger.add_scalar('Loss', loss.data, iteration) coil_logger.add_image('Image', torch.squeeze(data['rgb']), iteration) if loss.data < best_loss: best_loss = loss.data.tolist() best_loss_iter = iteration # Log a random position position = random.randint(0, len(data) - 1) output = model.extract_branch(torch.stack(branches[0:4]), controls) error = torch.abs(output - dataset.extract_targets(data).cuda()) accumulated_time += time.time() - capture_time coil_logger.add_message('Iterating', {'Iteration': iteration, 'Loss': loss.data.tolist(), 'Images/s': (iteration * g_conf.BATCH_SIZE) / accumulated_time, 'BestLoss': best_loss, 'BestLossIteration': best_loss_iter, 'Output': output[position].data.tolist(), 'GroundTruth': dataset.extract_targets(data)[ position].data.tolist(), 'Error': error[position].data.tolist(), 'Inputs': dataset.extract_inputs(data)[ position].data.tolist()}, iteration) loss_window.append(loss.data.tolist()) coil_logger.write_on_error_csv('train', loss.data) print("Iteration: %d Loss: %f" % (iteration, loss.data)) coil_logger.add_message('Finished', {}) except KeyboardInterrupt: coil_logger.add_message('Error', {'Message': 'Killed By User'}) except RuntimeError as e: coil_logger.add_message('Error', {'Message': str(e)}) except: traceback.print_exc() coil_logger.add_message('Error', {'Message': 'Something Happened'})
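# The ERFNet variant above converts 20-class segmentation logits into a two-channel
# road / not-road tensor before feeding the driving network. A minimal sketch of that
# conversion, mirroring the argmax-and-stack operations used in the loop:
import torch


def logits_to_road_mask(seg_logits: torch.Tensor) -> torch.Tensor:
    # seg_logits: [batch, num_classes, H, W]; class 0 is assumed to be "road".
    labels = seg_logits.max(1)[1]               # per-pixel argmax -> [batch, H, W]
    road = (labels == 0)
    not_road = (labels != 0)
    return torch.stack((road, not_road), dim=1).float()  # [batch, 2, H, W]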
def execute(gpu, exp_batch, exp_alias, suppress_output=True, number_of_workers=12): """ The main training function. This functions loads the latest checkpoint for a given, exp_batch (folder) and exp_alias (experiment configuration). With this checkpoint it starts from the beginning or continue some training. Args: gpu: The GPU number exp_batch: the folder with the experiments exp_alias: the alias, experiment name suppress_output: if the output are going to be saved on a file number_of_workers: the number of threads used for data loading Returns: None """ try: # We set the visible cuda devices to select the GPU os.environ["CUDA_VISIBLE_DEVICES"] = gpu g_conf.VARIABLE_WEIGHT = {} # At this point the log file with the correct naming is created. # You merge the yaml file with the global configuration structure. merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml')) set_type_of_process('train') # Set the process into loading status. coil_logger.add_message('Loading', {'GPU': gpu}) # Seed RNGs torch.manual_seed(g_conf.MAGICAL_SEED) random.seed(g_conf.MAGICAL_SEED) # Put the output to a separate file if it is the case if suppress_output: if not os.path.exists('_output_logs'): os.mkdir('_output_logs') sys.stdout = open(os.path.join( '_output_logs', exp_alias + '_' + g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"), "a", buffering=1) sys.stderr = open(os.path.join( '_output_logs', exp_alias + '_err_' + g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"), "a", buffering=1) if coil_logger.check_finish('train'): coil_logger.add_message('Finished', {}) return # Preload option if g_conf.PRELOAD_MODEL_ALIAS is not None: checkpoint = torch.load( os.path.join('_logs', g_conf.PRELOAD_MODEL_BATCH, g_conf.PRELOAD_MODEL_ALIAS, 'checkpoints', str(g_conf.PRELOAD_MODEL_CHECKPOINT) + '.pth')) # Get the latest checkpoint to be loaded # returns none if there are no checkpoints saved for this model checkpoint_file = get_latest_saved_checkpoint() if checkpoint_file is not None: checkpoint = torch.load( os.path.join('_logs', exp_batch, exp_alias, 'checkpoints', str(get_latest_saved_checkpoint()))) iteration = checkpoint['iteration'] best_loss = checkpoint['best_loss'] best_loss_iter = checkpoint['best_loss_iter'] else: iteration = 0 best_loss = 10000.0 best_loss_iter = 0 # Define the dataset. # Can specify a list of training datasets or just a single training dataset if len(g_conf.TRAIN_DATASET_NAMES) == 0: train_dataset_list = [g_conf.TRAIN_DATASET_NAME] else: train_dataset_list = g_conf.TRAIN_DATASET_NAMES full_dataset = [ os.path.join(os.environ["COIL_DATASET_PATH"], dataset_name) for dataset_name in train_dataset_list ] # By instantiating the augmenter we get a callable that augment images and transform them # into tensors. augmenter = Augmenter(g_conf.AUGMENTATION) # Instantiate the class used to read a dataset. 
The coil dataset generator # can be found dataset = CoILDataset(full_dataset, transform=augmenter, preload_names=[ str(g_conf.NUMBER_OF_HOURS) + 'hours_' + dataset_name for dataset_name in train_dataset_list ], train_dataset=True) print("Loaded dataset") # Create dataloader, model, and optimizer data_loader = select_balancing_strategy(dataset, iteration, number_of_workers) model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION) model.cuda() optimizer = optim.Adam(model.parameters(), lr=g_conf.LEARNING_RATE) # If we have a previous checkpoint, load model, optimizer, and record of previous # train loss values (used for the learning rate schedule) if checkpoint_file is not None or g_conf.PRELOAD_MODEL_ALIAS is not None: model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) accumulated_time = checkpoint['total_time'] loss_window = coil_logger.recover_loss_window('train', iteration) else: # We accumulate iteration time and keep the average speed accumulated_time = 0 loss_window = [] print("Before the loss") # Define control loss function criterion = Loss(g_conf.LOSS_FUNCTION) if iteration == 0 and is_ready_to_save(iteration): state = { 'iteration': iteration, 'state_dict': model.state_dict(), 'best_loss': best_loss, 'total_time': accumulated_time, 'optimizer': optimizer.state_dict(), 'best_loss_iter': best_loss_iter } torch.save( state, os.path.join('_logs', exp_batch, exp_alias, 'checkpoints', str(iteration) + '.pth')) # Training loop for data in data_loader: # Basically in this mode of execution, we validate every X Steps, if it goes up 3 times, # add a stop on the _logs folder that is going to be read by this process if g_conf.FINISH_ON_VALIDATION_STALE is not None and \ check_loss_validation_stopped(iteration, g_conf.FINISH_ON_VALIDATION_STALE): break """ #################################### Main optimization loop #################################### """ iteration += 1 # Adjust learning rate based on training loss if iteration % 1000 == 0: adjust_learning_rate_auto(optimizer, loss_window) capture_time = time.time() model.zero_grad() controls = data['directions'] # Run model forward and get outputs # First case corresponds to training squeeze network, second case corresponds to training driving model without # mimicking losses, last case corresponds to training mimic network if "seg" in g_conf.SENSORS.keys(): branches = model(data, dataset.extract_inputs(data).cuda(), dataset.extract_intentions(data).cuda()) elif not g_conf.USE_REPRESENTATION_LOSS: branches = model(data, dataset.extract_inputs(data).cuda()) else: branches, intermediate_reps = model( data, dataset.extract_inputs(data).cuda()) # Compute control loss targets_to_use = dataset.extract_targets(data) loss_function_params = { 'branches': branches, 'targets': targets_to_use.cuda(), 'controls': controls.cuda(), 'inputs': dataset.extract_inputs(data).cuda(), 'branch_weights': g_conf.BRANCH_LOSS_WEIGHT, 'variable_weights': g_conf.VARIABLE_WEIGHT } loss, _ = criterion(loss_function_params) # Compute mimicking loss if g_conf.USE_REPRESENTATION_LOSS: expert_reps = dataset.extract_representations(data) # Seg mask mimicking loss if g_conf.USE_PERCEPTION_REP_LOSS: perception_rep_loss_elementwise = ( intermediate_reps[0] - expert_reps[0].cuda())**2 perception_rep_loss = g_conf.PERCEPTION_REP_WEIGHT * torch.sum( perception_rep_loss_elementwise) / branches[0].shape[0] else: perception_rep_loss = torch.tensor(0.).cuda() # Speed mimicking loss if g_conf.USE_SPEED_REP_LOSS: 
speed_rep_loss_elementwise = (intermediate_reps[1] - expert_reps[1].cuda())**2 speed_rep_loss = g_conf.SPEED_REP_WEIGHT * torch.sum( speed_rep_loss_elementwise) / branches[0].shape[0] else: speed_rep_loss = torch.tensor(0.).cuda() # Stop intentions mimicking loss if g_conf.USE_INTENTION_REP_LOSS: intentions_rep_loss_elementwise = ( intermediate_reps[2] - expert_reps[2].cuda())**2 intentions_rep_loss = g_conf.INTENTIONS_REP_WEIGHT * torch.sum( intentions_rep_loss_elementwise) / branches[0].shape[0] else: intentions_rep_loss = torch.tensor(0.).cuda() rep_loss = g_conf.REP_LOSS_WEIGHT * ( perception_rep_loss + speed_rep_loss + intentions_rep_loss) overall_loss = loss + rep_loss else: overall_loss = loss overall_loss.backward() optimizer.step() """ #################################### Saving the model if necessary #################################### """ if is_ready_to_save(iteration): state = { 'iteration': iteration, 'state_dict': model.state_dict(), 'best_loss': best_loss, 'total_time': accumulated_time, 'optimizer': optimizer.state_dict(), 'best_loss_iter': best_loss_iter } torch.save( state, os.path.join('_logs', exp_batch, exp_alias, 'checkpoints', str(iteration) + '.pth')) """ ################################################ Adding tensorboard logs. Making calculations for logging purposes. These logs are monitored by the printer module. ################################################# """ coil_logger.add_scalar('Loss', loss.data, iteration) if g_conf.USE_REPRESENTATION_LOSS: coil_logger.add_scalar('Perception Rep Loss', perception_rep_loss.data, iteration) coil_logger.add_scalar('Speed Rep Loss', speed_rep_loss.data, iteration) coil_logger.add_scalar('Intentions Rep Loss', intentions_rep_loss.data, iteration) coil_logger.add_scalar('Overall Rep Loss', rep_loss.data, iteration) coil_logger.add_scalar('Total Loss', overall_loss.data, iteration) if 'rgb' in data: coil_logger.add_image('Image', torch.squeeze(data['rgb']), iteration) if overall_loss.data < best_loss: best_loss = overall_loss.data.tolist() best_loss_iter = iteration # Log a random position position = random.randint(0, len(data) - 1) output = model.extract_branch(torch.stack(branches[0:4]), controls) error = torch.abs(output - targets_to_use.cuda()) accumulated_time += time.time() - capture_time # Log to terminal and log file if g_conf.USE_REPRESENTATION_LOSS: coil_logger.add_message( 'Iterating', { 'Iteration': iteration, 'Loss': overall_loss.data.tolist(), 'Control Loss': loss.data.tolist(), 'Rep Loss': rep_loss.data.tolist(), 'Images/s': (iteration * g_conf.BATCH_SIZE) / accumulated_time, 'BestLoss': best_loss, 'BestLossIteration': best_loss_iter, 'Output': output[position].data.tolist(), 'GroundTruth': targets_to_use[position].data.tolist(), 'Error': error[position].data.tolist(), 'Inputs': dataset.extract_inputs(data)[position].data.tolist() }, iteration) else: coil_logger.add_message( 'Iterating', { 'Iteration': iteration, 'Loss': loss.data.tolist(), 'Images/s': (iteration * g_conf.BATCH_SIZE) / accumulated_time, 'BestLoss': best_loss, 'BestLossIteration': best_loss_iter, 'Output': output[position].data.tolist(), 'GroundTruth': targets_to_use[position].data.tolist(), 'Error': error[position].data.tolist(), 'Inputs': dataset.extract_inputs(data)[position].data.tolist() }, iteration) # Save training loss history (useful for restoring training runs since learning rate is adjusted # based on training loss) loss_window.append(overall_loss.data.tolist()) coil_logger.write_on_error_csv('train', overall_loss.data) 
print("Iteration: %d Loss: %f" % (iteration, overall_loss.data)) coil_logger.add_message('Finished', {}) except KeyboardInterrupt: coil_logger.add_message('Error', {'Message': 'Killed By User'}) except RuntimeError as e: coil_logger.add_message('Error', {'Message': str(e)}) except: traceback.print_exc() coil_logger.add_message('Error', {'Message': 'Something Happened'})
def execute(gpu, exp_batch, exp_alias):
    # We set the visible cuda devices to select the GPU.
    try:
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu

        # At this point the log file with the correct naming is created.
        merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
        set_type_of_process('train')

        coil_logger.add_message('Loading', {'GPU': gpu})

        if not os.path.exists('_output_logs'):
            os.mkdir('_output_logs')

        sys.stdout = open(os.path.join('_output_logs',
                                       g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"),
                          "a", buffering=1)

        if monitorer.get_status(exp_batch, exp_alias + '.yaml',
                                g_conf.PROCESS_NAME)[0] == "Finished":
            # TODO: print some cool summary or not?
            return

        # Define the dataset. This structure has __getitem__ redefined so that the
        # HDF5 file positions can be accessed from the root directory as in a vector.
        full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.TRAIN_DATASET_NAME)

        # augmenter_cpu = iag.AugmenterCPU(g_conf.AUGMENTATION_SUITE_CPU)

        dataset = CoILDataset(full_dataset,
                              transform=transforms.Compose([transforms.ToTensor()]))

        # Create the sampler. This part is responsible for managing the keys: it divides
        # all keys depending on the measurements and produces a set of keys for each batch.
        sampler = BatchSequenceSampler(
            splitter.control_steer_split(dataset.measurements, dataset.meta_data),
            g_conf.BATCH_SIZE, g_conf.NUMBER_IMAGES_SEQUENCE, g_conf.SEQUENCE_STRIDE)

        # The data loader is the multi-threaded module from PyTorch that launches
        # a number of workers to fetch all the data.
        # TODO: batch size and number of workers should go to some configuration file
        data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler,
                                                  shuffle=False, num_workers=12,
                                                  pin_memory=False)

        # By instantiating the augmenter we get a callable that augments images and
        # transforms them into tensors.
        st = lambda aug: iag.Sometimes(aug, 0.4)
        oc = lambda aug: iag.Sometimes(aug, 0.3)
        rl = lambda aug: iag.Sometimes(aug, 0.09)
        augmenter = iag.Augmenter([iag.ToGPU()] + [
            rl(iag.GaussianBlur((0, 1.5))),  # blur images with a sigma between 0 and 1.5
            rl(iag.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05),
                                         per_channel=0.5)),  # add gaussian noise to images
            oc(iag.Dropout((0.0, 0.10), per_channel=0.5)),  # randomly remove up to 10% of the pixels
            oc(iag.CoarseDropout((0.0, 0.10), size_percent=(0.08, 0.2),
                                 per_channel=0.5)),  # randomly remove up to 10% of the pixels in patches
            oc(iag.Add((-40, 40), per_channel=0.5)),  # change brightness (by -40 to +40 of the original value)
            st(iag.Multiply((0.10, 2), per_channel=0.2)),  # change brightness (10% to 200% of the original value)
            rl(iag.ContrastNormalization((0.5, 1.5), per_channel=0.5)),  # improve or worsen the contrast
            rl(iag.Grayscale((0.0, 1))),  # convert images to grayscale
        ]  # do all of the above in random order
        )
        # augmenter = iag.Augmenter(g_conf.AUGMENTATION_SUITE)

        # TODO: here there is clearly a possibility to make a cool "conditioning" system.

        model = CoILModel(g_conf.MODEL_NAME)
        model.cuda()
        print(model)

        criterion = Loss()

        # TODO: DATASET SIZE SEEMS WEIRD
        optimizer = optim.Adam(model.parameters(), lr=0.0002)

        checkpoint_file = get_latest_saved_checkpoint()
        if checkpoint_file is not None:
            checkpoint = torch.load(os.path.join('_logs', exp_batch, exp_alias,
                                                 'checkpoints', str(checkpoint_file)))
            iteration = checkpoint['iteration']
            accumulated_time = checkpoint['total_time']
            best_loss = checkpoint['best_loss']
            best_loss_iter = checkpoint['best_loss_iter']
        else:
            iteration = 0
            best_loss = 10000.0
            accumulated_time = 0  # We accumulate iteration time and keep the average speed
            best_loss_iter = 0

        # TODO: The checkpoint will continue, so it should erase everything up to the iteration

        best_loss_save = 10000.0
        best_loss_save_iter = 0
        curr_loss_save = 0.0

        print(dataset.meta_data)
        print(model)

        capture_time = time.time()
        model.train()
        for data in data_loader:

            input_data, float_data = data

            # TODO: ADD ITERATION SCHEDULE
            input_rgb_data = augmenter(0, input_data['rgb'])
            augment_for_controls = 1
            adjustlr = 1

            if augment_for_controls:  # and self._config.targets_names[j] == "Steer":
                # Correct the recorded steering label for the lateral camera angle.
                camera_angle = float_data[:, 26, :]
                camera_angle = camera_angle.cuda()  # self._config.variable_names.index('Angle')
                print("Camera angle", camera_angle[0])
                steer = float_data[:, 0, :]
                # print("Original", steer[0])
                steer = steer.cuda()
                speed = float_data[:, 10, :]
                speed = speed.cuda()

                time_use = 1.0
                car_length = 3.0
                extra_factor = 2.5
                threshold = 1.0

                pos = camera_angle > 0.0
                pos = pos.type(torch.FloatTensor).cuda()
                neg = camera_angle <= 0.0
                neg = neg.type(torch.FloatTensor).cuda()

                rad_camera_angle = math.pi * torch.abs(camera_angle) / 180.0
                val = extra_factor * torch.atan((rad_camera_angle * car_length) /
                                                (time_use * speed + 0.05)) / math.pi
                # print(val)
                steer -= pos * torch.min(val, torch.tensor([0.6]).cuda())
                steer += neg * torch.min(val, torch.tensor([0.6]).cuda())

                print("val", val[0])
                print("speed", speed[0])

                steer = steer.cpu()
                float_data[:, 0, :] = steer
                float_data[:, 0, :][float_data[:, 0, :] > 1.0] = 1.0
                float_data[:, 0, :][float_data[:, 0, :] < -1.0] = -1.0

            # coil_logger.add_images(input_rgb_data)

            # Get the control commands from float_data, size = [120, 1]
            controls = float_data[:, dataset.controls_position(), :]
            # print(" CONTROLS ", controls.shape)

            # The output (branches) is a list of 5 branch results; each branch has size [120, 3]
            model.zero_grad()
            # print('INPUTS', dataset.extract_inputs(float_data).shape)
            branches = model(input_rgb_data, dataset.extract_inputs(float_data).cuda())
            # print("len ", len(branches))

            # targets = torch.cat([steer_gt, gas_gt, brake_gt], 1)
            # print("Extracted targets ", dataset.extract_targets(float_data).shape[0])
            loss = criterion.MSELoss(branches,
                                     dataset.extract_targets(float_data).cuda(),
                                     controls.cuda(),
                                     dataset.extract_inputs(float_data).cuda())

            # TODO: All these logging things could go out to clean up the main
            if loss.data < best_loss:
                best_loss = loss.data.tolist()
                best_loss_iter = iteration

            curr_loss_save += loss.data

            # Log a random position
            position = random.randint(0, len(float_data) - 1)

            output = model.extract_branch(torch.stack(branches[0:4]), controls)
            error = torch.abs(output - dataset.extract_targets(float_data).cuda())

            # TODO: For now we are computing the error for just the correct branch; it could be multi-branch.
            coil_logger.add_scalar('Loss', loss.data, iteration)

            loss.backward()
            optimizer.step()

            accumulated_time += time.time() - capture_time
            capture_time = time.time()

            # TODO: Get only the float_data that are actually generating output
            # TODO: iteration is repeating, and that is dumb
            coil_logger.add_message('Iterating',
                                    {'Iteration': iteration,
                                     'Loss': loss.data.tolist(),
                                     'Images/s': (iteration * g_conf.BATCH_SIZE) / accumulated_time,
                                     'BestLoss': best_loss,
                                     'BestLossIteration': best_loss_iter,
                                     'BestLossSave': best_loss_save,
                                     'Output': output[position].data.tolist(),
                                     'GroundTruth': dataset.extract_targets(float_data)[position].data.tolist(),
                                     'Error': error[position].data.tolist(),
                                     'Inputs': dataset.extract_inputs(float_data)[position].data.tolist()},
                                    iteration)

            # TODO: For now we are computing the error for just the correct branch; it could be multi-branch.
            # TODO: save also the optimizer state dictionary
            if is_ready_to_save(iteration):
                state = {
                    'iteration': iteration,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'total_time': accumulated_time,
                    'best_loss_iter': best_loss_iter
                }
                # TODO: maybe already summarize the best model ???
                torch.save(state, os.path.join('_logs', exp_batch, exp_alias,
                                               'checkpoints', str(iteration) + '.pth'))

            print("before best save")
            if iteration % 5 == 0 and iteration > 4:
                curr_loss_save /= 5000.0
                if curr_loss_save < best_loss_save:
                    best_loss_save = curr_loss_save
                    curr_loss_save = 0
                    state = {
                        'iteration': iteration,
                        'state_dict': model.state_dict(),
                        'best_loss': best_loss_save,
                        'total_time': accumulated_time,
                        'best_loss_iter': best_loss_save_iter
                    }
                    # TODO: maybe already summarize the best model ???
                    torch.save(state, os.path.join('_logs', exp_batch, exp_alias,
                                                   'best_loss_save' + '.pth'))
            print("after best save")

            if iteration == best_loss_iter:
                state = {
                    'iteration': iteration,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'total_time': accumulated_time,
                    'best_loss_iter': best_loss_iter
                }
                # TODO: maybe already summarize the best model ???
                torch.save(state, os.path.join('_logs', exp_batch, exp_alias,
                                               'best_loss' + '.pth'))

            iteration += 1

            if adjustlr and iteration % 1000 == 0:
                adjust_learning_rate(optimizer, iteration)

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})
    except:
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something Happened'})
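# A minimal, standalone sketch of the lateral-camera steering correction applied in the
# training loop above, so the geometry can be sanity-checked outside the loop. The
# constants (car_length, time_use, extra_factor, the 0.6 clamp) mirror the values used
# above; the camera angle, speed, and steer samples below are purely illustrative and are
# not values taken from the dataset.
def _camera_steer_correction_sketch():
    import math
    import torch

    camera_angle = torch.tensor([-30.0, 0.0, 30.0])   # degrees; hypothetical side cameras
    speed = torch.tensor([5.0, 5.0, 5.0])              # hypothetical forward speed
    steer = torch.tensor([0.0, 0.0, 0.0])              # hypothetical recorded steering

    time_use, car_length, extra_factor = 1.0, 3.0, 2.5

    pos = (camera_angle > 0.0).float()
    neg = (camera_angle <= 0.0).float()

    rad_camera_angle = math.pi * torch.abs(camera_angle) / 180.0
    val = extra_factor * torch.atan((rad_camera_angle * car_length) /
                                    (time_use * speed + 0.05)) / math.pi

    # Shift the label toward the lane center and clamp to the valid steering range.
    corrected = steer - pos * torch.min(val, torch.tensor([0.6])) \
                      + neg * torch.min(val, torch.tensor([0.6]))
    return corrected.clamp(-1.0, 1.0)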
def execute(gpu, exp_batch, exp_alias):
    # We set the visible cuda devices to select the GPU.
    try:
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu

        # At this point the log file with the correct naming is created.
        merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
        set_type_of_process('train')

        coil_logger.add_message('Loading', {'GPU': gpu})

        if not os.path.exists('_output_logs'):
            os.mkdir('_output_logs')

        sys.stdout = open(os.path.join('_output_logs',
                                       g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"),
                          "a", buffering=1)

        if monitorer.get_status(exp_batch, exp_alias + '.yaml',
                                g_conf.PROCESS_NAME)[0] == "Finished":
            # TODO: print some cool summary or not?
            return

        # Define the dataset. This structure has __getitem__ redefined so that the
        # HDF5 file positions can be accessed from the root directory as in a vector.
        full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.TRAIN_DATASET_NAME)

        # augmenter_cpu = iag.AugmenterCPU(g_conf.AUGMENTATION_SUITE_CPU)

        dataset = CoILDataset(full_dataset,
                              transform=transforms.Compose([transforms.ToTensor()]))

        # Create the sampler. This part is responsible for managing the keys: it divides
        # all keys depending on the measurements and produces a set of keys for each batch.
        sampler = BatchSequenceSampler(
            splitter.control_steer_split(dataset.measurements, dataset.meta_data),
            g_conf.BATCH_SIZE, g_conf.NUMBER_IMAGES_SEQUENCE, g_conf.SEQUENCE_STRIDE)

        # The data loader is the multi-threaded module from PyTorch that launches
        # a number of workers to fetch all the data.
        # TODO: batch size and number of workers should go to some configuration file
        data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler,
                                                  shuffle=False, num_workers=12,
                                                  pin_memory=True)

        # By instantiating the augmenter we get a callable that augments images and
        # transforms them into tensors.
        augmenter = iag.Augmenter(g_conf.AUGMENTATION_SUITE)

        # TODO: here there is clearly a possibility to make a cool "conditioning" system.

        model = CoILModel(g_conf.MODEL_NAME)
        model.cuda()
        print(model)

        criterion = Loss()

        # TODO: DATASET SIZE SEEMS WEIRD
        optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)

        checkpoint_file = get_latest_saved_checkpoint()
        if checkpoint_file is not None:
            checkpoint = torch.load(os.path.join('_logs', exp_batch, exp_alias,
                                                 'checkpoints', str(checkpoint_file)))
            iteration = checkpoint['iteration']
            accumulated_time = checkpoint['total_time']
            best_loss = checkpoint['best_loss']
            best_loss_iter = checkpoint['best_loss_iter']
        else:
            iteration = 0
            best_loss = 10000.0
            accumulated_time = 0  # We accumulate iteration time and keep the average speed
            best_loss_iter = 0

        # TODO: The checkpoint will continue, so it should erase everything up to the iteration

        print(dataset.meta_data)
        print(model)

        capture_time = time.time()
        for data in data_loader:

            input_data, float_data = data

            # TODO: ADD ITERATION SCHEDULE
            input_rgb_data = augmenter(0, input_data['rgb'])

            # coil_logger.add_images(input_rgb_data)

            # Get the control commands from float_data, size = [120, 1]
            controls = float_data[:, dataset.controls_position(), :]
            print(" CONTROLS ", controls.shape)

            # The output (branches) is a list of 5 branch results; each branch has size [120, 3]
            model.zero_grad()
            print('INPUTS', dataset.extract_inputs(float_data).shape)
            branches = model(input_rgb_data, dataset.extract_inputs(float_data).cuda())
            # print("len ", len(branches))

            # targets = torch.cat([steer_gt, gas_gt, brake_gt], 1)
            print("Extracted targets ", dataset.extract_targets(float_data).shape[0])
            loss = criterion.MSELoss(branches,
                                     dataset.extract_targets(float_data).cuda(),
                                     controls.cuda(),
                                     dataset.extract_inputs(float_data).cuda())

            # TODO: All these logging things could go out to clean up the main
            if loss.data < best_loss:
                best_loss = loss.data.tolist()
                best_loss_iter = iteration

            # Log a random position
            position = random.randint(0, len(float_data) - 1)

            output = model.extract_branch(torch.stack(branches[0:4]), controls)
            error = torch.abs(output - dataset.extract_targets(float_data).cuda())

            # TODO: For now we are computing the error for just the correct branch; it could be multi-branch.
            coil_logger.add_scalar('Loss', loss.data, iteration)

            loss.backward()
            optimizer.step()

            accumulated_time += time.time() - capture_time
            capture_time = time.time()

            # TODO: Get only the float_data that are actually generating output
            # TODO: iteration is repeating, and that is dumb
            coil_logger.add_message('Iterating',
                                    {'Iteration': iteration,
                                     'Loss': loss.data.tolist(),
                                     'Images/s': (iteration * g_conf.BATCH_SIZE) / accumulated_time,
                                     'BestLoss': best_loss,
                                     'BestLossIteration': best_loss_iter,
                                     'Output': output[position].data.tolist(),
                                     'GroundTruth': dataset.extract_targets(float_data)[position].data.tolist(),
                                     'Error': error[position].data.tolist(),
                                     'Inputs': dataset.extract_inputs(float_data)[position].data.tolist()},
                                    iteration)

            # TODO: For now we are computing the error for just the correct branch; it could be multi-branch.
            # TODO: save also the optimizer state dictionary
            if is_ready_to_save(iteration):
                state = {
                    'iteration': iteration,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'total_time': accumulated_time,
                    'best_loss_iter': best_loss_iter
                }
                # TODO: maybe already summarize the best model ???
                torch.save(state, os.path.join('_logs', exp_batch, exp_alias,
                                               'checkpoints', str(iteration) + '.pth'))

            iteration += 1

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})
    except:
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something Happened'})
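# adjust_learning_rate() is called by the training loops in this file but is not defined
# here. The helper below is only a sketch of one plausible implementation, assuming a
# simple step-decay schedule; the real function and its constants may differ. The name,
# the initial rate, and the decay settings are illustrative placeholders, not values
# taken from the experiment configuration.
def adjust_learning_rate_sketch(optimizer, iteration,
                                initial_lr=0.0002, decay_interval=50000, decay_factor=0.5):
    """Step-decay the learning rate of every parameter group (illustrative only)."""
    new_lr = initial_lr * (decay_factor ** (iteration // decay_interval))
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr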
def execute(gpu, exp_batch, exp_alias, suppress_output=True):
    # We set the visible cuda devices to select the GPU.
    # TODO: probable race condition, the train has to be started before.
    try:
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu

        # At this point the log file with the correct naming is created.
        merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
        set_type_of_process('train')

        coil_logger.add_message('Loading', {'GPU': gpu})

        if not os.path.exists('_output_logs'):
            os.mkdir('_output_logs')

        # Put the output to a separate file
        if suppress_output:
            sys.stdout = open(os.path.join('_output_logs',
                                           g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"),
                              "a", buffering=1)

        checkpoint_file = get_latest_saved_checkpoint()
        if checkpoint_file is not None:
            checkpoint = torch.load(os.path.join('_logs', exp_batch, exp_alias,
                                                 'checkpoints', str(checkpoint_file)))
            iteration = checkpoint['iteration']
            best_loss = checkpoint['best_loss']
            best_loss_iter = checkpoint['best_loss_iter']
        else:
            iteration = 0
            best_loss = 10000.0
            best_loss_iter = 0

        # TODO: The checkpoint will continue, so it should erase everything up to
        #       the iteration on tensorboard

        # Define the dataset. This structure has __getitem__ redefined so that the
        # HDF5 file positions can be accessed from the root directory as in a vector.
        full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.TRAIN_DATASET_NAME)

        # augmenter_cpu = iag.AugmenterCPU(g_conf.AUGMENTATION_SUITE_CPU)

        # By instantiating the augmenter we get a callable that augments images and
        # transforms them into tensors.
        augmenter = Augmenter(g_conf.AUGMENTATION)

        dataset = CoILDataset(full_dataset, transform=augmenter)

        data_loader = select_balancing_strategy(dataset, iteration)

        model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION)
        model.cuda()

        if checkpoint_file is not None:
            model.load_state_dict(checkpoint['state_dict'])

        print(model)

        criterion = Loss(g_conf.LOSS_FUNCTION)

        optimizer = optim.Adam(model.parameters(), lr=g_conf.LEARNING_RATE)

        print(dataset.meta_data)
        print(model)

        if checkpoint_file is not None:
            accumulated_time = checkpoint['total_time']
        else:
            accumulated_time = 0  # We accumulate iteration time and keep the average speed

        # TODO: test experiment continuation. Is the data sampler going to continue where it stopped?

        capture_time = time.time()
        for data in data_loader:

            input_data, float_data = data

            # Get the control commands from float_data, size = [120, 1]
            controls = float_data[:, dataset.controls_position(), :]

            # The output (branches) is a list of 5 branch results; each branch has size [120, 3]
            model.zero_grad()
            branches = model(torch.squeeze(input_data['rgb'].cuda()),
                             dataset.extract_inputs(float_data).cuda())

            loss = criterion(branches,
                             dataset.extract_targets(float_data).cuda(),
                             controls.cuda(),
                             dataset.extract_inputs(float_data).cuda(),
                             branch_weights=g_conf.BRANCH_LOSS_WEIGHT,
                             variable_weights=g_conf.VARIABLE_WEIGHT)

            # TODO: All these logging things could go out to clean up the main
            if loss.data < best_loss:
                best_loss = loss.data.tolist()
                best_loss_iter = iteration

            # Log a random position
            position = random.randint(0, len(float_data) - 1)

            output = model.extract_branch(torch.stack(branches[0:4]), controls)
            error = torch.abs(output - dataset.extract_targets(float_data).cuda())

            # TODO: For now we are computing the error for just the correct branch; it could be multi-branch.
            coil_logger.add_scalar('Loss', loss.data, iteration)
            coil_logger.add_image('Image', torch.squeeze(input_data['rgb']), iteration)

            loss.backward()
            optimizer.step()

            accumulated_time += time.time() - capture_time
            capture_time = time.time()

            # TODO: Get only the float_data that are actually generating output
            # TODO: iteration is repeating, and that is dumb
            coil_logger.add_message('Iterating',
                                    {'Iteration': iteration,
                                     'Loss': loss.data.tolist(),
                                     'Images/s': (iteration * g_conf.BATCH_SIZE) / accumulated_time,
                                     'BestLoss': best_loss,
                                     'BestLossIteration': best_loss_iter,
                                     'Output': output[position].data.tolist(),
                                     'GroundTruth': dataset.extract_targets(float_data)[position].data.tolist(),
                                     'Error': error[position].data.tolist(),
                                     'Inputs': dataset.extract_inputs(float_data)[position].data.tolist()},
                                    iteration)

            # TODO: For now we are computing the error for just the correct branch; it could be multi-branch.
            # TODO: save also the optimizer state dictionary
            if is_ready_to_save(iteration):
                state = {
                    'iteration': iteration,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'total_time': accumulated_time,
                    'best_loss_iter': best_loss_iter
                }
                # TODO: maybe already summarize the best model ???
                torch.save(state, os.path.join('_logs', exp_batch, exp_alias,
                                               'checkpoints', str(iteration) + '.pth'))

            iteration += 1
            print(iteration)
            if iteration % 1000 == 0:
                adjust_learning_rate(optimizer, iteration)

            del data

        coil_logger.add_message('Finished', {})

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})
    except:
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something Happened'})
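# A minimal usage sketch for the execute() functions above, assuming one training process
# per GPU driven from a small launcher. It targets the final execute() signature above
# (the one accepting suppress_output). The experiment folder ('sample') and aliases
# ('experiment_1', 'experiment_2') are hypothetical placeholders, not names shipped with
# the repository; the matching YAML files are assumed to exist under configs/.
if __name__ == '__main__':
    import multiprocessing

    jobs = []
    for gpu, alias in [('0', 'experiment_1'), ('1', 'experiment_2')]:
        p = multiprocessing.Process(target=execute,
                                    args=(gpu, 'sample', alias),
                                    kwargs={'suppress_output': True})
        p.start()
        jobs.append(p)

    for p in jobs:
        p.join()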