def execute(gpu, exp_batch, exp_alias, dataset_name):
    # We set the visible cuda devices
    os.environ["CUDA_VISIBLE_DEVICES"] = '0'

    # At this point the log file with the correct naming is created.
    merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
    set_type_of_process('validation', dataset_name)

    if not os.path.exists('_output_logs'):
        os.mkdir('_output_logs')

    sys.stdout = open(os.path.join('_output_logs',
                                   g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"),
                      "a", buffering=1)

    if monitorer.get_status(exp_batch, exp_alias + '.yaml',
                            g_conf.PROCESS_NAME)[0] == "Finished":
        # TODO: print some cool summary or not?
        return

    # Define the dataset. This structure has __getitem__ redefined so that the
    # HDF5 file positions can be accessed from the root directory as a vector.
    full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], dataset_name)
    print(full_dataset)
    dataset = CoILDataset(full_dataset,
                          transform=transforms.Compose([transforms.ToTensor()]))

    # Creates the sampler. This part is responsible for managing the keys: it divides
    # all keys depending on the measurements and produces a set of keys for each batch.
    # The data loader is the multi-threaded module from pytorch that releases a number
    # of workers to fetch all the data.
    # TODO: batch size and number of workers should go to some configuration file
    data_loader = torch.utils.data.DataLoader(dataset, batch_size=120,
                                              shuffle=False, num_workers=12,
                                              pin_memory=True)

    # TODO: here there is clearly a possibility to make a cool "conditioning" system.
    model = CoILModel(g_conf.MODEL_NAME)
    model.cuda()
    model.eval()

    criterion = Loss()

    latest = get_latest_evaluated_checkpoint()
    if latest is None:  # When nothing was tested, get_latest returns None; we fix that.
        latest = 0
    # Hardcoded starting checkpoint, kept from the original script.
    latest = 200000

    best_loss = 1000.0
    best_error = 1000.0
    best_loss_iter = 0
    best_error_iter = 0

    print(dataset.meta_data[0][0])
    for k in dataset.meta_data:
        k[0] = str(k[0], 'utf-8')
    print(dataset.meta_data[0][0])

    cpts = glob.glob(
        '/home-local/rohitrishabh/coil_20-06/_logs/eccv/experiment_1/checkpoints/*.pth')

    # while not maximun_checkpoint_reach(latest, g_conf.TEST_SCHEDULE):
    for ckpt in cpts:
        # if is_next_checkpoint_ready(g_conf.TEST_SCHEDULE):
        #     latest = get_next_checkpoint(g_conf.TEST_SCHEDULE)
        latest = int(ckpt[-10:-4])

        # checkpoint = torch.load(os.path.join('_logs', exp_batch, exp_alias,
        #                                      'checkpoints', str(latest) + '.pth'))
        checkpoint = torch.load(ckpt)
        checkpoint_iteration = checkpoint['iteration']
        print("Validation loaded ", checkpoint_iteration)
        # Load the checkpoint weights into the model before evaluating.
        model.load_state_dict(checkpoint['state_dict'])

        accumulated_loss = 0.0
        accumulated_error = 0.0
        iteration_on_checkpoint = 0
        for data in data_loader:
            input_data, float_data = data
            control_position = np.where(dataset.meta_data[:, 0] == 'control')[0][0]
            speed_position = np.where(dataset.meta_data[:, 0] == 'speed_module')[0][0]
            # print(torch.squeeze(input_data['rgb']).shape)
            # print(control_position)
            # print(speed_position)

            # Obs: maybe we could also check for other branches?
            output = model.forward_branch(torch.squeeze(input_data['rgb']).cuda(),
                                          float_data[:, speed_position, :].cuda(),
                                          float_data[:, control_position, :].cuda())

            for i in range(input_data['rgb'].shape[0]):
                coil_logger.write_on_csv(checkpoint_iteration,
                                         [output[i][0], output[i][1], output[i][2]])

            # TODO: Change this to a functional standard using the loss functions.
            loss = torch.mean(
                (output - dataset.extract_targets(float_data).cuda()) ** 2).data.tolist()
            mean_error = torch.mean(
                torch.abs(output - dataset.extract_targets(float_data).cuda())).data.tolist()
            accumulated_error += mean_error
            accumulated_loss += loss
            error = torch.abs(output - dataset.extract_targets(float_data).cuda())

            # Log a random position
            position = random.randint(0, len(float_data) - 1)
            # print(output[position].data.tolist())
            coil_logger.add_message('Iterating in Validation',
                                    {'Checkpoint': latest,
                                     'Iteration': (str(iteration_on_checkpoint * 120) + '/'
                                                   + str(len(dataset))),
                                     'MeanError': mean_error,
                                     'Loss': loss,
                                     'Output': output[position].data.tolist(),
                                     'GroundTruth': dataset.extract_targets(
                                         float_data)[position].data.tolist(),
                                     'Error': error[position].data.tolist(),
                                     'Inputs': dataset.extract_inputs(
                                         float_data)[position].data.tolist()},
                                    latest)
            iteration_on_checkpoint += 1

        # accumulated_loss / accumulated_error are sums of per-batch means;
        # here they are normalized by the dataset length.
        checkpoint_average_loss = accumulated_loss / len(dataset)
        checkpoint_average_error = accumulated_error / len(dataset)
        coil_logger.add_scalar('Loss', checkpoint_average_loss, latest)
        coil_logger.add_scalar('Error', checkpoint_average_error, latest)
        print('Loss: ', checkpoint_average_loss, "----Error: ", checkpoint_average_error)

        if checkpoint_average_loss < best_loss:
            best_loss = checkpoint_average_loss
            best_loss_iter = latest
            state = {'state_dict': model.state_dict(),
                     'best_loss': best_loss,
                     'best_loss_iter': best_loss_iter}
            # TODO: maybe already summarize the best model?
            torch.save(state, os.path.join('_logs', exp_batch, exp_alias,
                                           'best_model_l2' + '.pth'))

        if checkpoint_average_error < best_error:
            best_error = checkpoint_average_error
            best_error_iter = latest
            state = {'state_dict': model.state_dict(),
                     'best_error': best_error,
                     'best_error_iter': best_error_iter}
            # TODO: maybe already summarize the best model?
            torch.save(state, os.path.join('_logs', exp_batch, exp_alias,
                                           'best_model_l1' + '.pth'))

        print('Best Loss: ', best_loss, "Checkpoint", best_loss_iter)
        print('Best Error: ', best_error, "Checkpoint", best_error_iter)

        coil_logger.add_message('Iterating in Validation',
                                {'Summary': {'Error': checkpoint_average_error,
                                             'Loss': checkpoint_average_loss,
                                             'BestError': best_error,
                                             'BestLoss': best_loss,
                                             'BestLossCheckpoint': best_loss_iter,
                                             'BestErrorCheckpoint': best_error_iter},
                                 'Checkpoint': latest})
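
# Hedged illustration (not from the original sources): how the per-batch loss and error
# above roll up into per-checkpoint averages. Only torch is assumed; every name below is
# hypothetical. Summing per-batch means and dividing by the number of batches gives the
# mean of batch means, whereas the script above divides the same sum by len(dataset).
import torch


def checkpoint_averages(batches):
    """batches: list of (output, target) tensor pairs; returns (avg MSE, avg L1 error)."""
    acc_mse, acc_mae = 0.0, 0.0
    for output, target in batches:
        acc_mse += torch.mean((output - target) ** 2).item()
        acc_mae += torch.mean(torch.abs(output - target)).item()
    return acc_mse / len(batches), acc_mae / len(batches)

# Example with two fake batches of (steer, throttle, brake) predictions:
# checkpoint_averages([(torch.rand(4, 3), torch.rand(4, 3)) for _ in range(2)])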
def execute(gpu, exp_batch, exp_alias, dataset_name, suppress_output):
    latest = None
    try:
        # We set the visible cuda devices
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu

        # At this point the log file with the correct naming is created.
        merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
        # The validation dataset is always fully loaded, so we fix a very high number of hours.
        g_conf.NUMBER_OF_HOURS = 10000
        set_type_of_process('validation', dataset_name)

        if not os.path.exists('_output_logs'):
            os.mkdir('_output_logs')

        if suppress_output:
            sys.stdout = open(os.path.join('_output_logs',
                                           exp_alias + '_' + g_conf.PROCESS_NAME + '_'
                                           + str(os.getpid()) + ".out"),
                              "a", buffering=1)
            sys.stderr = open(os.path.join('_output_logs',
                                           exp_alias + '_err_' + g_conf.PROCESS_NAME + '_'
                                           + str(os.getpid()) + ".out"),
                              "a", buffering=1)

        # Define the dataset. This structure has __getitem__ redefined so that the
        # HDF5 file positions can be accessed from the root directory as a vector.
        full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], dataset_name)
        augmenter = Augmenter(None)

        # Definition of the dataset to be used. The preload name is just the validation data name.
        dataset = CoILDataset(full_dataset, transform=augmenter, preload_name=dataset_name)

        # Creates the sampler. This part is responsible for managing the keys: it divides
        # all keys depending on the measurements and produces a set of keys for each batch.
        # The data loader is the multi-threaded module from pytorch that releases a number
        # of workers to fetch all the data.
        data_loader = torch.utils.data.DataLoader(dataset, batch_size=g_conf.BATCH_SIZE,
                                                  shuffle=False,
                                                  num_workers=g_conf.NUMBER_OF_LOADING_WORKERS,
                                                  pin_memory=True)

        model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION)

        # The window used to keep track of the validation error
        l1_window = []
        latest = get_latest_evaluated_checkpoint()
        if latest is not None:  # If there was a previous evaluation, recover the loss window.
            l1_window = coil_logger.recover_loss_window(dataset_name, None)

        model.cuda()

        best_mse = 1000
        best_error = 1000
        best_mse_iter = 0
        best_error_iter = 0

        while not maximun_checkpoint_reach(latest, g_conf.TEST_SCHEDULE):
            if is_next_checkpoint_ready(g_conf.TEST_SCHEDULE):
                latest = get_next_checkpoint(g_conf.TEST_SCHEDULE)

                checkpoint = torch.load(os.path.join('_logs', exp_batch, exp_alias,
                                                     'checkpoints', str(latest) + '.pth'))
                checkpoint_iteration = checkpoint['iteration']
                print("Validation loaded ", checkpoint_iteration)

                model.load_state_dict(checkpoint['state_dict'])
                model.eval()

                accumulated_mse = 0
                accumulated_error = 0
                iteration_on_checkpoint = 0
                for data in data_loader:
                    # Compute the forward pass on a batch from the validation dataset
                    controls = data['directions']
                    output = model.forward_branch(torch.squeeze(data['rgb']).cuda(),
                                                  dataset.extract_inputs(data).cuda(),
                                                  controls)

                    # It could be either waypoints or direct control
                    if 'waypoint1_angle' in g_conf.TARGETS:
                        write_waypoints_output(checkpoint_iteration, output)
                    else:
                        write_regular_output(checkpoint_iteration, output)

                    mse = torch.mean(
                        (output - dataset.extract_targets(data).cuda()) ** 2).data.tolist()
                    mean_error = torch.mean(
                        torch.abs(output - dataset.extract_targets(data).cuda())).data.tolist()
                    accumulated_error += mean_error
                    accumulated_mse += mse
                    error = torch.abs(output - dataset.extract_targets(data).cuda())

                    # Log a random position
                    position = random.randint(0, len(output.data.tolist()) - 1)

                    coil_logger.add_message('Iterating',
                                            {'Checkpoint': latest,
                                             'Iteration': (str(iteration_on_checkpoint * 120) + '/'
                                                           + str(len(dataset))),
                                             'MeanError': mean_error,
                                             'MSE': mse,
                                             'Output': output[position].data.tolist(),
                                             'GroundTruth': dataset.extract_targets(
                                                 data)[position].data.tolist(),
                                             'Error': error[position].data.tolist(),
                                             'Inputs': dataset.extract_inputs(
                                                 data)[position].data.tolist()},
                                            latest)
                    iteration_on_checkpoint += 1
                    print("Iteration %d on Checkpoint %d : Error %f"
                          % (iteration_on_checkpoint, checkpoint_iteration, mean_error))

                """
                    ########
                    Finish a round of validation, write results, wait for the next
                    ########
                """

                checkpoint_average_mse = accumulated_mse / (len(data_loader))
                checkpoint_average_error = accumulated_error / (len(data_loader))
                coil_logger.add_scalar('Loss', checkpoint_average_mse, latest, True)
                coil_logger.add_scalar('Error', checkpoint_average_error, latest, True)

                if checkpoint_average_mse < best_mse:
                    best_mse = checkpoint_average_mse
                    best_mse_iter = latest

                if checkpoint_average_error < best_error:
                    best_error = checkpoint_average_error
                    best_error_iter = latest

                coil_logger.add_message('Iterating',
                                        {'Summary': {'Error': checkpoint_average_error,
                                                     'Loss': checkpoint_average_mse,
                                                     'BestError': best_error,
                                                     'BestMSE': best_mse,
                                                     'BestMSECheckpoint': best_mse_iter,
                                                     'BestErrorCheckpoint': best_error_iter},
                                         'Checkpoint': latest},
                                        latest)

                l1_window.append(checkpoint_average_error)
                coil_logger.write_on_error_csv(dataset_name, checkpoint_average_error)

                # If we are finishing when validation stalls, check the current error window.
                if g_conf.FINISH_ON_VALIDATION_STALE is not None:
                    if dlib.count_steps_without_decrease(l1_window) > 3 and \
                            dlib.count_steps_without_decrease_robust(l1_window) > 3:
                        coil_logger.write_stop(dataset_name, latest)
                        break

            else:
                latest = get_latest_evaluated_checkpoint()
                time.sleep(1)

                coil_logger.add_message('Loading', {'Message': 'Waiting Checkpoint'})
                print("Waiting for the next Validation")

        coil_logger.add_message('Finished', {})

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})
        # We erase the output that was left unfinished when the process was stopped.
        if latest is not None:
            coil_logger.erase_csv(latest)

    except RuntimeError as e:
        if latest is not None:
            coil_logger.erase_csv(latest)
        coil_logger.add_message('Error', {'Message': str(e)})

    except:
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something Happened'})
        # We erase the output that was left unfinished when the process was stopped.
        if latest is not None:
            coil_logger.erase_csv(latest)
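
# Hedged sketch of a helper used above: write_regular_output is not defined in this file.
# Based on the coil_logger.write_on_csv call in the older validation loop, it presumably
# logs the three predicted controls of every sample in the batch; treat this as an
# illustration only, not the project's actual implementation.
def write_regular_output(iteration, output):
    for i in range(len(output)):
        coil_logger.write_on_csv(iteration, [output[i][0],
                                             output[i][1],
                                             output[i][2]])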
def execute(gpu, exp_batch, exp_alias):
    # We set the visible cuda devices
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu

    # At this point the log file with the correct naming is created.
    merge_with_yaml(os.path.join(exp_batch, exp_alias + '.yaml'))
    set_type_of_process('validation')

    sys.stdout = open(str(os.getpid()) + ".out", "a", buffering=1)

    if monitorer.get_status(exp_batch, exp_alias, g_conf.PROCESS_NAME)[0] == "Finished":
        # TODO: print some cool summary or not?
        return

    # Define the dataset. This structure has __getitem__ redefined so that the
    # HDF5 file positions can be accessed from the root directory as a vector.
    full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.DATASET_NAME)
    dataset = CoILDataset(full_dataset,
                          transform=transforms.Compose([transforms.ToTensor()]))

    # Creates the sampler. This part is responsible for managing the keys: it divides
    # all keys depending on the measurements and produces a set of keys for each batch.
    # The data loader is the multi-threaded module from pytorch that releases a number
    # of workers to fetch all the data.
    # TODO: batch size and number of workers should go to some configuration file
    data_loader = torch.utils.data.DataLoader(dataset, batch_size=120,
                                              shuffle=False, num_workers=12,
                                              pin_memory=True)

    # TODO: here there is clearly a possibility to make a cool "conditioning" system.
    model = CoILModel(g_conf.MODEL_NAME)
    model.cuda()

    # TODO: The checkpoint will continue, so should the logs restart or continue where they were?
    latest = get_latest_evaluated_checkpoint()
    if latest is None:  # When nothing was tested, get_latest returns None; we fix that.
        latest = 0

    print(dataset.meta_data)

    while not maximun_checkpoint_reach(latest, g_conf.TEST_SCHEDULE):
        if is_next_checkpoint_ready(g_conf.TEST_SCHEDULE):
            latest = get_next_checkpoint(g_conf.TEST_SCHEDULE)

            checkpoint = torch.load(os.path.join('_logs', exp_batch, exp_alias,
                                                 'checkpoints', str(latest) + '.pth'))
            checkpoint_iteration = checkpoint['iteration']
            print("Validation loaded ", checkpoint_iteration)

            for data in data_loader:
                input_data, labels = data
                control_position = np.where(dataset.meta_data[:, 0] == 'control')[0][0]
                speed_position = np.where(dataset.meta_data[:, 0] == 'speed_module')[0][0]
                print(torch.squeeze(input_data['rgb']).shape)
                print(control_position)
                print(speed_position)

                # Obs: maybe we could also check for other branches?
                output = model.forward_branch(torch.squeeze(input_data['rgb']).cuda(),
                                              labels[:, speed_position, :].cuda(),
                                              labels[:, control_position, :].cuda())

                # TODO: clean up this squeeze and dimension handling
                for i in range(input_data['rgb'].shape[0]):
                    coil_logger.write_on_csv(checkpoint_iteration,
                                             [output[i][0], output[i][1], output[i][2]])

                # loss = criterion(output, labels)
                # loss.backward()
                # optimizer.step()
                # shutil.copyfile(filename, 'model_best.pth.tar')

        else:
            time.sleep(1)
            print("Waiting for the next Validation")
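
# Hedged usage sketch (not from the original sources): the validation executers above are
# assumed to be launched by a coordinator in their own process, one per GPU. The argument
# values below are placeholders for a GPU id, an experiment batch folder and an alias.
if __name__ == '__main__':
    import multiprocessing

    p = multiprocessing.Process(target=execute, args=('0', 'eccv', 'experiment_1'))
    p.start()
    p.join()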
def execute(gpu, exp_batch, exp_alias, drive_conditions, params):
    """
        Main loop function. Executes the driving benchmarks for the scheduled checkpoints.
    Args:
        gpu: the gpu id, exported as CUDA_VISIBLE_DEVICES
        exp_batch: the experiment batch (configuration folder) name
        exp_alias: the experiment alias (yaml configuration) name
        drive_conditions: string '<ExperimentSuite>_<Town>' selecting the suite and town
        params: dict of driving parameters (e.g. 'suppress_output')

    Returns:

    """
    try:
        print("Running ", __file__, " On GPU ", gpu, "of experiment name ", exp_alias)
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu

        if not os.path.exists('_output_logs'):
            os.mkdir('_output_logs')

        merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))

        exp_set_name, town_name = drive_conditions.split('_')

        experiment_suite_module = __import__('drive.suites.'
                                             + camelcase_to_snakecase(exp_set_name) + '_suite',
                                             fromlist=[exp_set_name])
        experiment_suite_module = getattr(experiment_suite_module, exp_set_name)
        experiment_set = experiment_suite_module()

        set_type_of_process('drive', drive_conditions)

        if params['suppress_output']:
            sys.stdout = open(os.path.join('_output_logs',
                                           g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"),
                              "a", buffering=1)
            sys.stderr = open(os.path.join('_output_logs',
                                           exp_alias + '_err_' + g_conf.PROCESS_NAME + '_'
                                           + str(os.getpid()) + ".out"),
                              "a", buffering=1)

        coil_logger.add_message('Loading',
                                {'Poses': experiment_set.build_experiments()[0].poses})

        if g_conf.USE_ORACLE:
            control_filename = 'control_output_auto'
        else:
            control_filename = 'control_output'

        """
            #####
            Preparing the output files that will contain the driving summary
            #####
        """
        experiment_list = experiment_set.build_experiments()
        # Get all the uniquely named tasks
        task_list = unique([experiment.task_name for experiment in experiment_list])
        # Now actually run the driving benchmark
        latest = get_latest_evaluated_checkpoint(control_filename + '_' + task_list[0])

        if latest is None:  # When nothing was tested, get_latest returns None; we fix that.
            latest = 0
            # The used tasks are hardcoded; this needs to be improved.
            file_base = os.path.join('_logs', exp_batch, exp_alias,
                                     g_conf.PROCESS_NAME + '_csv', control_filename)

            for i in range(len(task_list)):
                # Write the header of the summary file used for the conclusion
                # while the checkpoint is not there.
                write_header_control_summary(file_base, task_list[i])

        """
            ######
            Run a single driving benchmark specified by the checkpoint where validation is stale
            ######
        """
        if g_conf.FINISH_ON_VALIDATION_STALE is not None:

            while validation_stale_point(g_conf.FINISH_ON_VALIDATION_STALE) is None:
                time.sleep(0.1)

            validation_state_iteration = validation_stale_point(g_conf.FINISH_ON_VALIDATION_STALE)
            driving_benchmark(validation_state_iteration, gpu, town_name, experiment_set,
                              exp_batch, exp_alias, params, control_filename, task_list)

        else:
            """
                #####
                Main Loop: run a benchmark for each checkpoint specified in the "Test Configuration"
                #####
            """
            while not maximun_checkpoint_reach(latest, g_conf.TEST_SCHEDULE):
                # Get the correct checkpoint.
                # We check it for one task name; all of them are ready at the same time.
                if is_next_checkpoint_ready(g_conf.TEST_SCHEDULE,
                                            control_filename + '_' + task_list[0]):

                    latest = get_next_checkpoint(g_conf.TEST_SCHEDULE,
                                                 control_filename + '_' + task_list[0])

                    driving_benchmark(latest, gpu, town_name, experiment_set,
                                      exp_batch, exp_alias, params, control_filename, task_list)

                else:
                    time.sleep(0.1)

        coil_logger.add_message('Finished', {})

    except KeyboardInterrupt:
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Killed By User'})

    except:
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something happened'})
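
# Hedged sketch of the name conversion assumed by the dynamic import above. The real
# camelcase_to_snakecase helper lives elsewhere in the repository; this regex version only
# illustrates the expected mapping, e.g. 'ECCVTrainingSuite' -> 'eccv_training_suite'.
import re


def camelcase_to_snakecase(name):
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()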
def execute(gpu, exp_batch, exp_alias, drive_conditions, memory_use=0.2, host='127.0.0.1',
            suppress_output=True, no_screen=False):
    try:
        print("Running ", __file__, " On GPU ", gpu, "of experiment name ", exp_alias)
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu

        if not os.path.exists('_output_logs'):
            os.mkdir('_output_logs')

        merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))

        print("drive cond", drive_conditions)
        exp_set_name, town_name = drive_conditions.split('_')

        if g_conf.USE_ORACLE:
            control_filename = 'control_output_auto.csv'
        else:
            control_filename = 'control_output.csv'

        if exp_set_name == 'ECCVTrainingSuite':
            experiment_set = ECCVTrainingSuite()
            set_type_of_process('drive', drive_conditions)
        elif exp_set_name == 'ECCVGeneralizationSuite':
            experiment_set = ECCVGeneralizationSuite()
            set_type_of_process('drive', drive_conditions)
        elif exp_set_name == 'TestT1':
            experiment_set = TestT1()
            set_type_of_process('drive', drive_conditions)
        elif exp_set_name == 'TestT2':
            experiment_set = TestT2()
            set_type_of_process('drive', drive_conditions)
        else:
            raise ValueError("Exp set name does not correspond to a known experiment suite")

        if suppress_output:
            sys.stdout = open(os.path.join('_output_logs',
                                           g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"),
                              "a", buffering=1)
            # sys.stderr = open(os.path.join('_output_logs',
            #                                'err_' + g_conf.PROCESS_NAME + '_'
            #                                + str(os.getpid()) + ".out"),
            #                   "a", buffering=1)

        carla_process, port = start_carla_simulator(gpu, town_name, no_screen)

        coil_logger.add_message('Loading',
                                {'Poses': experiment_set.build_experiments()[0].poses})
        coil_logger.add_message('Loading', {'CARLAClient': host + ':' + str(port)})

        # Now actually run the driving benchmark
        latest = get_latest_evaluated_checkpoint()
        if latest is None:  # When nothing was tested, get_latest returns None; we fix that.
            latest = 0

            csv_outfile = open(os.path.join('_logs', exp_batch, exp_alias,
                                            g_conf.PROCESS_NAME + '_csv', control_filename),
                               'w')
            csv_outfile.write("%s,%s,%s,%s,%s,%s,%s,%s\n"
                              % ('step', 'episodes_completion', 'intersection_offroad',
                                 'intersection_otherlane', 'collision_pedestrians',
                                 'collision_vehicles', 'episodes_fully_completed',
                                 'driven_kilometers'))
            csv_outfile.close()

        # Write the header of the summary file used for the conclusion
        # while the checkpoint is not there.
        while not maximun_checkpoint_reach(latest, g_conf.TEST_SCHEDULE):
            try:
                # Get the correct checkpoint
                if is_next_checkpoint_ready(g_conf.TEST_SCHEDULE):

                    latest = get_next_checkpoint(g_conf.TEST_SCHEDULE)
                    checkpoint = torch.load(os.path.join('_logs', exp_batch, exp_alias,
                                                         'checkpoints', str(latest) + '.pth'))

                    coil_agent = CoILAgent(checkpoint, town_name)
                    coil_logger.add_message('Iterating', {"Checkpoint": latest}, latest)

                    run_driving_benchmark(coil_agent, experiment_set, town_name,
                                          exp_batch + '_' + exp_alias + '_' + str(latest)
                                          + '_drive_' + control_filename[:-4],
                                          True, host, port)

                    path = exp_batch + '_' + exp_alias + '_' + str(latest) \
                           + '_' + g_conf.PROCESS_NAME.split('_')[0] + '_' + control_filename[:-4] \
                           + '_' + g_conf.PROCESS_NAME.split('_')[1] \
                           + '_' + g_conf.PROCESS_NAME.split('_')[2]

                    print(path)
                    print("Finished")
                    benchmark_json_path = os.path.join(get_latest_path(path), 'metrics.json')
                    with open(benchmark_json_path, 'r') as f:
                        benchmark_dict = json.loads(f.read())

                    averaged_dict = compute_average_std([benchmark_dict],
                                                        experiment_set.weathers,
                                                        len(experiment_set.build_experiments()))
                    print(averaged_dict)

                    csv_outfile = open(os.path.join('_logs', exp_batch, exp_alias,
                                                    g_conf.PROCESS_NAME + '_csv',
                                                    control_filename),
                                       'a')
                    csv_outfile.write("%d,%f,%f,%f,%f,%f,%f,%f\n"
                                      % (latest,
                                         averaged_dict['episodes_completion'],
                                         averaged_dict['intersection_offroad'],
                                         averaged_dict['intersection_otherlane'],
                                         averaged_dict['collision_pedestrians'],
                                         averaged_dict['collision_vehicles'],
                                         averaged_dict['episodes_fully_completed'],
                                         averaged_dict['driven_kilometers']))
                    csv_outfile.close()

                    # TODO: When you add the message you need to check if the experiment continues properly
                    # TODO: WRITE AN EFFICIENT PARAMETRIZED OUTPUT SUMMARY FOR TEST.

                else:
                    time.sleep(0.1)

            except TCPConnectionError as error:
                logging.error(error)
                time.sleep(1)
                carla_process.kill()
                coil_logger.add_message('Error', {'Message': 'TCP serious Error'})
                exit(1)
            except KeyboardInterrupt:
                carla_process.kill()
                coil_logger.add_message('Error', {'Message': 'Killed By User'})
                exit(1)
            except:
                traceback.print_exc()
                carla_process.kill()
                coil_logger.add_message('Error', {'Message': 'Something Happened'})
                exit(1)

        coil_logger.add_message('Finished', {})

    except KeyboardInterrupt:
        traceback.print_exc()
        carla_process.kill()
        coil_logger.add_message('Error', {'Message': 'Killed By User'})

    except:
        traceback.print_exc()
        carla_process.kill()
        coil_logger.add_message('Error', {'Message': 'Something happened'})

    carla_process.kill()
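
# Hedged post-processing sketch (not part of the original script): the summary CSV written
# above can be read back with the standard library to, for example, pick the checkpoint with
# the highest fraction of fully completed episodes. The function and path are placeholders.
import csv


def best_checkpoint(csv_path):
    with open(csv_path) as f:
        rows = list(csv.DictReader(f))
    best = max(rows, key=lambda r: float(r['episodes_fully_completed']))
    return int(best['step']), float(best['episodes_fully_completed'])

# Example:
# best_checkpoint('_logs/<exp_batch>/<exp_alias>/<process_name>_csv/control_output.csv')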