def start_carla_simulator(gpu, town_name, docker):
    """
    Launch a CARLA server through ``docker run`` in detached mode.

    A free port is obtained from ``find_free_port`` and the port range
    ``port .. port + 2`` is published from the container to the host.

    Args:
        gpu: the gpu number, exported to the container as
            NVIDIA_VISIBLE_DEVICES
        town_name: the town name, appended to '/Game/Maps/'
        docker: the docker image name

    Returns:
        A tuple (sp, port, out): the Popen object of the ``docker run``
        call, the world port, and the stdout captured from that call.
    """
    port = find_free_port()
    published_ports = str(port) + '-' + str(port + 2)
    server_flags = ['-windowed', '-benchmark', '-fps=10',
                    '-world-port=' + str(port)]

    docker_command = [
        'docker', 'run', '--rm', '-d',
        '-p', published_ports + ':' + published_ports,
        '--runtime=nvidia',
        '-e', 'NVIDIA_VISIBLE_DEVICES=' + str(gpu),
        docker, '/bin/bash', 'CarlaUE4.sh', '/Game/Maps/' + town_name,
    ]
    docker_command.extend(server_flags)

    sp = subprocess.Popen(docker_command, shell=False,
                          stdout=subprocess.PIPE)
    # Blocks until ``docker run`` itself returns; stderr is not piped, so the
    # second element of the result is always None.
    out, _ = sp.communicate()
    print("Going to communicate")

    # The logged string is the binary path followed directly by the flags,
    # with no separators in between.
    coil_logger.add_message(
        'Loading',
        {'CARLA': '/CarlaUE4/Binaries/Linux/CarlaUE4' + ''.join(server_flags)})
    return sp, port, out
def partition_keys_by_percentiles(steerings, keys, percentiles):
    """
    Cut ``keys`` into consecutive chunks at cumulative percentile positions.

    ``percentiles`` holds the fraction of positions for each bin. A split
    happens at the first position ``i`` with
    ``i >= cumulative_fraction * len(steerings) - 1``. The key at the split
    position opens the next chunk, and keys after the last split point are
    not part of the returned list.

    Args:
        steerings: sequence whose length defines the positions; the value at
            each split position is logged
        keys: sliceable sequence that is cut into chunks
        percentiles: non-empty sequence of per-bin fractions

    Returns:
        The list of key chunks, in order.

    Raises:
        RuntimeError: if a chunk compares equal to the empty list.
    """
    # Running sum of the fractions: the upper bound of every bin.
    upper_bounds = [percentiles[0]]
    for fraction in percentiles[1:]:
        upper_bounds.append(upper_bounds[-1] + fraction)

    total = len(steerings)
    chunks = []
    chunk_start = 0
    bin_index = 0
    for position in range(total):
        if position >= upper_bounds[bin_index] * total - 1:
            # We split
            chunk = keys[chunk_start:position]
            chunks.append(chunk)
            if chunk == []:
                raise RuntimeError("Reach into an empty bin.")
            chunk_start = position
            bin_index += 1
            # Log the steering value at the split and the number of chunks
            # produced so far.
            coil_logger.add_message(
                'Loading',
                {'SplitPoints': [steerings[position], len(chunks)]})
    return chunks
def control_steer_split(measurements, meta_data):
    """
    Split the sample keys first by control label and then by steering.

    Args:
        measurements: array indexed as [row, sample]; row 0 is read as the
            steering values
        meta_data: array whose first column holds the row names; the row
            named 'control' selects the labels

    Returns:
        A list with, for every label group returned by ``label_split``, the
        result of ``float_split`` on that group's keys.
    """
    steerings = measurements[0, :]
    # TODO: read meta data and turn into a coool dictionary ?
    control_rows = np.where(meta_data[:, 0] == 'control')
    labels = measurements[control_rows, :]
    print(np.unique(labels))

    keys = range(0, len(steerings))
    print(labels)
    print(keys)

    keys_per_label = label_split(labels[0][0], keys, g_conf.LABELS_DIVISION)

    # Another level of splitting
    splitted_steer_labels = [
        float_split(steerings, label_keys, g_conf.STEERING_DIVISION)
        for label_keys in keys_per_label
    ]

    coil_logger.add_message('Loading', {'KeysDivision': splitted_steer_labels})
    return splitted_steer_labels
def set_type_of_process(process_type, param=None):
    """
    Set the type of the current process (train, validation or drive), create
    its log and the output folders it needs, then freeze the configuration.

    NOTE: AFTER CALLING THIS FUNCTION, THE CONFIGURATION CLOSES

    Args:
        process_type: 'train', 'validation' or 'drive'. Any other value
            leaves PROCESS_NAME untouched.
        param: suffix joined to the process name for 'validation' and
            'drive'; it must be a string for those two types. For 'drive'
            the text after its last underscore is stored as CITY_NAME.

    Raises:
        RuntimeError: if PROCESS_NAME is still "default", i.e. no experiment
            file was merged before this call.
    """
    if _g_conf.PROCESS_NAME == "default":
        raise RuntimeError(
            " You should merge with some exp file before setting the type")

    if process_type == 'train':
        _g_conf.PROCESS_NAME = process_type
    elif process_type == "validation":
        _g_conf.PROCESS_NAME = process_type + '_' + param
    elif process_type == "drive":
        # FOR drive param is city name.
        _g_conf.CITY_NAME = param.split('_')[-1]
        _g_conf.PROCESS_NAME = process_type + '_' + param

    create_log(_g_conf.EXPERIMENT_BATCH_NAME, _g_conf.EXPERIMENT_NAME,
               _g_conf.PROCESS_NAME, _g_conf.LOG_SCALAR_WRITING_FREQUENCY,
               _g_conf.LOG_IMAGE_WRITING_FREQUENCY)

    experiment_dir = os.path.join('_logs', _g_conf.EXPERIMENT_BATCH_NAME,
                                  _g_conf.EXPERIMENT_NAME)

    # makedirs(exist_ok=True) instead of "exists() then mkdir()": another
    # process of the same experiment creating the folder between the check
    # and the mkdir would otherwise crash this one with FileExistsError.
    # Missing parent folders are created as well.
    if process_type == "train":
        os.makedirs(os.path.join(experiment_dir, 'checkpoints'),
                    exist_ok=True)

    if process_type in ("validation", "drive"):
        os.makedirs(
            os.path.join(experiment_dir, _g_conf.PROCESS_NAME + '_csv'),
            exist_ok=True)

    # TODO: check if there is some integrity.

    add_message(
        'Loading', {
            'ProcessName': _g_conf.EXPERIMENT_GENERATED_NAME,
            'FullConfiguration': _g_conf.TRAIN_DATASET_NAME + 'dict'
        })

    _g_conf.immutable(True)
def set_type_of_process(process_type, param=None):
    """
    Set the type of the current process (train, validation or drive), create
    its log and the output folders it needs, check the configuration and
    then freeze it.

    NOTE: AFTER CALLING THIS FUNCTION, THE CONFIGURATION CLOSES

    Args:
        process_type: 'train', 'validation' or 'drive'. Any other value
            leaves PROCESS_NAME untouched.
        param: for 'validation', the suffix joined to the process name; for
            'drive', the city name. It must be a string for those two types.

    Raises:
        RuntimeError: if PROCESS_NAME is still "default", i.e. no experiment
            file was merged before this call.
    """
    if _g_conf.PROCESS_NAME == "default":
        raise RuntimeError(
            " You should merge with some exp file before setting the type")

    if process_type == 'train':
        _g_conf.PROCESS_NAME = process_type
    elif process_type == "validation":
        _g_conf.PROCESS_NAME = process_type + '_' + param
    elif process_type == "drive":
        # FOR drive param is city name.
        _g_conf.CITY_NAME = param
        _g_conf.PROCESS_NAME = (process_type + '_' + _g_conf.CITY_NAME + '_'
                                + _g_conf.EXPERIMENTAL_SUITE_NAME)

    create_log(_g_conf.EXPERIMENT_BATCH_NAME, _g_conf.EXPERIMENT_NAME,
               _g_conf.PROCESS_NAME)

    # makedirs(exist_ok=True) instead of "exists() then mkdir()": another
    # process of the same experiment creating the folder between the check
    # and the mkdir would otherwise crash this one with FileExistsError.
    # Missing parent folders are created as well.
    if process_type == "train":
        # NOTE(review): the checkpoint root is a hard-coded absolute path
        # while the csv folder below is relative to '_logs' -- confirm
        # whether this should be configurable.
        os.makedirs(
            os.path.join('/datatmp/Experiments/rohitgan/_logs',
                         _g_conf.EXPERIMENT_BATCH_NAME,
                         _g_conf.EXPERIMENT_NAME, 'checkpoints'),
            exist_ok=True)

    if process_type in ("validation", "drive"):
        os.makedirs(
            os.path.join('_logs', _g_conf.EXPERIMENT_BATCH_NAME,
                         _g_conf.EXPERIMENT_NAME,
                         _g_conf.PROCESS_NAME + '_csv'),
            exist_ok=True)

    # We assure ourselves that the configuration file added does not kill things
    _check_integrity()

    add_message('Loading', {'ProcessName': generate_name(),
                            'FullConfiguration': generate_param_dict()})

    _g_conf.immutable(True)
def control_steer_split(float_data, meta_data):
    """
    Split the sample keys first by control label and then by steering.

    The last ``g_conf.NUMBER_IMAGES_SEQUENCE`` positions are left out of the
    keys.

    Args:
        float_data: array indexed as [row, sample]
        meta_data: array whose first column holds the row names; the rows
            named 'steer' and 'control' are read

    Returns:
        A list with, for every label group returned by ``label_split``, the
        result of ``float_split`` on that group's keys.
    """
    def _row_named(name):
        # Same lookup for every named row of float_data.
        return float_data[np.where(meta_data[:, 0] == name), :][0][0]

    # TODO: WHY EVERY WHERE MAKE THIS TO BE USED ??
    steerings = _row_named('steer')
    print("steer shape", steerings.shape)

    # TODO: read meta data and turn into a coool dictionary ?
    # TODO ELIMINATE ALL NAMES CALLED LABEL OR MEASUREMENTS , MORE GENERIC
    # FLOAT DATA AND SENSOR DATA IS BETTER
    labels = _row_named('control')
    print("labels shape ", labels.shape)

    keys = range(0, len(steerings) - g_conf.NUMBER_IMAGES_SEQUENCE)
    keys_per_label = label_split(labels, keys, g_conf.LABELS_DIVISION)

    # Another level of splitting
    splitted_steer_labels = [
        float_split(steerings, label_keys, g_conf.STEERING_DIVISION)
        for label_keys in keys_per_label
    ]

    coil_logger.add_message('Loading', {'KeysDivision': splitted_steer_labels})
    return splitted_steer_labels
def __call__(self, tensor, output_tensor):
    """
    Log a fixed 'Loss' message and return ``tensor`` unchanged.

    The logged iteration number and loss values are constants; neither
    ``tensor`` nor ``output_tensor`` takes part in them.
    """
    fixed_loss_values = [0.232, 0.232, 0.332, 0.2322, 0.232, 0.232, 0.232]
    message = {"Iteration": 765, "LossValue": fixed_loss_values}
    coil_logger.add_message('Loss', message)
    return tensor
def run_step(self, measurements, sensor_data, directions, target):
    """
    Run a step on the benchmark simulation.

    Args:
        measurements: All the float measurements from CARLA (just the
            forward speed is used here)
        sensor_data: All the sensor data used on this benchmark
        directions: The directions, high level commands
        target: Final objective. Passed to the control agent and to the
            oracle when those are used.

    Returns:
        Controls for the vehicle on the CARLA simulator.
    """
    # Forward speed normalised by SPEED_FACTOR, as a 1x1 tensor.
    forward_speed = measurements.player_measurements.forward_speed
    norm_speed = torch.cuda.FloatTensor(
        [forward_speed / g_conf.SPEED_FACTOR]).unsqueeze(0)
    directions_tensor = torch.cuda.LongTensor([directions])

    # With a "seg" sensor the network takes one more positional input: the
    # intentions read from the state of the control agent.
    extra_inputs = []
    if "seg" in g_conf.SENSORS.keys():
        _, state = self.control_agent.run_step(measurements, [], [], target)
        intention_values = [float(state[name]) for name in g_conf.INTENTIONS]
        extra_inputs.append(
            torch.cuda.FloatTensor(intention_values).unsqueeze(0))

    model_outputs = self._model.forward_branch(
        self._process_sensors(sensor_data), norm_speed, directions_tensor,
        *extra_inputs, benchmark=True)

    steer, throttle, brake = self._process_model_outputs(model_outputs[0])

    # CARLA 0.9 ships VehicleControl in the ``carla`` module, imported lazily.
    if self._carla_version == '0.9':
        import carla
        control = carla.VehicleControl()
    else:
        control = VehicleControl()

    control.steer = float(steer)
    control.throttle = float(throttle)
    control.brake = float(brake)

    # There is the possibility to replace some of the predictions with
    # oracle predictions.
    if g_conf.USE_ORACLE:
        _, control.throttle, control.brake = self._get_oracle_prediction(
            measurements, target)

    # Only the very first step is reported to the logger.
    if self.first_iter:
        checkpoint_iteration = self.checkpoint['iteration']
        coil_logger.add_message(
            'Iterating',
            {"Checkpoint": checkpoint_iteration, 'Agent': str(steer)},
            checkpoint_iteration)
    self.first_iter = False

    return control
def run_step(self, measurements, sensor_data, directions, target):
    """
    Run a step on the benchmark simulation.

    The processed sensors go through ``self.model_erf``; its per-pixel
    argmax is turned into a two-channel mask (class 0 / any other class)
    and that mask, not the image, is fed to the driving model.

    Args:
        measurements: All the float measurements from CARLA (just the
            forward speed is used here)
        sensor_data: All the sensor data used on this benchmark
        directions: The directions, high level commands
        target: Final objective. Only used for the oracle prediction.

    Returns:
        Controls for the vehicle on the CARLA simulator.
    """
    # Forward speed normalised by SPEED_FACTOR, as a 1x1 tensor.
    forward_speed = measurements.player_measurements.forward_speed
    norm_speed = torch.cuda.FloatTensor(
        [forward_speed / g_conf.SPEED_FACTOR]).unsqueeze(0)
    directions_tensor = torch.cuda.LongTensor([directions])

    # Compute the forward pass processing the sensors got from CARLA.
    rgbs = self._process_sensors(sensor_data)
    # NOTE(review): assumes only the segmentation forward pass runs under
    # no_grad -- confirm against the original layout.
    with torch.no_grad():
        outputs = self.model_erf(rgbs)

    labels = outputs.max(1)[1].byte().cpu().data
    is_road = labels == 0
    is_not_road = labels != 0
    seg = torch.stack((is_road, is_not_road), 1).float()

    model_outputs = self._model.forward_branch(seg.cuda(), norm_speed,
                                               directions_tensor)

    steer, throttle, brake = self._process_model_outputs(model_outputs[0])

    # CARLA 0.9 ships VehicleControl in the ``carla`` module, imported lazily.
    if self._carla_version == '0.9':
        import carla
        control = carla.VehicleControl()
    else:
        control = VehicleControl()

    control.steer = float(steer)
    control.throttle = float(throttle)
    control.brake = float(brake)

    # There is the possibility to replace some of the predictions with
    # oracle predictions.
    if g_conf.USE_ORACLE:
        _, control.throttle, control.brake = self._get_oracle_prediction(
            measurements, target)

    # Only the very first step is reported to the logger.
    if self.first_iter:
        checkpoint_iteration = self.checkpoint['iteration']
        coil_logger.add_message(
            'Iterating',
            {"Checkpoint": checkpoint_iteration, 'Agent': str(steer)},
            checkpoint_iteration)
    self.first_iter = False

    return control
def load_network(self, checkpoint):
    """
    Log that a checkpoint was loaded for a given model definition.

    Only a 'Loading' message is emitted here; nothing is read from disk.

    Args:
        checkpoint: The checkpoint that the user wants to add; its string
            form goes into the log message.
    """
    # NOTE(review): the payload is a one-element set, not a string or a
    # dict -- confirm the logger can serialise it.
    loaded = {"Loaded checkpoint: " + str(checkpoint)}
    coil_logger.add_message('Loading', {"Model": loaded})
def start_carla_simulator(gpu, town_name, no_screen, mode='VGL'):
    """
    Start the CARLA binary found under the CARLA_PATH environment variable.

    stdout and stderr of the server are written to files in '_output_logs'
    named after the process name and pid of the caller.

    Args:
        gpu: the gpu number. In 'VGL' mode it selects the vglrun display
            ':7.<gpu>'; in 'SDL' mode it is exported as SDL_HINT_CUDA_DEVICE.
        town_name: the town name, appended to '/Game/Maps/'
        no_screen: in 'SDL' mode, sets SDL_VIDEODRIVER to 'offscreen'.
            Ignored in 'VGL' mode.
        mode: 'VGL' (default, the previously hard-coded value) or 'SDL'.

    Returns:
        A tuple (sp, port): the Popen object of the server and its world
        port.

    Raises:
        ValueError: if ``mode`` is neither 'SDL' nor 'VGL'.
        KeyError: if CARLA_PATH is not set.
    """
    # Set the outfiles for the process
    log_suffix = g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"
    carla_out_file = os.path.join('_output_logs', 'CARLA_' + log_suffix)
    carla_out_file_err = os.path.join('_output_logs',
                                      'CARLA_err_' + log_suffix)

    port = find_free_port()
    carla_path = os.environ['CARLA_PATH']
    carla_binary = carla_path + '/CarlaUE4/Binaries/Linux/CarlaUE4'
    command = [
        carla_binary, '/Game/Maps/' + town_name, '-windowed', '-benchmark',
        '-fps=10', '-world-port=' + str(port)
    ]

    # The environment of this process is modified on purpose: the child
    # inherits it.
    if mode == 'SDL':
        if no_screen:
            print(" EXECUTING NO SCREEN! ")
            os.environ['SDL_VIDEODRIVER'] = 'offscreen'
        os.environ['SDL_HINT_CUDA_DEVICE'] = str(gpu)
    elif mode == 'VGL':
        os.environ['DISPLAY'] = ":5"
        command = ['vglrun', '-d', ':7.' + str(gpu)] + command
    else:
        raise ValueError("Invalid Mode !")

    # The child gets its own copies of the descriptors when it is spawned,
    # so the parent can close its handles right away instead of leaking
    # two open files per started simulator.
    with open(carla_out_file, 'w') as out_handle, \
            open(carla_out_file_err, 'w') as err_handle:
        sp = subprocess.Popen(command, shell=False, stdout=out_handle,
                              stderr=err_handle)

    # The logged string is the binary path followed directly by the flags,
    # with no separators in between.
    coil_logger.add_message(
        'Loading', {
            'CARLA': carla_binary + '-windowed' + '-benchmark' + '-fps=10'
                     + '-world-port=' + str(port)
        })

    return sp, port
def test_global_logger_train(self):
    """
    Drive the global logger through a train process: two 'Loading'
    messages, then ten rounds of 'Reading' and 'Network' messages.
    Nothing is asserted; the test passes if no call raises.
    """
    # TODO: THERE WILL BE A NAME GENERATOR
    g_conf.param.NAME = 'experiment_1'
    g_conf.merge_with_yaml('configs/eccv/experiment_1.yaml')
    # JUST A TRICK TO CONTAIN THE CURRENT LIMITATIONS
    g_conf.set_type_of_process('train')

    loading_messages = (
        {"Keys_Division": [1, 123, 1, 1, 2, 12, 3, 12, 31, 2, 1, 1]},
        {"Models_loaded": ' VUALA ', "Checkpoint": "988765"},
    )
    for message in loading_messages:
        coil_logger.add_message('Loading', message)

    for step in range(10):
        read_message = {"Iteration": step,
                        "ReadKeys": [1, 123, 5, 1, 34, 1, 23]}
        coil_logger.add_message('Reading', read_message)
        network_message = {"Iteration": step, "Output-": ["output"]}
        coil_logger.add_message('Network', network_message)
def test_check_status_running_iter(self):
    """
    After 'Loading' messages and ten rounds of 'Reading' / 'Model'
    messages, the monitor is expected to report the status "Iterating".
    """
    g_conf = GlobalConfig()
    g_conf.param.NAME = 'experiment_running_iter'
    # TODO: this merge is weird.
    g_conf.merge_with_yaml(
        'configs/monitor_test/experiment_running_iter.yaml')
    # JUST A TRICK TO CONTAIN THE CURRENT LIMITATIONS
    g_conf.set_type_of_process('train')

    loading_messages = (
        {"Keys_Division": [1, 123, 1, 1, 2, 12, 3, 12, 31, 2, 1, 1]},
        {"Models_loaded": ' VUALA ', "Checkpoint": "988765"},
    )
    for message in loading_messages:
        coil_logger.add_message('Loading', message)

    for step in range(10):
        read_message = {"Iteration": step,
                        "ReadKeys": [1, 123, 5, 1, 34, 1, 23]}
        coil_logger.add_message('Reading', read_message)
        model_message = {"Iteration": step, "Output": ["output"]}
        coil_logger.add_message('Model', model_message)

    # TODO: Check how the alias will work.
    status = monitorer.get_status('monitor_test', 'experiment_running_iter',
                                  g_conf.param.PROCESS_NAME)
    current_state = status[0]
    self.assertEqual(current_state, "Iterating")
def start_carla_simulator(gpu, town_name, docker):
    """
    Launch a CARLA server through ``docker run`` in detached mode.

    A free port is obtained from ``find_free_port`` and the port range
    ``port .. port + 2`` is published from the container to the host. The
    docker call runs with a copy of the current environment.

    Args:
        gpu: indexable gpu specification; its first element is the device
            handed to docker's ``--gpus`` option
        town_name: the town name, appended to '/Game/Maps/'
        docker: the docker image name

    Returns:
        A tuple (sp, port, out): the Popen object of the ``docker run``
        call, the world port, and the stdout captured from that call.
    """
    port = find_free_port()
    my_env = os.environ.copy()

    # TODO: add quality level here
    port_mapping = f'{port}-{port+2}:{port}-{port+2}'
    # A stray '-e' used to sit right before '--gpus' (left over from the
    # NVIDIA_VISIBLE_DEVICES variant). docker consumed '--gpus' as the value
    # of '-e', which shifted every following argument, so the device string
    # was taken as the image name. It is removed here.
    docker_command = [
        'docker', 'run', '--rm', '-d',
        '-p', port_mapping,
        '--runtime=nvidia',
        '--gpus', f'"device={gpu[0]}"',
        docker, '/bin/bash', 'CarlaUE4.sh', f'/Game/Maps/{town_name}',
        '-windowed', '-fps=20', f'-world-port={port}'
    ]
    sp = subprocess.Popen(docker_command, shell=False,
                          stdout=subprocess.PIPE, env=my_env)

    # Blocks until ``docker run`` itself returns; stderr is not piped, so
    # ``err`` is always None.
    (out, err) = sp.communicate()
    print("Going to communicate")

    coil_logger.add_message(
        'Loading', {
            'CARLA':
                f'/CarlaUE4/Binaries/Linux/CarlaUE4-windowed-benchmark-fps=20-world-port={port}'
        })

    return sp, port, out
def start_carla_simulator(gpu, exp_batch, exp_alias, city_name):
    """
    Start the CARLA binary found under the CARLA_PATH environment variable.

    Args:
        gpu: not used by this function
        exp_batch: not used by this function
        exp_alias: not used by this function
        city_name: the map to load, appended to '/Game/Maps/'

    Returns:
        A tuple (sp, port): the Popen object of the server and its world
        port.
    """
    port = find_free_port()
    carla_path = os.environ['CARLA_PATH']

    binary = carla_path + '/CarlaUE4/Binaries/Linux/CarlaUE4'
    arguments = ['/Game/Maps/' + city_name, '-windowed', '-benchmark',
                 '-fps=10', '-world-port=' + str(port)]

    # NOTE(review): both streams are pipes that this function never reads;
    # presumably the caller drains them -- verify, a full pipe blocks the
    # child.
    sp = subprocess.Popen([binary] + arguments, shell=False,
                          stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # The logged string is the binary path followed directly by the
    # arguments, with no separators in between.
    coil_logger.add_message('Loading',
                            {'CARLA': binary + ''.join(arguments)})

    return sp, port
def test_check_status_error(self):
    """
    After 'Loading' messages, ten rounds of 'Iterating' messages and one
    'Error' message, the monitor is expected to report the status "Error".
    """
    g_conf.immutable(False)
    # TODO: THe error ? How do nicely merge with the other parts ??
    g_conf.NAME = 'experiment_running_error'
    # TODO: this merge is weird.
    merge_with_yaml('configs/monitor_test/experiment_running_error.yaml')
    # JUST A TRICK TO CONTAIN THE CURRENT LIMITATIONS
    set_type_of_process('train')

    loading_messages = (
        {"Keys_Division": [1, 123, 1, 1, 2, 12, 3, 12, 31, 2, 1, 1]},
        {"Models_loaded": ' VUALA ', "Checkpoint": "988765"},
    )
    for message in loading_messages:
        coil_logger.add_message('Loading', message)

    for step in range(10):
        read_message = {"Iteration": step,
                        "ReadKeys": [1, 123, 5, 1, 34, 1, 23]}
        coil_logger.add_message('Iterating', read_message)
        output_message = {"Iteration": step, "Output": ["output"]}
        coil_logger.add_message('Iterating', output_message)

    error_message = {"Iteration": 10,
                     "Message": " Some data integrity problems ! "}
    coil_logger.add_message('Error', error_message)

    # TODO: Check how the alias will work.
    status = monitorer.get_status('monitor_test',
                                  'experiment_running_error.yaml',
                                  g_conf.PROCESS_NAME)
    self.assertEqual(status[0], "Error")
    print(status[1])
def forward(self, x):
    """
    Log a fixed 'Model' message, run ``x`` through ``self.layers`` and
    flatten the result to two dimensions.

    The second dimension is whatever ``self.num_flat_features`` returns for
    the output of the layers; the first one is inferred.
    """
    # TODO: TRACK NANS OUTPUTS
    # TODO: Maybe change the name
    # TODO: Control the frequency of postion log
    perception_log = {'Perception': {"Output": [1.0, 12.3, 124.29]},
                      "Iteration": 765}
    coil_logger.add_message('Model', perception_log)

    # conv1 + batch normalization + dropout + relu
    features = self.layers(x)
    return features.view(-1, self.num_flat_features(features))
def test_check_status_running_loading(self):
    """
    With only 'Loading' messages logged, the monitor is expected to report
    the status "Loading".
    """
    g_conf.immutable(False)
    g_conf.NAME = 'experiment_running_loading'
    # TODO: this merge is weird.
    merge_with_yaml('configs/monitor_test/experiment_running_loading.yaml')
    # JUST A TRICK TO CONTAIN THE CURRENT LIMITATIONS
    set_type_of_process('train')

    loading_messages = (
        {"Keys_Division": [1, 123, 1, 1, 2, 12, 3, 12, 31, 2, 1, 1]},
        {"Models_loaded": ' VUALA ', "Checkpoint": "988765"},
    )
    for message in loading_messages:
        coil_logger.add_message('Loading', message)

    # TODO: Check how the alias will work.
    status = monitorer.get_status('monitor_test',
                                  'experiment_running_loading.yaml',
                                  g_conf.PROCESS_NAME)
    current_state = status[0]
    self.assertEqual(current_state, "Loading")
def test_check_status_finished(self):
    """
    With NUMBER_ITERATIONS set to 20 and 'Iterating' messages logged for
    iterations 0 through 20, the monitor is expected to report the status
    "Finished".
    """
    g_conf.immutable(False)
    g_conf.NAME = 'experiment_finished'
    # TODO: this merge is weird.
    merge_with_yaml('configs/monitor_test/experiment_finished.yaml')
    g_conf.NUMBER_ITERATIONS = 20
    # JUST A TRICK TO CONTAIN THE CURRENT LIMITATIONS
    set_type_of_process('train')

    loading_messages = (
        {"Keys_Division": [1, 123, 1, 1, 2, 12, 3, 12, 31, 2, 1, 1]},
        {"Models_loaded": ' VUALA ', "Checkpoint": "988765"},
    )
    for message in loading_messages:
        coil_logger.add_message('Loading', message)

    # One step past NUMBER_ITERATIONS, each message tagged with its
    # iteration.
    for step in range(21):
        read_message = {"Iteration": step,
                        "ReadKeys": [1, 123, 5, 1, 34, 1, 23]}
        coil_logger.add_message('Iterating', read_message, step)
        output_message = {"Iteration": step, "Output": ["output"]}
        coil_logger.add_message('Iterating', output_message, step)

    # TODO: Check how the alias will work.
    status = monitorer.get_status('monitor_test', 'experiment_finished.yaml',
                                  g_conf.PROCESS_NAME)
    current_state = status[0]
    self.assertEqual(current_state, "Finished")
def execute(gpu, exp_batch, exp_alias, state_dict, suppress_output=True, number_of_workers=12):
    """
    The main training function. This function loads the latest checkpoint
    for a given exp_batch (folder) and exp_alias (experiment configuration).
    With this checkpoint it starts from the beginning or continues some
    training. Every failure is reported through an 'Error' log message
    instead of being raised.

    Args:
        gpu: The GPU number, as the string written to CUDA_VISIBLE_DEVICES
        exp_batch: the folder with the experiments
        exp_alias: the alias, experiment name
        state_dict: path given to torch.load for the weights of the
            segmentation model; with an empty string no segmentation model
            is built and the rgb images are fed to the model directly
        suppress_output: if the output are going to be saved on a file
        number_of_workers: the number of threads used for data loading

    Returns:
        None
    """
    try:
        # We set the visible cuda devices to select the GPU
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu
        g_conf.VARIABLE_WEIGHT = {}
        # At this point the log file with the correct naming is created.
        # You merge the yaml file with the global configuration structure.
        merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
        set_type_of_process('train')
        # Set the process into loading status.
        coil_logger.add_message('Loading', {'GPU': gpu})

        # Put the output to a separate file if it is the case
        # NOTE(review): the two files opened here replace sys.stdout and
        # sys.stderr and are never closed or restored by this function.
        if suppress_output:
            if not os.path.exists('_output_logs'):
                os.mkdir('_output_logs')
            sys.stdout = open(os.path.join(
                '_output_logs', exp_alias + '_' + g_conf.PROCESS_NAME + '_' +
                str(os.getpid()) + ".out"),
                "a",
                buffering=1)
            sys.stderr = open(os.path.join(
                '_output_logs', exp_alias + '_err_' + g_conf.PROCESS_NAME + '_' +
                str(os.getpid()) + ".out"),
                "a",
                buffering=1)

        # Nothing to do when the logger says the training already finished.
        if coil_logger.check_finish('train'):
            coil_logger.add_message('Finished', {})
            return

        # Preload option
        if g_conf.PRELOAD_MODEL_ALIAS is not None:
            checkpoint = torch.load(
                os.path.join('_logs', g_conf.PRELOAD_MODEL_BATCH,
                             g_conf.PRELOAD_MODEL_ALIAS, 'checkpoints',
                             str(g_conf.PRELOAD_MODEL_CHECKPOINT) + '.pth'))

        # Get the latest checkpoint to be loaded
        # returns none if there are no checkpoints saved for this model
        # A checkpoint of this experiment, when present, replaces the
        # preloaded one assigned above.
        checkpoint_file = get_latest_saved_checkpoint()
        if checkpoint_file is not None:
            # NOTE(review): get_latest_saved_checkpoint() is called a second
            # time here instead of reusing checkpoint_file.
            checkpoint = torch.load(
                os.path.join('_logs', exp_batch, exp_alias, 'checkpoints',
                             str(get_latest_saved_checkpoint())))
            iteration = checkpoint['iteration']
            best_loss = checkpoint['best_loss']
            best_loss_iter = checkpoint['best_loss_iter']
        else:
            iteration = 0
            best_loss = 10000.0
            best_loss_iter = 0

        # Define the dataset. This structure is has the __get_item__ redefined in a way
        # that you can access the positions from the root directory as a in a vector.
        full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"],
                                    g_conf.TRAIN_DATASET_NAME)

        # By instantiating the augmenter we get a callable that augment images and transform them
        # into tensors.
        augmenter = Augmenter(g_conf.AUGMENTATION)

        # Instantiate the class used to read a dataset. The coil dataset generator
        # can be found
        dataset = CoILDataset(full_dataset,
                              transform=augmenter,
                              preload_name=str(g_conf.NUMBER_OF_HOURS) +
                              'hours_' + g_conf.TRAIN_DATASET_NAME)
        print("Loaded dataset")

        data_loader = select_balancing_strategy(dataset, iteration,
                                                number_of_workers)
        model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION)
        model.cuda()

        # Optional segmentation model; its weights come from state_dict.
        if state_dict != '':
            seg_model = ERFNet_Fast(2)
            seg_model = load_my_state_dict(seg_model, torch.load(state_dict))
            seg_model.cuda()

        optimizer = optim.Adam(model.parameters(), lr=g_conf.LEARNING_RATE)

        # Restore model and optimizer whenever some checkpoint was loaded,
        # be it a preloaded one or one of this experiment.
        if checkpoint_file is not None or g_conf.PRELOAD_MODEL_ALIAS is not None:
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            accumulated_time = checkpoint['total_time']
            loss_window = coil_logger.recover_loss_window('train', iteration)
        else:
            # We accumulate iteration time and keep the average speed
            accumulated_time = 0
            loss_window = []

        print("Before the loss")

        criterion = Loss(g_conf.LOSS_FUNCTION)

        color_transforms = Colorizes(2)
        board = Dashboard(8097)

        # Loss time series window
        for data in data_loader:

            # Basically in this mode of execution, we validate every X Steps, if it goes up 3 times,
            # add a stop on the _logs folder that is going to be read by this process
            if g_conf.FINISH_ON_VALIDATION_STALE is not None and \
                    check_loss_validation_stopped(iteration, g_conf.FINISH_ON_VALIDATION_STALE):
                break
            """ #################################### Main optimization loop #################################### """

            iteration += 1
            if iteration % 1000 == 0:
                adjust_learning_rate_auto(optimizer, loss_window)

            # get the control commands from float_data, size = [120,1]
            capture_time = time.time()
            controls = data['directions']
            # The output(branches) is a list of 5 branches results, each branch is with size [120,3]
            model.zero_grad()

            # With a segmentation model its output replaces the rgb images
            # as the network input.
            # NOTE(review): the extent of the no_grad block could not be
            # recovered from the source layout -- confirm.
            if state_dict != '':
                with torch.no_grad():
                    repre = seg_model(torch.squeeze(data['rgb'].cuda()),
                                      only_encode=False)
                inputs = repre
                imgs = color_transforms(inputs)
                inputs = inputs.float().cuda()
            else:
                inputs = torch.squeeze(data['rgb'].cuda())

            # vis
            # NOTE(review): imgs is only assigned in the state_dict branch
            # above; if these calls sit at loop level, the second one raises
            # NameError when state_dict is '' -- confirm the intended nesting.
            board.image(
                torch.squeeze(data['rgb'])[0].cpu().data,
                '(train) input iter: ' + str(iteration))
            board.image(imgs[0].cpu().data,
                        '(train) output iter: ' + str(iteration))

            branches = model(inputs, dataset.extract_inputs(data).cuda())
            loss_function_params = {
                'branches': branches,
                'targets': dataset.extract_targets(data).cuda(),
                'controls': controls.cuda(),
                'inputs': dataset.extract_inputs(data).cuda(),
                'branch_weights': g_conf.BRANCH_LOSS_WEIGHT,
                'variable_weights': g_conf.VARIABLE_WEIGHT
            }
            loss, _ = criterion(loss_function_params)
            loss.backward()
            optimizer.step()
            """ #################################### Saving the model if necessary #################################### """

            if is_ready_to_save(iteration):
                state = {
                    'iteration': iteration,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'total_time': accumulated_time,
                    'optimizer': optimizer.state_dict(),
                    'best_loss_iter': best_loss_iter
                }
                torch.save(
                    state,
                    os.path.join('_logs', exp_batch, exp_alias, 'checkpoints',
                                 str(iteration) + '.pth'))
            """ ################################################ Adding tensorboard logs. Making calculations for logging purposes. These logs are monitored by the printer module. 
################################################# """
            coil_logger.add_scalar('Loss', loss.data, iteration)
            coil_logger.add_image('Image', torch.squeeze(data['rgb']), iteration)
            if loss.data < best_loss:
                best_loss = loss.data.tolist()
                best_loss_iter = iteration

            # Log a random position
            # NOTE(review): the index is drawn from len(data), the length of
            # the batch container, not from the batch size -- confirm.
            position = random.randint(0, len(data) - 1)

            output = model.extract_branch(torch.stack(branches[0:4]), controls)
            error = torch.abs(output - dataset.extract_targets(data).cuda())

            accumulated_time += time.time() - capture_time

            coil_logger.add_message(
                'Iterating', {
                    'Iteration': iteration,
                    'Loss': loss.data.tolist(),
                    'Images/s': (iteration * g_conf.BATCH_SIZE) / accumulated_time,
                    'BestLoss': best_loss,
                    'BestLossIteration': best_loss_iter,
                    'Output': output[position].data.tolist(),
                    'GroundTruth': dataset.extract_targets(data)[position].data.tolist(),
                    'Error': error[position].data.tolist(),
                    'Inputs': dataset.extract_inputs(data)[position].data.tolist()
                }, iteration)
            loss_window.append(loss.data.tolist())
            coil_logger.write_on_error_csv('train', loss.data)
            print("Iteration: %d Loss: %f" % (iteration, loss.data))

        coil_logger.add_message('Finished', {})

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})

    except RuntimeError as e:
        coil_logger.add_message('Error', {'Message': str(e)})

    # NOTE(review): bare except -- this also catches SystemExit and
    # GeneratorExit; the traceback is printed and a generic error is logged.
    except:
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something Happened'})
def execute(gpu,
            exp_batch,
            exp_alias,
            suppress_output=True,
            number_of_workers=12,
            encoder_params=None):
    """
    The main training function. It loads either a preload checkpoint or the
    latest checkpoint saved for the given exp_batch (folder) and exp_alias
    (experiment configuration) and then trains the 'separate-affordances'
    model on top of the encoder model, saving checkpoints on the way.

    Any error is reported through coil_logger instead of being propagated.

    Args:
        gpu: The GPU number (string written to CUDA_VISIBLE_DEVICES)
        exp_batch: the folder with the experiments
        exp_alias: the alias, experiment name
        suppress_output: if the output are going to be saved on a file
        number_of_workers: the number of threads used for data loading
        encoder_params: optional dict forwarded to merge_with_yaml; when it is
            not None its 'encoder_folder', 'encoder_exp' and
            'encoder_checkpoint' entries locate the encoder checkpoint to load.

    Returns:
        None
    """
    try:
        # We set the visible cuda devices to select the GPU
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu
        g_conf.VARIABLE_WEIGHT = {}
        # At this point the log file with the correct naming is created.
        # You merge the yaml file with the global configuration structure.
        merge_with_yaml(
            os.path.join('configs', exp_batch, exp_alias + '.yaml'),
            encoder_params)
        set_type_of_process('train')
        # Set the process into loading status.
        coil_logger.add_message('Loading',
                                {'GPU': os.environ["CUDA_VISIBLE_DEVICES"]})

        seed_everything(seed=g_conf.MAGICAL_SEED)

        # Put the output to a separate file if it is the case
        if suppress_output:
            if not os.path.exists('_output_logs'):
                os.mkdir('_output_logs')
            # These files intentionally stay open for the whole process life.
            sys.stdout = open(os.path.join(
                '_output_logs', exp_alias + '_' + g_conf.PROCESS_NAME + '_' +
                str(os.getpid()) + ".out"),
                              "a",
                              buffering=1)
            sys.stderr = open(os.path.join(
                '_output_logs', exp_alias + '_err_' + g_conf.PROCESS_NAME +
                '_' + str(os.getpid()) + ".out"),
                              "a",
                              buffering=1)

        if coil_logger.check_finish('train'):
            coil_logger.add_message('Finished', {})
            return

        # Defaults of a fresh run. They are defined up front so that the
        # preload path below does not reach the rest of the function with
        # `checkpoint_file`, `iteration`, `best_loss` or `best_loss_iter`
        # undefined (that used to end in a NameError swallowed by the final
        # bare except).
        checkpoint_file = None
        iteration = 0
        best_loss = 100000000.0
        best_loss_iter = 0

        # Preload option
        print(" GOING TO LOAD")
        if g_conf.PRELOAD_MODEL_ALIAS is not None:
            print(" LOADING A PRELOAD")
            checkpoint = torch.load(
                os.path.join('_logs', g_conf.PRELOAD_MODEL_BATCH,
                             g_conf.PRELOAD_MODEL_ALIAS, 'checkpoints',
                             str(g_conf.PRELOAD_MODEL_CHECKPOINT) + '.pth'))
        else:
            # Get the latest checkpoint to be loaded
            # returns none if there are no checkpoints saved for this model
            checkpoint_file = get_latest_saved_checkpoint()
            if checkpoint_file is not None:
                print('loading previous checkpoint ', checkpoint_file)
                # Reuse the name we already resolved instead of querying the
                # checkpoint folder a second time.
                checkpoint = torch.load(
                    os.path.join('_logs', g_conf.EXPERIMENT_BATCH_NAME,
                                 g_conf.EXPERIMENT_NAME, 'checkpoints',
                                 str(checkpoint_file)))
                iteration = checkpoint['iteration']
                best_loss = checkpoint['best_loss']
                best_loss_iter = checkpoint['best_loss_iter']

        # Define the dataset. This structure is has the __get_item__ redefined in a way
        # that you can access the positions from the root directory as a in a vector.

        # By instantiating the augmenter we get a callable that augment images and transform them
        # into tensors.
        augmenter = Augmenter(g_conf.AUGMENTATION)

        # We can save preload dataset depends on the json file name, then no need to load
        # dataset for each time with the same dataset
        if len(g_conf.EXPERIENCE_FILE) == 1:
            json_file_name = str(
                g_conf.EXPERIENCE_FILE[0]).split('/')[-1].split('.')[-2]
        else:
            json_file_name = str(g_conf.EXPERIENCE_FILE[0]).split(
                '/')[-1].split('.')[-2] + '_' + str(
                    g_conf.EXPERIENCE_FILE[1]).split('/')[-1].split('.')[-2]
        dataset = CoILDataset(transform=augmenter,
                              preload_name=g_conf.PROCESS_NAME + '_' +
                              json_file_name + '_' + g_conf.DATA_USED)
        print("Loaded Training dataset")

        data_loader = select_balancing_strategy(dataset, iteration,
                                                number_of_workers)
        if g_conf.MODEL_TYPE in ['separate-affordances']:
            model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION,
                              g_conf.ENCODER_MODEL_CONFIGURATION)

        model.cuda()
        optimizer = optim.Adam(model.parameters(), lr=g_conf.LEARNING_RATE)
        print(model)

        # we use the pre-trained encoder model to extract bottleneck Z and train the E-t-E model
        if g_conf.MODEL_TYPE in ['separate-affordances']:
            encoder_model = EncoderModel(g_conf.ENCODER_MODEL_TYPE,
                                         g_conf.ENCODER_MODEL_CONFIGURATION)
            encoder_model.cuda()
            encoder_model.eval()

            # To freeze the pre-trained encoder model
            if g_conf.FREEZE_ENCODER:
                for param_ in encoder_model.parameters():
                    param_.requires_grad = False

            if encoder_params is not None:
                encoder_checkpoint = torch.load(
                    os.path.join(
                        '_logs', encoder_params['encoder_folder'],
                        encoder_params['encoder_exp'], 'checkpoints',
                        str(encoder_params['encoder_checkpoint']) + '.pth'))
                print(
                    "Encoder model ",
                    str(encoder_params['encoder_checkpoint']), "loaded from ",
                    os.path.join('_logs', encoder_params['encoder_folder'],
                                 encoder_params['encoder_exp'], 'checkpoints'))
                encoder_model.load_state_dict(encoder_checkpoint['state_dict'])
                if g_conf.FREEZE_ENCODER:
                    encoder_model.eval()
                    # To freeze the pre-trained encoder model
                    for param_ in encoder_model.parameters():
                        param_.requires_grad = False
                else:
                    # The encoder is trained jointly, so the optimizer has to
                    # own its parameters as well.
                    optimizer = optim.Adam(list(model.parameters()) +
                                           list(encoder_model.parameters()),
                                           lr=g_conf.LEARNING_RATE)

            for name_encoder, param_encoder in encoder_model.named_parameters(
            ):
                if param_encoder.requires_grad:
                    print(' Unfrozen layers', name_encoder)
                else:
                    print(' Frozen layers', name_encoder)

        if checkpoint_file is not None or g_conf.PRELOAD_MODEL_ALIAS is not None:
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            accumulated_time = checkpoint['total_time']
            loss_window = coil_logger.recover_loss_window('train', iteration)
        else:
            # We accumulate iteration time and keep the average speed
            accumulated_time = 0
            loss_window = []

        for name, param in model.named_parameters():
            if param.requires_grad:
                print(' Unfrozen layers', name)
            else:
                print(' Frozen layers', name)

        print("Before the loss")

        # Loss time series window
        for data in data_loader:

            # Basically in this mode of execution, we validate every X Steps, if it goes up 3 times,
            # add a stop on the _logs folder that is going to be read by this process
            if g_conf.FINISH_ON_VALIDATION_STALE is not None and \
                    check_loss_validation_stopped(iteration, g_conf.FINISH_ON_VALIDATION_STALE):
                break

            ####################################
            # Main optimization loop
            ####################################

            if iteration % 1000 == 0:
                adjust_learning_rate_auto(optimizer, loss_window)

            model.zero_grad()
            if not g_conf.FREEZE_ENCODER:
                encoder_model.zero_grad()

            if g_conf.LABELS_SUPERVISED:
                # An extra all-zeros channel is appended to the rgb input.
                inputs_data = torch.cat(
                    (data['rgb'], torch.zeros(g_conf.BATCH_SIZE, 1, 88, 200)),
                    dim=1).cuda()
            else:
                inputs_data = torch.squeeze(data['rgb'].cuda())

            if g_conf.MODEL_TYPE in ['separate-affordances']:
                #TODO: for this two encoder models training, we haven't put speed as input to train yet
                if g_conf.ENCODER_MODEL_TYPE in [
                        'action_prediction', 'stdim', 'forward',
                        'one-step-affordances'
                ]:
                    e, inter = encoder_model.forward_encoder(
                        inputs_data,
                        dataset.extract_inputs(data).cuda(),
                        # We also add measurements and commands
                        torch.squeeze(dataset.extract_commands(data).cuda()))

                elif g_conf.ENCODER_MODEL_TYPE in ['ETE']:
                    e, inter = encoder_model.forward_encoder(
                        inputs_data,
                        dataset.extract_inputs(data).cuda(),
                        torch.squeeze(dataset.extract_commands(data).cuda()))

                loss_function_params = {
                    'classification_gt':
                    dataset.extract_affordances_targets(
                        data,
                        'classification').cuda(),  # harzard stop, red_light....
                    'class_weights':
                    g_conf.AFFORDANCES_CLASS_WEIGHT,
                    'regression_gt':
                    dataset.extract_affordances_targets(data,
                                                        'regression').cuda(),
                    'variable_weights':
                    g_conf.AFFORDANCES_VARIABLE_WEIGHT
                }

                loss = model(e, loss_function_params)
                loss.backward()
                optimizer.step()

            else:
                raise RuntimeError(
                    'Not implement yet, this branch is only work for g_conf.MODEL_TYPE in [separate-affordances]'
                )

            ####################################
            # Saving the model if necessary
            ####################################

            if is_ready_to_save(iteration):
                state = {
                    'iteration': iteration,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'total_time': accumulated_time,
                    'optimizer': optimizer.state_dict(),
                    'best_loss_iter': best_loss_iter
                }
                torch.save(
                    state,
                    os.path.join('_logs', g_conf.EXPERIMENT_BATCH_NAME,
                                 g_conf.EXPERIMENT_NAME, 'checkpoints',
                                 str(iteration) + '.pth'))

                # A jointly trained encoder is stored next to the main model.
                if not g_conf.FREEZE_ENCODER:
                    encoder_state = {
                        'iteration': iteration,
                        'state_dict': encoder_model.state_dict(),
                        'best_loss': best_loss,
                        'total_time': accumulated_time,
                        'optimizer': optimizer.state_dict(),
                        'best_loss_iter': best_loss_iter
                    }
                    torch.save(
                        encoder_state,
                        os.path.join('_logs', g_conf.EXPERIMENT_BATCH_NAME,
                                     g_conf.EXPERIMENT_NAME, 'checkpoints',
                                     str(iteration) + '_encoder.pth'))

            iteration += 1

            ################################################
            # Adding tensorboard logs.
            # Making calculations for logging purposes.
            # These logs are monitored by the printer module.
            #################################################
            coil_logger.add_scalar('Loss', loss.data, iteration)
            coil_logger.add_image('Image', torch.squeeze(data['rgb']),
                                  iteration)
            if loss.data < best_loss:
                best_loss = loss.data.tolist()
                best_loss_iter = iteration

            if iteration % 100 == 0:
                print('Train Iteration: {} [{}/{} ({:.0f}%)] \t Loss: {:.6f}'.
                      format(iteration, iteration, g_conf.NUMBER_ITERATIONS,
                             100. * iteration / g_conf.NUMBER_ITERATIONS,
                             loss.data))

        coil_logger.add_message('Finished', {})

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})

    except RuntimeError as e:
        coil_logger.add_message('Error', {'Message': str(e)})

    except:
        # Deliberate catch-all: the status must reach the logger whatever
        # went wrong; the traceback is printed for debugging.
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something Happened'})
def execute(gpu, exp_batch, exp_alias, dataset_name):
    """
    Validate every checkpoint found in a fixed checkpoints folder.

    For each '*.pth' file matched by the glob below, the checkpoint weights are
    loaded into the model, the whole dataset is run through
    `model.forward_branch`, the outputs are written with
    `coil_logger.write_on_csv`, and the mean squared error (Loss) and mean
    absolute error (Error) averaged over the batches are logged. The weights of
    the best checkpoint under each metric are saved as 'best_model_l2.pth' and
    'best_model_l1.pth' inside '_logs/<exp_batch>/<exp_alias>'.

    Args:
        gpu: unused here, the visible device is fixed to '0'
            (NOTE(review): looks like a debugging leftover, confirm).
        exp_batch: the folder with the experiments
        exp_alias: the alias, experiment name
        dataset_name: dataset folder name under $COIL_DATASET_PATH

    Returns:
        None
    """
    # We set the visible cuda devices
    os.environ["CUDA_VISIBLE_DEVICES"] = '0'

    # At this point the log file with the correct naming is created.
    merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
    set_type_of_process('validation', dataset_name)

    if not os.path.exists('_output_logs'):
        os.mkdir('_output_logs')

    sys.stdout = open(os.path.join(
        '_output_logs', g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"),
                      "a",
                      buffering=1)

    if monitorer.get_status(exp_batch, exp_alias + '.yaml',
                            g_conf.PROCESS_NAME)[0] == "Finished":
        # TODO: print some cool summary or not ?
        return

    #Define the dataset. This structure is has the __get_item__ redefined in a way
    #that you can access the HDFILES positions from the root directory as a in a vector.
    full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], dataset_name)
    print(full_dataset)
    dataset = CoILDataset(full_dataset,
                          transform=transforms.Compose([transforms.ToTensor()
                                                        ]))

    # Creates the sampler, this part is responsible for managing the keys. It divides
    # all keys depending on the measurements and produces a set of keys for each bach.

    # The data loader is the multi threaded module from pytorch that release a number of
    # workers to get all the data.
    # TODO: batch size an number of workers go to some configuration file
    batch_size = 120
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              num_workers=12,
                                              pin_memory=True)

    # TODO: here there is clearly a posibility to make a cool "conditioning" system.
    model = CoILModel(g_conf.MODEL_NAME)
    model.cuda()
    model.eval()

    criterion = Loss()

    latest = get_latest_evaluated_checkpoint()
    if latest is None:  # When nothing was tested, get latest returns none, we fix that.
        latest = 0

    # NOTE(review): hard-coded override; `latest` is reassigned for every
    # checkpoint in the loop below, so this value is never read.
    latest = 200000

    best_loss = 1000.0
    best_error = 1000.0
    best_loss_iter = 0
    best_error_iter = 0

    # The first meta data column arrives as bytes; it is decoded in place so
    # it can be compared against plain strings below.
    print(dataset.meta_data[0][0])
    for k in dataset.meta_data:
        k[0] = str(k[0], 'utf-8')
    print(dataset.meta_data[0][0])

    cpts = glob.glob(
        '/home-local/rohitrishabh/coil_20-06/_logs/eccv/experiment_1/checkpoints/*.pth'
    )
    # while not maximun_checkpoint_reach(latest, g_conf.TEST_SCHEDULE):
    for ckpt in cpts:

        # if is_next_checkpoint_ready(g_conf.TEST_SCHEDULE):

        # latest = get_next_checkpoint(g_conf.TEST_SCHEDULE)
        # The checkpoint number is the file name without its extension. This
        # works for any number of digits, unlike a fixed-width slice.
        latest = int(os.path.splitext(os.path.basename(ckpt))[0])
        # checkpoint = torch.load(os.path.join('_logs', exp_batch, exp_alias
        #                         , 'checkpoints', str(latest) + '.pth'))
        checkpoint = torch.load(ckpt)
        checkpoint_iteration = checkpoint['iteration']
        print("Validation loaded ", checkpoint_iteration)

        # Actually evaluate THIS checkpoint: without loading its weights every
        # checkpoint would be scored with the same freshly built model.
        model.load_state_dict(checkpoint['state_dict'])
        model.eval()

        accumulated_loss = 0.0
        accumulated_error = 0.0
        iteration_on_checkpoint = 0
        for data in data_loader:

            input_data, float_data = data
            control_position = np.where(
                dataset.meta_data[:, 0] == 'control')[0][0]
            speed_position = np.where(
                dataset.meta_data[:, 0] == 'speed_module')[0][0]

            # Obs : Maybe we could also check for other branches ??
            output = model.forward_branch(
                torch.squeeze(input_data['rgb']).cuda(),
                float_data[:, speed_position, :].cuda(),
                float_data[:, control_position, :].cuda())

            for i in range(input_data['rgb'].shape[0]):
                coil_logger.write_on_csv(
                    checkpoint_iteration,
                    [output[i][0], output[i][1], output[i][2]])

            # TODO: Change this a functional standard using the loss functions.
            loss = torch.mean(
                (output -
                 dataset.extract_targets(float_data).cuda())**2).data.tolist()
            mean_error = torch.mean(
                torch.abs(
                    output -
                    dataset.extract_targets(float_data).cuda())).data.tolist()
            accumulated_error += mean_error
            accumulated_loss += loss
            error = torch.abs(output -
                              dataset.extract_targets(float_data).cuda())

            # Log a random position
            position = random.randint(0, len(float_data) - 1)
            coil_logger.add_message(
                'Iterating in Validation', {
                    'Checkpoint':
                    latest,
                    'Iteration': (str(iteration_on_checkpoint * batch_size) +
                                  '/' + str(len(dataset))),
                    'MeanError':
                    mean_error,
                    'Loss':
                    loss,
                    'Output':
                    output[position].data.tolist(),
                    'GroundTruth':
                    dataset.extract_targets(float_data)
                    [position].data.tolist(),
                    'Error':
                    error[position].data.tolist(),
                    'Inputs':
                    dataset.extract_inputs(float_data)[position].data.tolist()
                }, latest)
            iteration_on_checkpoint += 1

        # The accumulators hold one mean per batch, so the average is taken
        # over the number of batches, not over the number of samples.
        checkpoint_average_loss = accumulated_loss / len(data_loader)
        checkpoint_average_error = accumulated_error / len(data_loader)
        coil_logger.add_scalar('Loss', checkpoint_average_loss, latest)
        coil_logger.add_scalar('Error', checkpoint_average_error, latest)
        print('Loss: ', checkpoint_average_loss, "----Error: ",
              checkpoint_average_error)

        if checkpoint_average_loss < best_loss:
            best_loss = checkpoint_average_loss
            best_loss_iter = latest
            state = {
                'state_dict': model.state_dict(),
                'best_loss': best_loss,
                'best_loss_iter': best_loss_iter
            }
            # TODO : maybe already summarize the best model ???
            torch.save(
                state,
                os.path.join('_logs', exp_batch, exp_alias,
                             'best_model_l2' + '.pth'))

        if checkpoint_average_error < best_error:
            best_error = checkpoint_average_error
            best_error_iter = latest
            state = {
                'state_dict': model.state_dict(),
                'best_error': best_error,
                'best_error_iter': best_error_iter
            }
            # TODO : maybe already summarize the best model ???
            torch.save(
                state,
                os.path.join('_logs', exp_batch, exp_alias,
                             'best_model_l1' + '.pth'))

        print('Best Loss: ', best_loss, "Checkpoint", best_loss_iter)
        print('Best Error: ', best_error, "Checkpoint", best_error_iter)

        coil_logger.add_message(
            'Iterating in Validation', {
                'Summary': {
                    'Error': checkpoint_average_error,
                    'Loss': checkpoint_average_loss,
                    'BestError': best_error,
                    'BestLoss': best_loss,
                    'BestLossCheckpoint': best_loss_iter,
                    'BestErrorCheckpoint': best_error_iter
                },
                'Checkpoint': latest
            })
def execute(gpu, exp_batch, exp_alias, dataset_name, suppress_output):
    """
    Validate the checkpoints of an experiment as they become available.

    The rgb batches are first segmented by a pre-trained ERFNet into a
    two-channel road / not-road mask, and that mask is what is fed to
    `model.forward_branch`. For every checkpoint of g_conf.TEST_SCHEDULE the
    per-batch MSE and mean absolute error are averaged and logged; when
    g_conf.FINISH_ON_VALIDATION_STALE is set, a stop is written once the error
    window stops decreasing.

    Any error is reported through coil_logger, and the csv output of the
    checkpoint that was being evaluated is erased.

    Args:
        gpu: The GPU number (string written to CUDA_VISIBLE_DEVICES)
        exp_batch: the folder with the experiments
        exp_alias: the alias, experiment name
        dataset_name: dataset folder name under $COIL_DATASET_PATH
        suppress_output: if the output are going to be saved on a file

    Returns:
        None
    """
    latest = None
    try:
        # We set the visible cuda devices
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu

        # At this point the log file with the correct naming is created.
        merge_with_yaml(os.path.join('configs', exp_batch,
                                     exp_alias + '.yaml'))
        # The validation dataset is always fully loaded, so we fix a very high number of hours
        g_conf.NUMBER_OF_HOURS = 10000
        set_type_of_process('validation', dataset_name)

        if not os.path.exists('_output_logs'):
            os.mkdir('_output_logs')

        if suppress_output:
            sys.stdout = open(os.path.join(
                '_output_logs', exp_alias + '_' + g_conf.PROCESS_NAME + '_' +
                str(os.getpid()) + ".out"),
                              "a",
                              buffering=1)
            sys.stderr = open(os.path.join(
                '_output_logs', exp_alias + '_err_' + g_conf.PROCESS_NAME +
                '_' + str(os.getpid()) + ".out"),
                              "a",
                              buffering=1)

        # Define the dataset. This structure is has the __get_item__ redefined in a way
        # that you can access the HDFILES positions from the root directory as a in a vector.
        full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"],
                                    dataset_name)
        augmenter = Augmenter(None)
        # Definition of the dataset to be used. Preload name is just the validation data name
        dataset = CoILDataset(full_dataset,
                              transform=augmenter,
                              preload_name=dataset_name)

        # Creates the sampler, this part is responsible for managing the keys. It divides
        # all keys depending on the measurements and produces a set of keys for each bach.

        # The data loader is the multi threaded module from pytorch that release a number of
        # workers to get all the data.
        data_loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=g_conf.BATCH_SIZE,
            shuffle=False,
            num_workers=g_conf.NUMBER_OF_LOADING_WORKERS,
            pin_memory=True)

        model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION)

        # Set ERFnet for segmentation
        model_erf = ERFNet(20)
        model_erf = torch.nn.DataParallel(model_erf)
        model_erf = model_erf.cuda()

        print("LOAD ERFNet - validate")

        def load_my_state_dict(
            model, state_dict
        ):  #custom function to load model when not all dict elements
            # Entries of state_dict that the model does not own are skipped.
            own_state = model.state_dict()
            for name, param in state_dict.items():
                if name not in own_state:
                    continue
                own_state[name].copy_(param)
            return model

        model_erf = load_my_state_dict(
            model_erf,
            torch.load(os.path.join('trained_models/erfnet_pretrained.pth')))
        model_erf.eval()
        print("ERFNet and weights LOADED successfully")

        # The window used to keep track of the trainings
        l1_window = []
        latest = get_latest_evaluated_checkpoint()
        if latest is not None:  # When latest is noe
            l1_window = coil_logger.recover_loss_window(dataset_name, None)

        model.cuda()

        best_mse = 1000
        best_error = 1000
        best_mse_iter = 0
        best_error_iter = 0

        while not maximun_checkpoint_reach(latest, g_conf.TEST_SCHEDULE):

            if is_next_checkpoint_ready(g_conf.TEST_SCHEDULE):

                latest = get_next_checkpoint(g_conf.TEST_SCHEDULE)

                checkpoint = torch.load(
                    os.path.join('_logs', exp_batch, exp_alias, 'checkpoints',
                                 str(latest) + '.pth'))
                checkpoint_iteration = checkpoint['iteration']
                print("Validation loaded ", checkpoint_iteration)

                model.load_state_dict(checkpoint['state_dict'])

                model.eval()
                accumulated_mse = 0
                accumulated_error = 0
                iteration_on_checkpoint = 0
                for data in data_loader:

                    # Compute the forward pass on a batch from the validation dataset
                    controls = data['directions']

                    # Seg batch
                    rgbs = data['rgb']
                    with torch.no_grad():
                        outputs = model_erf(rgbs)
                        # Per-pixel class index with the highest score.
                        labels = outputs.max(1)[1].byte().cpu().data

                    # Class 0 is used as road, everything else as not road.
                    seg_road = (labels == 0)
                    seg_not_road = (labels != 0)
                    seg = torch.stack((seg_road, seg_not_road), 1).float()

                    output = model.forward_branch(
                        torch.squeeze(seg).cuda(),
                        dataset.extract_inputs(data).cuda(), controls)

                    # output = model.foward_branch(torch.squeeze(rgbs).cuda(),
                    #                              dataset.extract_inputs(data).cuda(),controls)
                    # It could be either waypoints or direct control
                    if 'waypoint1_angle' in g_conf.TARGETS:
                        write_waypoints_output(checkpoint_iteration, output)
                    else:
                        write_regular_output(checkpoint_iteration, output)

                    # The targets are extracted and moved to the GPU once per
                    # batch and shared by every metric below.
                    targets = dataset.extract_targets(data).cuda()
                    error = torch.abs(output - targets)
                    mse = torch.mean((output - targets)**2).data.tolist()
                    mean_error = torch.mean(error).data.tolist()

                    accumulated_error += mean_error
                    accumulated_mse += mse

                    # Log a random position
                    position = random.randint(0,
                                              len(output.data.tolist()) - 1)

                    coil_logger.add_message(
                        'Iterating',
                        {
                            'Checkpoint':
                            latest,
                            # Progress uses the batch size the loader was
                            # actually built with, not a fixed 120.
                            'Iteration':
                            (str(iteration_on_checkpoint * g_conf.BATCH_SIZE)
                             + '/' + str(len(dataset))),
                            'MeanError':
                            mean_error,
                            'MSE':
                            mse,
                            'Output':
                            output[position].data.tolist(),
                            'GroundTruth':
                            targets[position].data.tolist(),
                            'Error':
                            error[position].data.tolist(),
                            'Inputs':
                            dataset.extract_inputs(data)
                            [position].data.tolist()
                        },
                        latest)
                    iteration_on_checkpoint += 1
                    print("Iteration %d on Checkpoint %d : Error %f" %
                          (iteration_on_checkpoint, checkpoint_iteration,
                           mean_error))

                ########
                # Finish a round of validation, write results, wait for the next
                ########

                checkpoint_average_mse = accumulated_mse / (len(data_loader))
                checkpoint_average_error = accumulated_error / (
                    len(data_loader))
                coil_logger.add_scalar('Loss', checkpoint_average_mse, latest,
                                       True)
                coil_logger.add_scalar('Error', checkpoint_average_error,
                                       latest, True)

                if checkpoint_average_mse < best_mse:
                    best_mse = checkpoint_average_mse
                    best_mse_iter = latest

                if checkpoint_average_error < best_error:
                    best_error = checkpoint_average_error
                    best_error_iter = latest

                coil_logger.add_message(
                    'Iterating', {
                        'Summary': {
                            'Error': checkpoint_average_error,
                            'Loss': checkpoint_average_mse,
                            'BestError': best_error,
                            'BestMSE': best_mse,
                            'BestMSECheckpoint': best_mse_iter,
                            'BestErrorCheckpoint': best_error_iter
                        },
                        'Checkpoint': latest
                    }, latest)

                l1_window.append(checkpoint_average_error)
                coil_logger.write_on_error_csv(dataset_name,
                                               checkpoint_average_error)

                # If we are using the finish when validation stops, we check the current
                if g_conf.FINISH_ON_VALIDATION_STALE is not None:
                    if dlib.count_steps_without_decrease(l1_window) > 3 and \
                            dlib.count_steps_without_decrease_robust(l1_window) > 3:
                        coil_logger.write_stop(dataset_name, latest)
                        break

            else:
                # Nothing new to evaluate yet: poll again in a second.
                latest = get_latest_evaluated_checkpoint()
                time.sleep(1)

                coil_logger.add_message('Loading',
                                        {'Message': 'Waiting Checkpoint'})
                print("Waiting for the next Validation")

        coil_logger.add_message('Finished', {})

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})
        # We erase the output that was unfinished due to some process stop.
        if latest is not None:
            coil_logger.erase_csv(latest)

    except RuntimeError as e:
        if latest is not None:
            coil_logger.erase_csv(latest)
        coil_logger.add_message('Error', {'Message': str(e)})

    except:
        # Deliberate catch-all so the failure always reaches the logger.
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something Happened'})
        # We erase the output that was unfinished due to some process stop.
        if latest is not None:
            coil_logger.erase_csv(latest)
def execute(gpu, exp_batch, exp_alias):
    """
    Adversarial training of the task network (netF), the decoder (netG) and
    two discriminators: netD_bin on the embeddings and netD_img on the images.

    Every iteration updates both discriminators; every `n_critic` iterations
    the decoder and the task network are updated as well. Checkpoints holding
    the netD_bin and netF weights are written under
    '/datatmp/Experiments/rohitgan/_logs/<exp_batch>/<exp_alias>'.

    NOTE(review): only the 'LSDcontrol_task_2d' branch below defines
    `netD_bin`, which the rest of the function requires, and `optimF` /
    `Task_Loss` exist only when g_conf.TYPE == 'task'. Other configurations
    presumably fail with a NameError - confirm.

    Args:
        gpu: The GPU number (string written to CUDA_VISIBLE_DEVICES)
        exp_batch: the folder with the experiments
        exp_alias: the alias, experiment name

    Returns:
        None
    """
    from time import gmtime, strftime

    # NOTE(review): the seed is read before merge_with_yaml runs, so it is the
    # value g_conf holds before this experiment's yaml is merged - confirm
    # that this is intended.
    manualSeed = g_conf.SEED
    torch.cuda.manual_seed(manualSeed)
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
    set_type_of_process('train')

    coil_logger.add_message('Loading', {'GPU': gpu})
    if not os.path.exists('_output_logs'):
        os.mkdir('_output_logs')
    # From here on everything printed goes to a per-process log file.
    sys.stdout = open(os.path.join(
        '_output_logs', g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"),
                      "a",
                      buffering=1)
    if monitorer.get_status(exp_batch, exp_alias + '.yaml',
                            g_conf.PROCESS_NAME)[0] == "Finished":
        return

    full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"],
                                g_conf.TRAIN_DATASET_NAME)
    real_dataset = g_conf.TARGET_DOMAIN_PATH
    # Main data loader: each batch unpacks into (input_data, float_data,
    # tgt_imgs) in the training loop below.
    dataset = CoILDataset(full_dataset,
                          real_dataset,
                          transform=transforms.Compose([transforms.ToTensor()
                                                        ]))

    sampler = BatchSequenceSampler(
        splitter.control_steer_split(dataset.measurements, dataset.meta_data),
        g_conf.BATCH_SIZE, g_conf.NUMBER_IMAGES_SEQUENCE,
        g_conf.SEQUENCE_STRIDE)

    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_sampler=sampler,
                                              shuffle=False,
                                              num_workers=6,
                                              pin_memory=True)

    # Wrappers applying an augmentation only sometimes; the second argument
    # of iag.Sometimes is presumably the probability - TODO confirm.
    st = lambda aug: iag.Sometimes(aug, 0.4)
    oc = lambda aug: iag.Sometimes(aug, 0.3)
    rl = lambda aug: iag.Sometimes(aug, 0.09)
    augmenter = iag.Augmenter([iag.ToGPU()] + [
        rl(iag.GaussianBlur(
            (0, 1.5))),  # blur images with a sigma between 0 and 1.5
        rl(iag.AdditiveGaussianNoise(loc=0, scale=(
            0.0, 0.05), per_channel=0.5)),  # add gaussian noise to images
        oc(iag.Dropout((0.0, 0.10), per_channel=0.5)
           ),  # randomly remove up to X% of the pixels
        oc(
            iag.CoarseDropout(
                (0.0, 0.10), size_percent=(0.08, 0.2),
                per_channel=0.5)),  # randomly remove up to X% of the pixels
        oc(iag.Add((-40, 40), per_channel=0.5)
           ),  # change brightness of images (by -X to Y of original value)
        st(iag.Multiply((0.10, 2), per_channel=0.2)
           ),  # change brightness of images (X-Y% of original value)
        rl(iag.ContrastNormalization(
            (0.5, 1.5), per_channel=0.5)),  # improve or worsen the contrast
        rl(iag.Grayscale((0.0, 1))),  # put grayscale
    ]  # do all of the above in random order
    )

    # NOTE: l1weight and image_size are not used again in this function.
    l1weight = g_conf.L1_WEIGHT
    task_adv_weight = g_conf.TASK_ADV_WEIGHT
    image_size = tuple([88, 200])

    # Dump the relevant configuration to the log.
    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    print("GPU", gpu)
    print("Configurations of ", exp_alias)
    print("GANMODEL_NAME", g_conf.GANMODEL_NAME)
    print("LOSS_FUNCTION", g_conf.LOSS_FUNCTION)
    print("LR_G, LR_D, LR", g_conf.LR_G, g_conf.LR_D, g_conf.LEARNING_RATE)
    print("SKIP", g_conf.SKIP)
    print("TYPE", g_conf.TYPE)
    print("L1 WEIGHT", g_conf.L1_WEIGHT)
    print("TASK ADV WEIGHT", g_conf.TASK_ADV_WEIGHT)
    print("LAB SMOOTH", g_conf.LABSMOOTH)

    # Build the networks for the selected GAN variant.
    if g_conf.GANMODEL_NAME == 'LSDcontrol':
        netD = ganmodels._netD(loss=g_conf.LOSS_FUNCTION).cuda()
        netG = ganmodels._netG(loss=g_conf.LOSS_FUNCTION,
                               skip=g_conf.SKIP).cuda()
    elif g_conf.GANMODEL_NAME == 'LSDcontrol_nopatch':
        netD = ganmodels_nopatch._netD(loss=g_conf.LOSS_FUNCTION).cuda()
        netG = ganmodels_nopatch._netG(loss=g_conf.LOSS_FUNCTION).cuda()
    elif g_conf.GANMODEL_NAME == 'LSDcontrol_nopatch_smaller':
        netD = ganmodels_nopatch_smaller._netD(
            loss=g_conf.LOSS_FUNCTION).cuda()
        netG = ganmodels_nopatch_smaller._netG(
            loss=g_conf.LOSS_FUNCTION).cuda()

    elif g_conf.GANMODEL_NAME == 'LSDcontrol_task':
        netD_task = ganmodels_task._netD_task(loss=g_conf.LOSS_FUNCTION).cuda()
        netD_img = ganmodels_task._netD_img(loss=g_conf.LOSS_FUNCTION).cuda()

        netG = ganmodels_task._netG(loss=g_conf.LOSS_FUNCTION).cuda()
        netF = ganmodels_task._netF(loss=g_conf.LOSS_FUNCTION).cuda()

        if g_conf.PRETRAINED == 'RECON':
            netF_statedict = torch.load('netF_GAN_Pretrained.wts')
            netF.load_state_dict(netF_statedict)

        elif g_conf.PRETRAINED == 'IL':
            print("Loading IL")
            model_IL = torch.load('best_loss_20-06_EpicClearWeather.pth')
            model_IL_state_dict = model_IL['state_dict']

            netF_state_dict = netF.state_dict()

            print(len(netF_state_dict.keys()),
                  len(model_IL_state_dict.keys()))
            # Weights are transferred by POSITION: the i-th entry of the IL
            # state dict is copied into the i-th entry of netF, whatever the
            # key names are.
            for i, keys in enumerate(
                    zip(netF_state_dict.keys(), model_IL_state_dict.keys())):
                newkey, oldkey = keys
                # if newkey.split('.')[0] == "branch" and oldkey.split('.')[0] == "branches":
                #     print("No Transfer of ",  newkey, " to ", oldkey)
                # else:
                print("Transferring ", newkey, " to ", oldkey)
                netF_state_dict[newkey] = model_IL_state_dict[oldkey]

            netF.load_state_dict(netF_state_dict)
            print("IL Model Loaded!")

    elif g_conf.GANMODEL_NAME == 'LSDcontrol_task_2d':
        netD_bin = ganmodels_task._netD_task().cuda()
        netD_img = ganmodels_task._netD_img().cuda()

        netG = ganmodels_task._netG().cuda()
        netF = ganmodels_task._netF().cuda()

        if g_conf.PRETRAINED == 'IL':
            print("Loading IL")
            model_IL = torch.load(g_conf.IL_AGENT_PATH)
            model_IL_state_dict = model_IL['state_dict']

            netF_state_dict = netF.state_dict()

            print(len(netF_state_dict.keys()),
                  len(model_IL_state_dict.keys()))
            # Same positional transfer as in the 'LSDcontrol_task' branch.
            for i, keys in enumerate(
                    zip(netF_state_dict.keys(), model_IL_state_dict.keys())):
                newkey, oldkey = keys
                print("Transferring ", newkey, " to ", oldkey)
                netF_state_dict[newkey] = model_IL_state_dict[oldkey]

            netF.load_state_dict(netF_state_dict)
            print("IL Model Loaded!")

    ####
    # Load the pre-trained decoder into netG. Both branches read the same
    # path; only the printed message differs.
    if g_conf.IF_AUG:
        print("Loading Aug Decoder")
        model_dec = torch.load(g_conf.DECODER_RECON_PATH)
    else:
        print("Loading Decoder")
        model_dec = torch.load(g_conf.DECODER_RECON_PATH)
    model_dec_state_dict = model_dec['stateG_dict']

    netG_state_dict = netG.state_dict()

    print(len(netG_state_dict.keys()), len(model_dec_state_dict.keys()))
    # Positional transfer again: i-th decoder entry -> i-th netG entry.
    for i, keys in enumerate(
            zip(netG_state_dict.keys(), model_dec_state_dict.keys())):
        newkey, oldkey = keys
        print("Transferring ", newkey, " to ", oldkey)
        netG_state_dict[newkey] = model_dec_state_dict[oldkey]

    netG.load_state_dict(netG_state_dict)
    print("Decoder Model Loaded!")

    init_weights(netD_bin)
    init_weights(netD_img)
    # init_weights(netG)

    print(netD_bin)
    print(netF)

    optimD_bin = torch.optim.Adam(netD_bin.parameters(),
                                  lr=g_conf.LR_D,
                                  betas=(0.5, 0.999))
    optimD_img = torch.optim.Adam(netD_img.parameters(),
                                  lr=g_conf.LR_D,
                                  betas=(0.5, 0.999))
    # NOTE(review): the decoder optimizer uses LR_D although an LR_G setting
    # exists (it is printed above) - confirm which one is intended.
    optimG = torch.optim.Adam(netG.parameters(),
                              lr=g_conf.LR_D,
                              betas=(0.5, 0.999))
    if g_conf.TYPE == 'task':
        optimF = torch.optim.Adam(netF.parameters(), lr=g_conf.LEARNING_RATE)
        Task_Loss = TaskLoss()

    if g_conf.GANMODEL_NAME == 'LSDcontrol_task_2d':
        print("Using cross entropy!")
        # NOTE: `Loss` is created here but not used again in this function.
        Loss = torch.nn.CrossEntropyLoss().cuda()

    L1_loss = torch.nn.L1Loss().cuda()

    iteration = 0
    best_loss_iter_F = 0
    best_loss_iter_G = 0
    best_lossF = 1000000.0
    best_lossD = 1000000.0
    best_lossG = 1000000.0
    accumulated_time = 0
    gen_iterations = 0
    n_critic = g_conf.N_CRITIC

    # Placeholder losses for the iterations where the decoder / task updates
    # are skipped.
    lossF = Variable(torch.Tensor([100.0]))
    lossG_adv = Variable(torch.Tensor([100.0]))
    lossG_smooth = Variable(torch.Tensor([100.0]))
    lossG = Variable(torch.Tensor([100.0]))

    netD_bin.train()
    netD_img.train()
    netG.train()
    netF.train()
    capture_time = time.time()

    if not os.path.exists('./imgs_' + exp_alias):
        os.mkdir('./imgs_' + exp_alias)

    #TODO check how C network is optimized in LSDSEG
    #TODO put family for losses
    #IMPORTANT WHILE RUNNING THIS, CONV.PY MUST HAVE BATCHNORMS

    # NOTE: both image pools are created but not used again in this function.
    fake_img_pool_src = ImagePool(50)
    fake_img_pool_tgt = ImagePool(50)

    for data in data_loader:

        # Everything requires grad during the shared forward pass; the
        # individual update steps below switch the networks off selectively.
        set_requires_grad(netD_bin, True)
        set_requires_grad(netD_img, True)
        set_requires_grad(netG, True)
        set_requires_grad(netF, True)

        # print("ITERATION:", iteration)

        # Offset subtracted from the images. It is 0.0 here, so the
        # subtractions below leave the values unchanged.
        val = 0.0
        input_data, float_data, tgt_imgs = data

        if g_conf.IF_AUG:
            inputs = augmenter(0, input_data['rgb'])
            tgt_imgs = augmenter(0, tgt_imgs)
        else:
            inputs = input_data['rgb'].cuda()
            tgt_imgs = tgt_imgs.cuda()

        inputs = inputs.squeeze(1)
        inputs = inputs - val  #subtracted by 0.5
        tgt_imgs = tgt_imgs - val  #subtracted by 0.5

        controls = float_data[:, dataset.controls_position(), :]

        # Shared forward pass: embeddings from netF, reconstructions from netG.
        src_embed_inputs, src_branches = netF(
            inputs,
            dataset.extract_inputs(float_data).cuda())
        tgt_embed_inputs = netF(tgt_imgs, None)

        src_img_fake = netG(src_embed_inputs)
        tgt_img_fake = netG(tgt_embed_inputs)

        # Every 100 iterations the first source/target image and their
        # reconstructions are logged and saved to disk.
        if iteration % 100 == 0:
            imgs_to_save = torch.cat(
                (inputs[:1] + val, src_img_fake[:1] + val, tgt_imgs[:1] + val,
                 tgt_img_fake[:1] + val), 0).cpu().data
            coil_logger.add_image("Images", imgs_to_save, iteration)
            imgs_to_save = imgs_to_save.clamp(0.0, 1.0)
            vutils.save_image(imgs_to_save,
                              './imgs_' + exp_alias + '/' + str(iteration) +
                              '_real_and_fake.png',
                              normalize=False)

        ##--------------------Discriminator part!!!!!!!!!!-------------------##
        set_requires_grad(netD_bin, True)
        set_requires_grad(netD_img, False)
        set_requires_grad(netG, False)
        set_requires_grad(netF, False)
        optimD_bin.zero_grad()

        # Embedding discriminator: difference of the mean scores on target
        # and source embeddings, plus a gradient penalty.
        outputsD_real_src_bin = netD_bin(src_embed_inputs)
        outputsD_real_tgt_bin = netD_bin(tgt_embed_inputs)

        gradient_penalty = calc_gradient_penalty(netD_bin, src_embed_inputs,
                                                 tgt_embed_inputs)
        lossD_bin = torch.mean(outputsD_real_tgt_bin -
                               outputsD_real_src_bin) + gradient_penalty
        # retain_graph: the same forward graph is reused by later backwards.
        lossD_bin.backward(retain_graph=True)
        optimD_bin.step()

        coil_logger.add_scalar('Total LossD Bin', lossD_bin.data, iteration)

        #### Discriminator img update ####
        set_requires_grad(netD_bin, False)
        set_requires_grad(netD_img, True)
        set_requires_grad(netG, False)
        set_requires_grad(netF, False)

        optimD_img.zero_grad()
        # The fakes are detached so this step does not backpropagate into
        # netG / netF.
        outputsD_fake_src_img = netD_img(src_img_fake.detach())
        outputsD_fake_tgt_img = netD_img(tgt_img_fake.detach())

        outputsD_real_src_img = netD_img(inputs)
        outputsD_real_tgt_img = netD_img(tgt_imgs)

        gradient_penalty_src = calc_gradient_penalty(netD_img, inputs,
                                                     src_img_fake)
        lossD_img_src = torch.mean(
            outputsD_fake_src_img -
            outputsD_real_src_img) + gradient_penalty_src

        gradient_penalty_tgt = calc_gradient_penalty(netD_img, tgt_imgs,
                                                     tgt_img_fake)
        lossD_img_tgt = torch.mean(
            outputsD_fake_tgt_img -
            outputsD_real_tgt_img) + gradient_penalty_tgt

        lossD_img = (lossD_img_src + lossD_img_tgt) * 0.5
        lossD_img.backward(retain_graph=True)
        optimD_img.step()

        coil_logger.add_scalar('Total LossD img', lossD_img.data, iteration)

        # Decoder and task network are updated once every n_critic iterations.
        if ((iteration + 1) % n_critic) == 0:

            #####Generator updates#######
            set_requires_grad(netD_bin, False)
            set_requires_grad(netD_img, False)
            set_requires_grad(netG, True)
            set_requires_grad(netF, False)

            # NOTE(review): unlike the other update steps there is no
            # optimG.zero_grad() before this backward, so netG gradients are
            # presumably accumulated across generator updates - confirm
            # whether that is intended.
            outputsD_fake_src_img = netD_img(src_img_fake)
            outputsD_real_tgt_img = netD_img(tgt_imgs)
            outputsD_fake_tgt_img = netD_img(tgt_img_fake)

            lossG_src_smooth = L1_loss(src_img_fake, inputs)
            lossG_tgt_smooth = L1_loss(tgt_img_fake, tgt_imgs)
            lossG_smooth = (lossG_src_smooth + lossG_tgt_smooth) * 0.5

            lossG_adv = 0.5 * (-1.0 * outputsD_fake_src_img.mean() -
                               1.0 * outputsD_fake_tgt_img.mean())
            # The adversarial term has weight 0.0, so only the L1
            # reconstruction term contributes to the decoder loss.
            lossG = (lossG_smooth + 0.0 * lossG_adv)
            lossG.backward(retain_graph=True)
            optimG.step()

            coil_logger.add_scalar('Total LossG', lossG.data, iteration)

            #####Task network updates##########################
            set_requires_grad(netD_bin, False)
            set_requires_grad(netD_img, False)
            set_requires_grad(netG, False)
            set_requires_grad(netF, True)
            optimF.zero_grad()

            # Fresh forward pass for the task update.
            src_embed_inputs, src_branches = netF(
                inputs,
                dataset.extract_inputs(float_data).cuda())
            tgt_embed_inputs = netF(tgt_imgs, None)
            src_img_fake = netG(src_embed_inputs)
            tgt_img_fake = netG(tgt_embed_inputs)

            outputsD_fake_src_img = netD_img(src_img_fake)
            outputsD_real_tgt_img = netD_img(tgt_imgs)

            lossF_task = Task_Loss.MSELoss(
                src_branches,
                dataset.extract_targets(float_data).cuda(), controls.cuda(),
                dataset.extract_inputs(float_data).cuda())

            lossF_adv_bin = netD_bin(src_embed_inputs).mean() - netD_bin(
                tgt_embed_inputs).mean()
            lossF_adv_img = outputsD_fake_src_img.mean(
            ) - outputsD_real_tgt_img.mean()
            lossF_adv = 0.5 * (lossF_adv_bin + 0.1 * lossF_adv_img)
            lossF = (lossF_task + task_adv_weight * lossF_adv)

            coil_logger.add_scalar('Total Task Loss', lossF.data, iteration)
            coil_logger.add_scalar('Adv Task Loss', lossF_adv.data, iteration)
            coil_logger.add_scalar('Only Task Loss', lossF_task.data,
                                   iteration)
            lossF.backward(retain_graph=True)
            optimF.step()

            if lossF.data < best_lossF:
                best_lossF = lossF.data.tolist()
                best_loss_iter_F = iteration

        #optimization for one iter done!

        # NOTE: `position` is not used again in this function.
        position = random.randint(0, len(float_data) - 1)

        accumulated_time += time.time() - capture_time
        capture_time = time.time()

        # Periodic checkpoint. Only the netD_bin and netF weights are stored;
        # netG, netD_img and the optimizer states are not part of `state`.
        if is_ready_to_save(iteration):
            state = {
                'iteration': iteration,
                'stateD_bin_dict': netD_bin.state_dict(),
                'stateF_dict': netF.state_dict(),
                'best_lossD': best_lossD,
                'total_time': accumulated_time,
                'best_loss_iter_F': best_loss_iter_F
            }
            torch.save(
                state,
                os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch,
                             exp_alias, 'checkpoints',
                             str(iteration) + '.pth'))

        # The best task model is saved only after 10000 iterations, on the
        # iteration that just set a new best task loss.
        if iteration == best_loss_iter_F and iteration > 10000:
            state = {
                'iteration': iteration,
                'stateD_bin_dict': netD_bin.state_dict(),
                'stateF_dict': netF.state_dict(),
                'best_lossD': best_lossD,
                'best_lossF': best_lossF,
                'total_time': accumulated_time,
                'best_loss_iter_F': best_loss_iter_F
            }
            torch.save(
                state,
                os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch,
                             exp_alias, 'best_modelF' + '.pth'))

        iteration += 1
def run_step(self, input_data):
    """
    Compute the vehicle control command for the current frame.

    The route direction and the normalized forward speed are fed, together
    with the sensor data, to the model selected by g_conf.MODEL_TYPE. The
    prediction is then converted into a CARLA control command.

    Args:
        input_data: per-frame sensor dictionary. This method reads
            input_data['can_bus'][1]['speed'], input_data['sensor_input']
            and, for the fallback model types, input_data['rgb'][1].

    Returns:
        A carla.VehicleControl with steer, throttle and brake filled in.

    Raises:
        ValueError: when MODEL_TYPE is 'coil-icra-VAE' and
            g_conf.ENCODER_MODEL_TYPE is neither 'VAE' nor 'Affordances'.
    """
    # Debug switch: set to True to dump the network inputs (and the VAE
    # reconstructions) as PNG files under temp/<folder>/.
    dump_debug_images = False

    def debug_png(folder, prefix):
        # Build temp/<folder>/<prefix><frame counter>.png, creating the folder.
        save_path = os.path.join('temp', folder)
        if not os.path.exists(save_path):
            os.mkdir(save_path)
        return os.path.join(save_path,
                            prefix + str(self.count).zfill(5) + ".png")

    # Get the current directions for following the route
    directions = self._get_current_direction(self._vehicle_pos)
    logging.debug(" Current direction %f ", directions)

    # Take the forward speed and normalize it for it to go from 0-1
    network_input = input_data['can_bus'][1]['speed'] / g_conf.SPEED_FACTOR
    network_input = torch.cuda.FloatTensor([network_input]).unsqueeze(0)

    # Compute the forward pass processing the sensors got from CARLA.
    # TODO: this if-chain could become a class hierarchy.
    # NOTE(review): the original layout was lost, so it is unclear whether
    # `self.count += 1` lived inside the disabled debug blocks. It is kept
    # as a live per-frame increment here - confirm against the class.
    if g_conf.MODEL_TYPE in ['coil-icra', 'coil-icra-KLD',
                             'separate-supervised']:
        directions_tensor = torch.cuda.LongTensor([directions])
        if dump_debug_images:
            save_image(input_data['sensor_input'],
                       debug_png('ete_baseline', 'run_input_'))
        self.count += 1
        model_outputs = self._model.forward_branch(
            input_data['sensor_input'], network_input, directions_tensor)

    elif g_conf.MODEL_TYPE in ['coil-icra-VAE']:
        directions_tensor = torch.cuda.LongTensor([directions])
        if g_conf.ENCODER_MODEL_TYPE in ['VAE']:
            if g_conf.LABELS_SUPERVISED:
                # The encoder expects one extra channel; it is filled with
                # zeros at run time.
                encoder_input = torch.cat(
                    (input_data['sensor_input'],
                     torch.zeros(1, 1, 88, 200).cuda()), dim=1)
                recon_x, mu, _, z = self.encoder_model(encoder_input)
            else:
                recon_x, mu, _, z = self.encoder_model(
                    input_data['sensor_input'])
        elif g_conf.ENCODER_MODEL_TYPE in ['Affordances']:
            mu, _ = self.encoder_model(input_data['sensor_input'])
        else:
            # Previously this fell through and failed later with a
            # NameError on `mu`.
            raise ValueError("Unsupported ENCODER_MODEL_TYPE for "
                             "'coil-icra-VAE': "
                             + str(g_conf.ENCODER_MODEL_TYPE))

        if dump_debug_images:
            save_image(input_data['sensor_input'],
                       debug_png('affordances_upperbound', 'run_input_'))
            if g_conf.LABELS_SUPERVISED:
                # Split the reconstruction into its RGB and label channels.
                split = torch.split(torch.squeeze(recon_x, dim=1), [3, 1],
                                    dim=1)
                save_image(split[0],
                           debug_png('affordances_upperbound',
                                     'run_recon_rgb_'))
                save_image(split[1],
                           debug_png('affordances_upperbound',
                                     'run_recon_labels_'))
        self.count += 1
        model_outputs = self._model.forward_branch(mu, network_input,
                                                   directions_tensor)

    elif g_conf.MODEL_TYPE in ['separate-supervised-NoSpeed',
                               'coil-icra-NoSpeed']:
        directions_tensor = torch.cuda.LongTensor([directions])
        if dump_debug_images:
            save_image(input_data['sensor_input'],
                       debug_png('ETE_resnet34_6', 'run_input_'))
        self.count += 1
        # These models take no speed input.
        model_outputs = self._model.forward_branch(
            input_data['sensor_input'], directions_tensor)

    else:
        directions_tensor = torch.cuda.FloatTensor(
            encode_directions(directions))
        model_outputs = self._model.forward(
            self._process_sensors(input_data['rgb'][1]), network_input,
            directions_tensor)[0]

    steer, throttle, brake = self._process_model_outputs(model_outputs[0])

    control = carla.VehicleControl()
    control.steer = float(steer)
    control.throttle = float(throttle)
    control.brake = float(brake)
    # Lazy %-arguments: the message is only built when DEBUG is enabled.
    logging.debug("Output %f %f %f ", control.steer, control.throttle,
                  control.brake)

    if self.first_iter:
        coil_logger.add_message('Iterating', {
            "Checkpoint": self.checkpoint['iteration'],
            'Agent': str(steer)
        }, self.checkpoint['iteration'])
    # There is the possibility to replace some of the predictions with
    # oracle predictions.
    self.first_iter = False

    return control
def execute(gpu, exp_batch, exp_alias):
    """
    Train the generator alone as an image auto-encoder (no discriminator).

    The generator receives the RGB input shifted by -0.5 and is optimized
    with an MSE loss against the unshifted input. Every 200 iterations a
    pair of real / generated samples is written to ./noganimgs/ and logged.
    Checkpoint saving is disabled in this variant.

    Args:
        gpu: value written to the CUDA_VISIBLE_DEVICES environment variable.
        exp_batch: experiment folder name under 'configs'.
        exp_alias: experiment name; '<exp_alias>.yaml' is the config file.

    Returns:
        None. Returns early when the monitorer reports the process as
        "Finished".

    Raises:
        ValueError: if g_conf.GANMODEL_NAME is not 'LSDcontrol'.
    """
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
    set_type_of_process('train')

    coil_logger.add_message('Loading', {'GPU': gpu})
    if not os.path.exists('_output_logs'):
        os.mkdir('_output_logs')
    # From here on everything printed goes to a per-process log file.
    sys.stdout = open(os.path.join(
        '_output_logs',
        g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"),
        "a", buffering=1)

    if monitorer.get_status(exp_batch, exp_alias + '.yaml',
                            g_conf.PROCESS_NAME)[0] == "Finished":
        return

    full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"],
                                g_conf.TRAIN_DATASET_NAME)
    dataset = CoILDataset(
        full_dataset,
        transform=transforms.Compose([transforms.ToTensor()]))
    sampler = BatchSequenceSampler(
        splitter.control_steer_split(dataset.measurements,
                                     dataset.meta_data),
        g_conf.BATCH_SIZE, g_conf.NUMBER_IMAGES_SEQUENCE,
        g_conf.SEQUENCE_STRIDE)
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_sampler=sampler,
                                              shuffle=False,
                                              num_workers=6,
                                              pin_memory=True)

    if g_conf.GANMODEL_NAME != 'LSDcontrol':
        # Previously this fell through to a NameError on `netG`.
        raise ValueError("Unsupported GANMODEL_NAME: "
                         + str(g_conf.GANMODEL_NAME))
    netG = ganmodels._netG(skip=g_conf.SKIP).cuda()
    init_weights(netG)
    print(netG)

    optimG = torch.optim.Adam(netG.parameters(), lr=0.0002,
                              betas=(0.7, 0.999))
    MSE_loss = torch.nn.MSELoss().cuda()

    iteration = 0
    best_loss_iter = 0
    best_lossG = 1000000.0
    accumulated_time = 0

    # NOTE(review): the generator is put in eval mode although it is being
    # optimized below - confirm this is intended.
    netG.eval()

    # The sample images are written here; make sure the folder exists.
    samples_dir = './noganimgs/'
    os.makedirs(samples_dir, exist_ok=True)

    capture_time = time.time()
    for data in data_loader:
        val = 0.5
        input_data, float_data = data
        inputs = input_data['rgb'].cuda()
        inputs = inputs.squeeze(1)
        # The generator sees the input shifted by -val.
        inputs_in = inputs - val

        # forward pass
        fake_inputs = netG(inputs_in)

        if iteration % 200 == 0:
            imgs_to_save = torch.cat(
                (inputs_in[:2] + val, fake_inputs[:2]), 0).cpu().data
            vutils.save_image(
                imgs_to_save,
                samples_dir + str(iteration) + 'noganreal_samples.png',
                normalize=True)
            coil_logger.add_image("Images", imgs_to_save, iteration)

        optimG.zero_grad()

        print("~~~~~~~~~__________")
        print(inputs_in[0][0][0][0])
        print(fake_inputs[0][0][0][0])
        lossG_mse = MSE_loss(fake_inputs, inputs)
        print(lossG_mse)
        lossG_mse /= len(inputs_in)
        print("~~~~~~~~~__________--------------")

        lossG_mse.backward()
        optimG.step()

        # lossG_mse is already divided by the batch size above; the
        # original divided it a second time before logging.
        coil_logger.add_scalar('MSE LossG', lossG_mse.data, iteration)

        # optimization for one iter done!
        if lossG_mse.data < best_lossG:
            best_lossG = lossG_mse.data.tolist()
            best_loss_iter = iteration

        accumulated_time += time.time() - capture_time
        capture_time = time.time()

        # NOTE(review): in the original the increment directly follows a
        # commented-out region and may itself have been commented out,
        # which would freeze `iteration` at 0. It is kept live here.
        iteration += 1
def execute(gpu, exp_batch, exp_alias, validation_dataset, suppress_output):
    """
    Validate the saved checkpoints of an experiment on one dataset.

    Loops until the maximum checkpoint is reached. Whenever the next
    checkpoint of g_conf.TEST_SCHEDULE is ready it is loaded, evaluated on
    the whole validation dataset with the training loss, and the average
    loss is logged and written to the error csv. If
    g_conf.FINISH_ON_VALIDATION_STALE is set, validation stops as soon as
    the loss window shows no decrease for more than 3 steps.

    Args:
        gpu: value written to the CUDA_VISIBLE_DEVICES environment variable.
        exp_batch: experiment folder name under 'configs' and '_logs'.
        exp_alias: experiment name.
        validation_dataset: dataset folder name inside COIL_DATASET_PATH.
        suppress_output: when true, save_output(exp_alias) is called so the
            output is saved to a file.

    Returns:
        None. Errors are reported through coil_logger instead of raised.
    """
    latest = None
    try:
        # We set the visible cuda devices
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu

        # At this point the log file with the correct naming is created.
        merge_with_yaml(
            os.path.join('configs', exp_batch, f'{exp_alias}.yaml'))
        # The validation dataset is always fully loaded, so we fix a very
        # high number of hours
        g_conf.NUMBER_OF_HOURS = 10000
        set_type_of_process(process_type='validation',
                            param=validation_dataset)

        # Save the output to a file if so desired
        if suppress_output:
            save_output(exp_alias)

        # Define the dataset.
        full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"],
                                    validation_dataset)
        augmenter = Augmenter(None)
        # Preload name is just the validation data name
        dataset = CoILDataset(full_dataset,
                              transform=augmenter,
                              preload_name=validation_dataset,
                              process_type='validation')

        # The data loader is the multi threaded module from pytorch that
        # releases a number of workers to get all the data.
        data_loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=g_conf.BATCH_SIZE,
            shuffle=False,
            num_workers=g_conf.NUMBER_OF_LOADING_WORKERS,
            pin_memory=True)

        model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION,
                          g_conf.SENSORS).cuda()

        # The window used to keep track of the validation losses
        l1_window = []
        latest = get_latest_evaluated_checkpoint()
        if latest is not None:
            # Resuming: recover the losses that were already written.
            l1_window = coil_logger.recover_loss_window(
                validation_dataset, None)

        # Keep track of the best loss and the iteration where it happens
        best_loss = 1000
        best_loss_iter = 0

        print(20 * '#')
        print('Starting validation!')
        print(20 * '#')

        # Check if the maximum checkpoint for validating has been reached
        while not maximum_checkpoint_reached(latest):
            # Wait until the next checkpoint is ready (assuming this is run
            # whilst training the model)
            if is_next_checkpoint_ready(g_conf.TEST_SCHEDULE):
                # Get next checkpoint for validation according to the test
                # schedule and load it
                latest = get_next_checkpoint(g_conf.TEST_SCHEDULE)
                checkpoint = torch.load(
                    os.path.join('_logs', exp_batch, exp_alias,
                                 'checkpoints', f'{latest}.pth'))
                checkpoint_iteration = checkpoint['iteration']

                model.load_state_dict(checkpoint['state_dict'])
                model.eval()  # Turn off dropout and batchnorm (if any)
                print(f"Validation loaded, checkpoint {checkpoint_iteration}")

                # Main metric will be the used loss for training the network
                criterion = Loss(g_conf.LOSS_FUNCTION)
                checkpoint_average_loss = 0

                # Counter
                iteration_on_checkpoint = 0

                with torch.no_grad():  # save some computation/memory
                    for data in data_loader:
                        # Compute the forward pass on a batch from the
                        # validation dataset
                        controls = data['directions'].cuda()
                        img = torch.squeeze(data['rgb']).cuda()
                        # this might not always be speed
                        speed = dataset.extract_inputs(data).cuda()
                        # Extracted once and reused for the loss and the log.
                        targets = dataset.extract_targets(data)

                        # For auxiliary metrics
                        output = model.forward_branch(img, speed, controls)

                        # For the loss function
                        branches = model(img, speed)
                        loss_function_params = {
                            'branches': branches,
                            'targets': targets.cuda(),
                            'controls': controls,
                            'inputs': speed,
                            'branch_weights': g_conf.BRANCH_LOSS_WEIGHT,
                            'variable_weights': g_conf.VARIABLE_WEIGHT
                        }
                        # It could be either waypoints or direct control
                        if 'waypoint1_angle' in g_conf.TARGETS:
                            write_waypoints_output(checkpoint_iteration,
                                                   output)
                        else:
                            write_regular_output(checkpoint_iteration,
                                                 output)

                        loss, _ = criterion(loss_function_params)
                        loss = loss.data.tolist()

                        # Log a random position
                        position = random.randint(
                            0, len(output.data.tolist()) - 1)

                        coil_logger.add_message(
                            'Iterating', {
                                'Checkpoint': latest,
                                'Iteration':
                                    f'{iteration_on_checkpoint * g_conf.BATCH_SIZE}/{len(dataset)}',
                                f'Validation Loss ({g_conf.LOSS_FUNCTION})':
                                    loss,
                                'Output': output[position].data.tolist(),
                                'GroundTruth':
                                    targets[position].data.tolist(),
                                'Inputs':
                                    dataset.extract_inputs(data)
                                    [position].data.tolist()
                            }, latest)

                        # We get the average with a growing list of values
                        # Thanks to John D. Cook:
                        # http://www.johndcook.com/blog/standard_deviation/
                        iteration_on_checkpoint += 1
                        checkpoint_average_loss += (
                            loss - checkpoint_average_loss
                        ) / iteration_on_checkpoint

                        print(
                            f"\rProgress: {100 * iteration_on_checkpoint * g_conf.BATCH_SIZE / len(dataset):3.4f}% - "
                            f"Average Loss ({g_conf.LOSS_FUNCTION}): {checkpoint_average_loss:.16f}",
                            end='')

                # Finish a round of validation, write results, wait for the
                # next checkpoint.
                coil_logger.add_scalar(
                    f'Validation Loss ({g_conf.LOSS_FUNCTION})',
                    checkpoint_average_loss, latest, True)

                # Let's visualize the distribution of the loss
                coil_logger.add_histogram(
                    f'Validation Checkpoint Loss ({g_conf.LOSS_FUNCTION})',
                    checkpoint_average_loss, latest)

                if checkpoint_average_loss < best_loss:
                    best_loss = checkpoint_average_loss
                    best_loss_iter = latest

                coil_logger.add_message(
                    'Iterating', {
                        'Summary': {
                            'Loss': checkpoint_average_loss,
                            'BestLoss': best_loss,
                            'BestLossCheckpoint': best_loss_iter
                        },
                        'Checkpoint': latest
                    }, latest)

                l1_window.append(checkpoint_average_loss)
                coil_logger.write_on_error_csv(validation_dataset,
                                               checkpoint_average_loss,
                                               latest)

                # If we are using the finish when validation stops, we check
                # the current checkpoint
                if g_conf.FINISH_ON_VALIDATION_STALE is not None:
                    if dlib.count_steps_without_decrease(l1_window) > 3 and \
                            dlib.count_steps_without_decrease_robust(l1_window) > 3:
                        coil_logger.write_stop(validation_dataset, latest)
                        break

            else:
                latest = get_latest_evaluated_checkpoint()
                time.sleep(1)

                coil_logger.add_message('Loading',
                                        {'Message': 'Waiting Checkpoint'})
                print("Waiting for the next Validation")

        print('\n' + 20 * '#')
        print('Finished validation!')
        print(20 * '#')
        coil_logger.add_message('Finished', {})

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})
        # We erase the output that was unfinished due to some process stop.
        if latest is not None:
            coil_logger.erase_csv(latest)

    except RuntimeError as e:
        if latest is not None:
            coil_logger.erase_csv(latest)
        coil_logger.add_message('Error', {'Message': str(e)})

    except Exception:
        # Catch-all boundary for ordinary errors; SystemExit is no longer
        # swallowed as it was by the previous bare `except:`.
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something Happened'})
        # We erase the output that was unfinished due to some process stop.
        if latest is not None:
            coil_logger.erase_csv(latest)
def pre_load_hdf5_files(self, path_for_files):
    """
    Load all the 'data_*.h5' files found in a folder.

    Sensor datasets are NOT read into memory: for every file the h5py
    dataset object is stored together with the global index range it
    covers, so the files are intentionally left open. Measurements are
    read fully and concatenated.

    TODO: Add partially loading of the data

    Args:
        path_for_files: folder that contains the hdf5 files.

    Returns:
        A tuple (sensors_data_cat, measurements, metadata_targets):
        - sensors_data_cat: one list per name in g_conf.SENSORS, holding
          (first_index, last_index_exclusive, h5py dataset) tuples.
        - measurements: the concatenated data of the first name in
          g_conf.MEASUREMENTS, with its two axes swapped.
        - metadata_targets: numpy array read from 'metadata_<name>' of the
          first file, used to reference variables by name.

    Raises:
        RuntimeError: if the folder contains no 'data_*.h5' file.
    """
    # Take the names of all measurements from the dataset
    meas_names = list(g_conf.MEASUREMENTS.keys())
    # take the names for all sensors
    sensors_names = list(g_conf.SENSORS.keys())

    # From the determined path take all the possible file names.
    # glob.escape keeps special characters in the folder name literal.
    # TODO: Add more flexibility for the file base names ??
    folder_file_names = glob.glob(
        os.path.join(glob.escape(path_for_files), "data_*.h5"))
    if not folder_file_names:
        # Previously this surfaced as an IndexError on folder_file_names[0].
        raise RuntimeError("No data_*.h5 files found in "
                           + str(path_for_files))

    # One accumulator per measurement name / sensor name.
    meas_data_cat = [[] for _ in meas_names]
    sensors_data_cat = [[] for _ in sensors_names]

    # We open one dataset to get the metadata for targets, which is
    # important to be able to reference variables in a more legible way.
    # np.array copies the values, so this handle can be closed right away.
    with h5py.File(folder_file_names[0], "r") as first_file:
        metadata_targets = np.array(
            first_file['metadata_' + meas_names[0]])

    lastidx = 0
    # TODO: More logs to be added ??
    # NOTE: 'NumberOfImages' actually carries the number of files.
    coil_logger.add_message(
        'Loading', {
            'FilesLoaded': folder_file_names,
            'NumberOfImages': len(folder_file_names)
        })

    for file_name in folder_file_names:
        try:
            # Kept open on purpose: the sensor entries below reference it.
            dataset = h5py.File(file_name, "r")

            # Stays 0 when no sensor is configured (was unbound before).
            old_shape = 0
            for i, sensor_name in enumerate(sensors_names):
                x = dataset[sensor_name]
                old_shape = x.shape[0]
                # Record the global index range covered by this file.
                sensors_data_cat[i].append(
                    (lastidx, lastidx + x.shape[0], x))

            for i, meas_name in enumerate(meas_names):
                # [:] reads the whole measurement dataset into memory.
                meas_data_cat[i].append(dataset[meas_name][:])

            lastidx += old_shape

        except IOError:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            traceback.print_exc()
            traceback.print_tb(exc_traceback, limit=1, file=sys.stdout)
            traceback.print_exception(exc_type, exc_value, exc_traceback,
                                      limit=2, file=sys.stdout)
            print("failed to open", file_name)

    # TODO: ADD THE STEERING MULTIPLE CAMERA AUGMENTATION

    # For the number of datasets names that are going to be used for
    # measurements cat all.
    for i in range(len(meas_names)):
        meas_data_cat[i] = np.concatenate(meas_data_cat[i], axis=0)
        meas_data_cat[i] = meas_data_cat[i].transpose((1, 0))

    return sensors_data_cat, meas_data_cat[0], metadata_targets
def execute(gpu, exp_batch, exp_alias, suppress_output=True, number_of_workers=12):
    """
    The main encoder training function.

    Resumes from the latest saved checkpoint when there is one (or starts
    from the preload checkpoint configured in g_conf), then iterates over
    the data loader. Each step computes the loss that matches
    g_conf.ENCODER_MODEL_TYPE, takes one optimizer step, saves a checkpoint
    when is_ready_to_save says so, and logs the loss.

    Args:
        gpu: The GPU id number
        exp_batch: the folder with the experiments
        exp_alias: the alias, experiment name
        suppress_output: if the output are going to be saved on a file
        number_of_workers: the number of threads used for data loading

    Returns:
        None. Errors are reported through coil_logger instead of raised.
    """
    def experience_name(experience_file):
        # File name of the experience json without folder and extension.
        return str(experience_file).split('/')[-1].split('.')[-2]

    try:
        # We set the visible cuda devices to select the GPU
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu
        g_conf.VARIABLE_WEIGHT = {}
        # At this point the log file with the correct naming is created.
        # You merge the yaml file with the global configuration structure.
        merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
        set_type_of_process('train_encoder')
        # Set the process into loading status.
        coil_logger.add_message('Loading', {'GPU': os.environ["CUDA_VISIBLE_DEVICES"]})

        # we set a seed for this exp
        seed_everything(seed=g_conf.MAGICAL_SEED)

        # Put the output to a separate file if it is the case
        if suppress_output:
            if not os.path.exists('_output_logs'):
                os.mkdir('_output_logs')
            sys.stdout = open(os.path.join('_output_logs', exp_alias + '_' +
                              g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"), "a",
                              buffering=1)
            sys.stderr = open(os.path.join('_output_logs',
                              exp_alias + '_err_' + g_conf.PROCESS_NAME + '_'
                              + str(os.getpid()) + ".out"),
                              "a", buffering=1)

        # Preload option
        if g_conf.PRELOAD_MODEL_ALIAS is not None:
            checkpoint = torch.load(os.path.join('_logs', g_conf.PRELOAD_MODEL_BATCH,
                                                 g_conf.PRELOAD_MODEL_ALIAS,
                                                 'checkpoints',
                                                 str(g_conf.PRELOAD_MODEL_CHECKPOINT) + '.pth'))

        # Get the latest checkpoint to be loaded
        # returns none if there are no checkpoints saved for this model
        checkpoint_file = get_latest_saved_checkpoint()
        if checkpoint_file is not None:
            # Reuse the name fetched above instead of querying it again.
            checkpoint = torch.load(os.path.join('_logs', exp_batch, exp_alias,
                                                 'checkpoints', str(checkpoint_file)))
            iteration = checkpoint['iteration']
            best_loss = checkpoint['best_loss']
            best_loss_iter = checkpoint['best_loss_iter']
        else:
            iteration = 0
            best_loss = 1000000000.0
            best_loss_iter = 0

        # By instantiating the augmenter we get a callable that augment
        # images and transform them into tensors.
        augmenter = Augmenter(g_conf.AUGMENTATION)

        # The preload name is built from the first (and, when there is more
        # than one, the second) experience file name.
        json_file_name = experience_name(g_conf.EXPERIENCE_FILE[0])
        if len(g_conf.EXPERIENCE_FILE) != 1:
            json_file_name += '_' + experience_name(g_conf.EXPERIENCE_FILE[1])

        dataset = CoILDataset(transform=augmenter,
                              preload_name=g_conf.PROCESS_NAME + '_' + json_file_name + '_' + g_conf.DATA_USED)
        print ("Loaded dataset")

        data_loader = select_balancing_strategy(dataset, iteration, number_of_workers)

        encoder_model = EncoderModel(g_conf.ENCODER_MODEL_TYPE, g_conf.ENCODER_MODEL_CONFIGURATION)
        encoder_model.cuda()
        encoder_model.train()
        print(encoder_model)

        optimizer = optim.Adam(encoder_model.parameters(), lr=g_conf.LEARNING_RATE)

        if checkpoint_file is not None or g_conf.PRELOAD_MODEL_ALIAS is not None:
            encoder_model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            accumulated_time = checkpoint['total_time']
            loss_window = coil_logger.recover_loss_window('train', iteration)
        else:
            # We accumulate iteration time and keep the average speed
            accumulated_time = 0
            loss_window = []

        print ("Before the loss")

        # Only the ETE encoder uses the generic loss; the other encoder
        # types return their loss directly from the forward call.
        if g_conf.ENCODER_MODEL_TYPE in ['ETE']:
            criterion = Loss(g_conf.LOSS_FUNCTION)

        # Loss time series window
        for data in data_loader:
            if iteration % 1000 == 0:
                adjust_learning_rate_auto(optimizer, loss_window)

            capture_time = time.time()
            encoder_model.zero_grad()

            # ENCODER_MODEL_TYPE can be: one-step-affordances, ETE, stdim,
            # action_prediction, forward
            # - one-step-affordances: input RGB images, compute affordances loss.
            # - ETE: input RGB images and speed, compute action loss
            #   (steering, throttle, brake)
            # - stdim: input two consecutive RGB images, compute the feature loss
            # - action_prediction: input two consecutive RGB images, compute
            #   action classification loss
            # - forward: input two consecutive RGB images, compute action
            #   loss + feature loss
            if g_conf.ENCODER_MODEL_TYPE in ['one-step-affordances']:
                loss_function_params = {
                    # hazard stop, red_light....
                    'classification_gt': dataset.extract_affordances_targets(data, 'classification').cuda(),
                    'class_weights': g_conf.AFFORDANCES_CLASS_WEIGHT,
                    'regression_gt': dataset.extract_affordances_targets(data, 'regression').cuda(),
                    'variable_weights': g_conf.AFFORDANCES_VARIABLE_WEIGHT
                }
                # we input RGB images, speed and command to train affordances
                loss = encoder_model(torch.squeeze(data['rgb'].cuda()),
                                     dataset.extract_inputs(data).cuda(),
                                     torch.squeeze(dataset.extract_commands(data).cuda()),
                                     loss_function_params)

                if iteration == 0:
                    # Snapshot of the model before the very first update.
                    state = {
                        'iteration': iteration,
                        'state_dict': encoder_model.state_dict(),
                        'best_loss': best_loss,
                        'total_time': accumulated_time,
                        'optimizer': optimizer.state_dict(),
                        'best_loss_iter': best_loss_iter
                    }
                    torch.save(state, os.path.join('_logs', exp_batch, exp_alias,
                                                   'checkpoints', 'inital.pth'))

            elif g_conf.ENCODER_MODEL_TYPE in ['forward']:
                # We sample another batch to avoid the superposition
                inputs_data = [data['rgb'][0].cuda(), data['rgb'][1].cuda()]
                # We also add measurements and commands
                loss, _, _ = encoder_model(inputs_data,
                                           dataset.extract_inputs(data),
                                           dataset.extract_commands(data),
                                           dataset.extract_targets(data)[0].cuda())

            elif g_conf.ENCODER_MODEL_TYPE in ['ETE']:
                # Speed input, extracted once for the model and the loss.
                speed = dataset.extract_inputs(data).cuda()
                branches = encoder_model(torch.squeeze(data['rgb'].cuda()),
                                         speed,
                                         torch.squeeze(dataset.extract_commands(data).cuda()))
                loss_function_params = {
                    'branches': branches,
                    'targets': dataset.extract_targets(data).cuda(),  # steer, throttle, brake
                    'inputs': speed,  # speed
                    'branch_weights': g_conf.BRANCH_LOSS_WEIGHT,
                    'variable_weights': g_conf.VARIABLE_WEIGHT
                }
                loss, _ = criterion(loss_function_params)

            elif g_conf.ENCODER_MODEL_TYPE in ['stdim']:
                inputs_data = [data['rgb'][0].cuda(), data['rgb'][1].cuda()]
                # We also add measurements and commands
                loss, _, _ = encoder_model(inputs_data,
                                           dataset.extract_inputs(data),
                                           dataset.extract_commands(data))

            elif g_conf.ENCODER_MODEL_TYPE in ['action_prediction']:
                inputs_data = [data['rgb'][0].cuda(), data['rgb'][1].cuda()]
                # We also add measurements and commands
                loss, _, _ = encoder_model(inputs_data,
                                           dataset.extract_inputs(data),
                                           dataset.extract_commands(data),
                                           dataset.extract_targets(data)[0].cuda())

            else:
                raise ValueError("The encoder model type is not know")

            # Every supported encoder type takes the same optimization step.
            loss.backward()
            optimizer.step()

            # Saving the model if necessary
            if is_ready_to_save(iteration):
                state = {
                    'iteration': iteration,
                    'state_dict': encoder_model.state_dict(),
                    'best_loss': best_loss,
                    'total_time': accumulated_time,
                    'optimizer': optimizer.state_dict(),
                    'best_loss_iter': best_loss_iter
                }
                torch.save(state, os.path.join('_logs', exp_batch, exp_alias,
                                               'checkpoints', str(iteration) + '.pth'))

            iteration += 1

            # Adding tensorboard logs. Making calculations for logging
            # purposes. These logs are monitored by the printer module.
            if g_conf.ENCODER_MODEL_TYPE in ['stdim', 'action_prediction', 'forward']:
                coil_logger.add_scalar('Loss', loss.data, iteration)
                coil_logger.add_image('f_t', torch.squeeze(data['rgb'][0]), iteration)
                coil_logger.add_image('f_ti', torch.squeeze(data['rgb'][1]), iteration)

            elif g_conf.ENCODER_MODEL_TYPE in ['one-step-affordances', 'ETE']:
                coil_logger.add_scalar('Loss', loss.data, iteration)
                coil_logger.add_image('Image', torch.squeeze(data['rgb']), iteration)

            if loss.data < best_loss:
                best_loss = loss.data.tolist()
                best_loss_iter = iteration

            accumulated_time += time.time() - capture_time
            coil_logger.add_message('Iterating',
                                    {'Iteration': iteration,
                                     'Loss': loss.data.tolist(),
                                     'Images/s': (iteration * g_conf.BATCH_SIZE) / accumulated_time,
                                     'BestLoss': best_loss, 'BestLossIteration': best_loss_iter},
                                    iteration)
            loss_window.append(loss.data.tolist())
            coil_logger.write_on_error_csv('train', loss.data)

            if iteration % 100 == 0:
                print('Train Iteration: {} [{}/{} ({:.0f}%)] \t Loss: {:.6f}'.format(
                    iteration, iteration, g_conf.NUMBER_ITERATIONS,
                    100. * iteration / g_conf.NUMBER_ITERATIONS, loss.data))

        coil_logger.add_message('Finished', {})

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})

    except RuntimeError as e:
        coil_logger.add_message('Error', {'Message': str(e)})

    except Exception:
        # Catch-all boundary for ordinary errors; SystemExit is no longer
        # swallowed as it was by the previous bare `except:`.
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something Happened'})
def execute(gpu, exp_batch, exp_alias):
    """
    Train the image GAN (generator netG against discriminator netD).

    Each iteration updates the discriminator with an MSE loss against
    noisy 0/1 label maps, then updates the generator with the adversarial
    MSE loss plus an L1 loss between the generated and the input images.
    Checkpoints are saved when is_ready_to_save says so, and the best
    generator is saved whenever the current iteration holds the best
    generator loss.

    Args:
        gpu: value written to the CUDA_VISIBLE_DEVICES environment variable.
        exp_batch: experiment folder name under 'configs'.
        exp_alias: experiment name; '<exp_alias>.yaml' is the config file.

    Returns:
        None. Returns early when the monitorer reports the process as
        "Finished".

    Raises:
        ValueError: if g_conf.GANMODEL_NAME is not 'LSDcontrol'.
    """
    def noisy_labels(make_labels, size):
        # Label map of the given size (all zeros or all ones) plus uniform
        # noise in [-0.25, 0.25), moved to the GPU.
        labels = make_labels(size)
        noise = torch.rand(labels.size()) * 0.5 - 0.25
        return Variable(labels + noise).cuda()

    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
    set_type_of_process('train')

    coil_logger.add_message('Loading', {'GPU': gpu})
    if not os.path.exists('_output_logs'):
        os.mkdir('_output_logs')
    # From here on everything printed goes to a per-process log file.
    sys.stdout = open(os.path.join('_output_logs',
                      g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"), "a", buffering=1)

    if monitorer.get_status(exp_batch, exp_alias + '.yaml', g_conf.PROCESS_NAME)[0] == "Finished":
        return

    full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.TRAIN_DATASET_NAME)
    # Resize takes the target size as ONE argument; the original
    # `Resize(128, 128)` passed the second 128 as the interpolation mode.
    # NOTE(review): a 128x128 target is assumed - confirm.
    dataset = CoILDataset(full_dataset, transform=transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((128, 128)),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [1.0, 1.0, 1.0])]))

    sampler = BatchSequenceSampler(splitter.control_steer_split(dataset.measurements, dataset.meta_data),
                                   g_conf.BATCH_SIZE, g_conf.NUMBER_IMAGES_SEQUENCE, g_conf.SEQUENCE_STRIDE)

    data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler,
                                              shuffle=False, num_workers=6, pin_memory=True)

    l1weight = 1.0

    if g_conf.GANMODEL_NAME != 'LSDcontrol':
        # Previously this fell through to a NameError on `netD`.
        raise ValueError("Unsupported GANMODEL_NAME: " + str(g_conf.GANMODEL_NAME))
    netD = ganmodels._netD().cuda()
    netG = ganmodels._netG(skip=g_conf.SKIP).cuda()

    init_weights(netD)
    init_weights(netG)
    print(netD)
    print(netG)

    optimD = torch.optim.Adam(netD.parameters(), lr=0.0002, betas=(0.5, 0.999))
    optimG = torch.optim.Adam(netG.parameters(), lr=0.0002, betas=(0.5, 0.999))
    MSE_loss = torch.nn.MSELoss().cuda()
    L1_loss = torch.nn.L1Loss().cuda()

    iteration = 0
    best_loss_iter = 0
    best_lossD = 1000000.0
    best_lossG = 1000000.0
    accumulated_time = 0

    # NOTE(review): both networks are put in eval mode although they are
    # being optimized below - confirm this is intended.
    netG.eval()
    netD.eval()
    capture_time = time.time()
    for data in data_loader:

        input_data, float_data = data
        inputs = input_data['rgb'].cuda()
        inputs = inputs.squeeze(1)
        # The original printed an undefined name `i` (NameError on the
        # first batch); the iteration counter is assumed to be the intent.
        print("Inputs", iteration)

        # forward pass
        fake_inputs = netG(inputs)
        if iteration % 1000 == 0:
            coil_logger.add_image("Images", torch.cat((inputs[:3], fake_inputs[:3]), 0), iteration)

        # -------------------- Discriminator part --------------------
        set_requires_grad(netD, True)
        optimD.zero_grad()

        # fake: detached so that no gradient reaches the generator
        outputsD_fake_forD = netD(fake_inputs.detach())
        # Patchgan-style label maps with label smoothing
        labels_fake = noisy_labels(torch.zeros, outputsD_fake_forD.size())
        lossD_fake = MSE_loss(outputsD_fake_forD, labels_fake)

        # real
        outputsD_real = netD(inputs)
        labels_real = noisy_labels(torch.ones, outputsD_real.size())
        lossD_real = MSE_loss(outputsD_real, labels_real)

        # Discriminator updates
        lossD = (lossD_real + lossD_fake) * 0.5
        lossD /= len(inputs)
        lossD.backward()
        optimD.step()

        coil_logger.add_scalar('Total LossD', lossD.data, iteration)
        coil_logger.add_scalar('Real LossD', lossD_real.data / len(inputs), iteration)
        coil_logger.add_scalar('Fake LossD', lossD_fake.data / len(inputs), iteration)

        # -------------------- Generator part --------------------
        # TODO change decoder architecture
        # TODO check norms of gradients later
        # TODO add auxiliary regression loss for steering
        set_requires_grad(netD, False)
        optimG.zero_grad()
        outputsD_fake_forG = netD(fake_inputs)

        # Generator updates: fool the discriminator and stay close to the
        # input image.
        lossG_adv = MSE_loss(outputsD_fake_forG, labels_real)
        lossG_smooth = L1_loss(fake_inputs, inputs)
        lossG = lossG_adv + l1weight * lossG_smooth
        lossG /= len(inputs)

        lossG.backward()
        optimG.step()

        coil_logger.add_scalar('Total LossG', lossG.data, iteration)
        coil_logger.add_scalar('Adv LossG', lossG_adv.data / len(inputs), iteration)
        coil_logger.add_scalar('Smooth LossG', lossG_smooth.data / len(inputs), iteration)

        # optimization for one iter done!

        # Random sample of the batch to show in the log message.
        position = random.randint(0, len(float_data) - 1)
        if lossD.data < best_lossD:
            best_lossD = lossD.data.tolist()

        if lossG.data < best_lossG:
            best_lossG = lossG.data.tolist()
            best_loss_iter = iteration

        accumulated_time += time.time() - capture_time
        capture_time = time.time()
        print("LossD", lossD.data.tolist(), "LossG", lossG.data.tolist(),
              "BestLossD", best_lossD, "BestLossG", best_lossG,
              "Iteration", iteration, "Best Loss Iteration", best_loss_iter)

        coil_logger.add_message('Iterating',
                                {'Iteration': iteration,
                                 'LossD': lossD.data.tolist(),
                                 'LossG': lossG.data.tolist(),
                                 'Images/s': (iteration * g_conf.BATCH_SIZE) / accumulated_time,
                                 'BestLossD': best_lossD,
                                 'BestLossG': best_lossG,
                                 'BestLossIteration': best_loss_iter,
                                 'GroundTruth': dataset.extract_targets(float_data)[position].data.tolist(),
                                 'Inputs': dataset.extract_inputs(float_data)[position].data.tolist()},
                                iteration)

        if is_ready_to_save(iteration):
            state = {
                'iteration': iteration,
                'stateD_dict': netD.state_dict(),
                'stateG_dict': netG.state_dict(),
                'best_lossD': best_lossD,
                'best_lossG': best_lossG,
                'total_time': accumulated_time,
                'best_loss_iter': best_loss_iter
            }
            torch.save(state, os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch, exp_alias,
                                           'checkpoints', str(iteration) + '.pth'))

        if iteration == best_loss_iter:
            # The current iteration holds the best generator loss so far.
            state = {
                'iteration': iteration,
                'stateD_dict': netD.state_dict(),
                'stateG_dict': netG.state_dict(),
                'best_lossD': best_lossD,
                'best_lossG': best_lossG,
                'total_time': accumulated_time,
                'best_loss_iter': best_loss_iter
            }
            torch.save(state, os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch, exp_alias,
                                           'best_modelG' + '.pth'))

        iteration += 1