def execute(gpu, exp_batch, exp_alias, dataset_name, suppress_output):
    latest = None
    try:
        # We set the visible cuda devices
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu

        # At this point the log file with the correct naming is created.
        merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
        # The validation dataset is always fully loaded, so we fix a very high number of hours.
        g_conf.NUMBER_OF_HOURS = 10000
        set_type_of_process('validation', dataset_name)

        if not os.path.exists('_output_logs'):
            os.mkdir('_output_logs')

        if suppress_output:
            sys.stdout = open(os.path.join(
                '_output_logs', exp_alias + '_' + g_conf.PROCESS_NAME + '_'
                + str(os.getpid()) + ".out"), "a", buffering=1)
            sys.stderr = open(os.path.join(
                '_output_logs', exp_alias + '_err_' + g_conf.PROCESS_NAME + '_'
                + str(os.getpid()) + ".out"), "a", buffering=1)

        # Define the dataset. This structure has __getitem__ redefined so that the
        # HDF5 file positions can be accessed from the root directory as a vector.
        full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], dataset_name)
        augmenter = Augmenter(None)
        # Definition of the dataset to be used. The preload name is just the validation data name.
        dataset = CoILDataset(full_dataset, transform=augmenter,
                              preload_name=dataset_name)

        # The data loader is the multi-threaded module from pytorch that releases
        # a number of workers to fetch all the data.
        data_loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=g_conf.BATCH_SIZE,
            shuffle=False,
            num_workers=g_conf.NUMBER_OF_LOADING_WORKERS,
            pin_memory=True)

        model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION)

        # Set up ERFNet for segmentation.
        model_erf = ERFNet(20)
        model_erf = torch.nn.DataParallel(model_erf)
        model_erf = model_erf.cuda()

        print("LOAD ERFNet - validate")

        def load_my_state_dict(model, state_dict):
            # Custom loader for when not all checkpoint entries match the model.
            own_state = model.state_dict()
            for name, param in state_dict.items():
                if name not in own_state:
                    continue
                own_state[name].copy_(param)
            return model

        model_erf = load_my_state_dict(
            model_erf,
            torch.load(os.path.join('trained_models/erfnet_pretrained.pth')))
        model_erf.eval()
        print("ERFNet and weights LOADED successfully")

        # The window used to keep track of the validation loss.
        l1_window = []
        latest = get_latest_evaluated_checkpoint()
        # When a checkpoint was already evaluated, recover the previous loss window.
        if latest is not None:
            l1_window = coil_logger.recover_loss_window(dataset_name, None)

        model.cuda()

        best_mse = 1000
        best_error = 1000
        best_mse_iter = 0
        best_error_iter = 0

        while not maximun_checkpoint_reach(latest, g_conf.TEST_SCHEDULE):
            if is_next_checkpoint_ready(g_conf.TEST_SCHEDULE):

                latest = get_next_checkpoint(g_conf.TEST_SCHEDULE)

                checkpoint = torch.load(
                    os.path.join('_logs', exp_batch, exp_alias, 'checkpoints',
                                 str(latest) + '.pth'))
                checkpoint_iteration = checkpoint['iteration']
                print("Validation loaded ", checkpoint_iteration)

                model.load_state_dict(checkpoint['state_dict'])
                model.eval()

                accumulated_mse = 0
                accumulated_error = 0
                iteration_on_checkpoint = 0
                for data in data_loader:
                    # Compute the forward pass on a batch from the validation dataset.
                    controls = data['directions']

                    # Segment the RGB batch and build a two-channel road / not-road mask.
                    rgbs = data['rgb']
                    with torch.no_grad():
                        outputs = model_erf(rgbs)
                    labels = outputs.max(1)[1].byte().cpu().data

                    seg_road = (labels == 0)
                    seg_not_road = (labels != 0)
                    seg = torch.stack((seg_road, seg_not_road), 1).float()

                    output = model.forward_branch(
                        torch.squeeze(seg).cuda(),
                        dataset.extract_inputs(data).cuda(), controls)
                    # output = model.forward_branch(torch.squeeze(rgbs).cuda(),
                    #                               dataset.extract_inputs(data).cuda(),
                    #                               controls)

                    # The output could be either waypoints or direct control.
                    if 'waypoint1_angle' in g_conf.TARGETS:
                        write_waypoints_output(checkpoint_iteration, output)
                    else:
                        write_regular_output(checkpoint_iteration, output)

                    mse = torch.mean(
                        (output - dataset.extract_targets(data).cuda())**2
                    ).data.tolist()
                    mean_error = torch.mean(
                        torch.abs(output - dataset.extract_targets(data).cuda())
                    ).data.tolist()

                    accumulated_error += mean_error
                    accumulated_mse += mse
                    error = torch.abs(output - dataset.extract_targets(data).cuda())

                    # Log a random position.
                    position = random.randint(0, len(output.data.tolist()) - 1)

                    coil_logger.add_message(
                        'Iterating', {
                            'Checkpoint': latest,
                            'Iteration': (str(iteration_on_checkpoint * 120) + '/'
                                          + str(len(dataset))),
                            'MeanError': mean_error,
                            'MSE': mse,
                            'Output': output[position].data.tolist(),
                            'GroundTruth': dataset.extract_targets(data)[position].data.tolist(),
                            'Error': error[position].data.tolist(),
                            'Inputs': dataset.extract_inputs(data)[position].data.tolist()
                        }, latest)
                    iteration_on_checkpoint += 1
                    print("Iteration %d on Checkpoint %d : Error %f" %
                          (iteration_on_checkpoint, checkpoint_iteration, mean_error))

                """
                    ######## Finish a round of validation, write results, wait for the next ########
                """

                checkpoint_average_mse = accumulated_mse / len(data_loader)
                checkpoint_average_error = accumulated_error / len(data_loader)
                coil_logger.add_scalar('Loss', checkpoint_average_mse, latest, True)
                coil_logger.add_scalar('Error', checkpoint_average_error, latest, True)

                if checkpoint_average_mse < best_mse:
                    best_mse = checkpoint_average_mse
                    best_mse_iter = latest

                if checkpoint_average_error < best_error:
                    best_error = checkpoint_average_error
                    best_error_iter = latest

                coil_logger.add_message(
                    'Iterating', {
                        'Summary': {
                            'Error': checkpoint_average_error,
                            'Loss': checkpoint_average_mse,
                            'BestError': best_error,
                            'BestMSE': best_mse,
                            'BestMSECheckpoint': best_mse_iter,
                            'BestErrorCheckpoint': best_error_iter
                        },
                        'Checkpoint': latest
                    }, latest)

                l1_window.append(checkpoint_average_error)
                coil_logger.write_on_error_csv(dataset_name, checkpoint_average_error)

                # If we finish when validation goes stale, check the current loss window.
                if g_conf.FINISH_ON_VALIDATION_STALE is not None:
                    if dlib.count_steps_without_decrease(l1_window) > 3 and \
                            dlib.count_steps_without_decrease_robust(l1_window) > 3:
                        coil_logger.write_stop(dataset_name, latest)
                        break

            else:
                latest = get_latest_evaluated_checkpoint()
                time.sleep(1)

                coil_logger.add_message('Loading', {'Message': 'Waiting Checkpoint'})
                print("Waiting for the next Validation")

        coil_logger.add_message('Finished', {})

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})
        # We erase the output that was left unfinished when the process stopped.
        if latest is not None:
            coil_logger.erase_csv(latest)

    except RuntimeError as e:
        if latest is not None:
            coil_logger.erase_csv(latest)
        coil_logger.add_message('Error', {'Message': str(e)})

    except:
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something Happened'})
        # We erase the output that was left unfinished when the process stopped.
        if latest is not None:
            coil_logger.erase_csv(latest)
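
# A minimal, self-contained sketch of the road / not-road mask construction used
# in the validation loop above, with dummy tensors standing in for real ERFNet
# outputs. The shapes (batch of 2, 20 classes, 8x8 images) and the helper name
# build_road_mask are illustrative assumptions, not part of the codebase.
import torch

def build_road_mask(seg_logits):
    # seg_logits: (batch, num_classes, H, W); class 0 is assumed to be "road",
    # matching the (labels == 0) test in the loop above.
    labels = seg_logits.max(1)[1]            # argmax over the class dimension
    seg_road = (labels == 0)
    seg_not_road = (labels != 0)
    # Stack into a two-channel float mask of shape (batch, 2, H, W).
    return torch.stack((seg_road, seg_not_road), 1).float()

# Example:
#   dummy_logits = torch.randn(2, 20, 8, 8)
#   mask = build_road_mask(dummy_logits)   # -> shape (2, 2, 8, 8)
#   (mask.sum(1) == 1.0).all()             # each pixel is road xor not-road
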
def execute(gpu, exp_batch, exp_alias, json_file_path, suppress_output,
            encoder_params=None, plot_attentions=False):
    latest = None
    try:
        # We set the visible cuda devices
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu

        if json_file_path is not None:
            json_file_name = json_file_path.split('/')[-1].split('.')[-2]
        else:
            raise RuntimeError("You need to define the validation json file path")

        # At this point the log file with the correct naming is created.
        merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'),
                        encoder_params)

        if plot_attentions:
            set_type_of_process('validation', json_file_name + '_plotAttention')
        else:
            set_type_of_process('validation', json_file_name)

        if not os.path.exists('_output_logs'):
            os.mkdir('_output_logs')

        if suppress_output:
            sys.stdout = open(os.path.join(
                '_output_logs', exp_alias + '_' + g_conf.PROCESS_NAME + '_'
                + str(os.getpid()) + ".out"), "a", buffering=1)
            sys.stderr = open(os.path.join(
                '_output_logs', exp_alias + '_err_' + g_conf.PROCESS_NAME + '_'
                + str(os.getpid()) + ".out"), "a", buffering=1)

        # We create a file for saving the validation results.
        summary_file = os.path.join('_logs', exp_batch, g_conf.EXPERIMENT_NAME,
                                    g_conf.PROCESS_NAME + '_csv',
                                    'valid_summary_1camera.csv')
        g_conf.immutable(False)
        g_conf.DATA_USED = 'central'
        g_conf.immutable(True)
        if not os.path.exists(summary_file):
            csv_outfile = open(summary_file, 'w')
            csv_outfile.write(
                "%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s\n" %
                ('step', 'accumulated_pedestrian_TP', 'accumulated_pedestrian_FP',
                 'accumulated_pedestrian_FN', 'accumulated_pedestrian_TN',
                 'accumulated_vehicle_stop_TP', 'accumulated_vehicle_stop_FP',
                 'accumulated_vehicle_stop_FN', 'accumulated_vehicle_stop_TN',
                 'accumulated_red_tl_TP', 'accumulated_red_tl_FP',
                 'accumulated_red_tl_FN', 'accumulated_red_tl_TN',
                 'MAE_relative_angle'))
            csv_outfile.close()

        latest = get_latest_evaluated_checkpoint_2(summary_file)

        # Define the dataset. This structure has __getitem__ redefined so that the
        # HDF5 file positions can be accessed from the root directory as a vector.
        # full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], dataset_name)
        augmenter = Augmenter(None)
        # Definition of the dataset to be used. The preload name is just the validation data name.
        dataset = CoILDataset(transform=augmenter,
                              preload_name=g_conf.PROCESS_NAME + '_' + g_conf.DATA_USED,
                              process_type='validation',
                              vd_json_file_path=json_file_path)
        print("Loaded Validation dataset")

        # The data loader is the multi-threaded module from pytorch that releases
        # a number of workers to fetch all the data.
        data_loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=g_conf.BATCH_SIZE,
            shuffle=False,
            num_workers=g_conf.NUMBER_OF_LOADING_WORKERS,
            pin_memory=True)

        if g_conf.MODEL_TYPE in ['one-step-affordances']:
            # One-step training: no need to retrain the FC layers, we just take the
            # output of the encoder model as the prediction.
            model = EncoderModel(g_conf.ENCODER_MODEL_TYPE,
                                 g_conf.ENCODER_MODEL_CONFIGURATION)
            model.cuda()

        elif g_conf.MODEL_TYPE in ['separate-affordances']:
            model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION,
                              g_conf.ENCODER_MODEL_CONFIGURATION)
            model.cuda()

            encoder_model = EncoderModel(g_conf.ENCODER_MODEL_TYPE,
                                         g_conf.ENCODER_MODEL_CONFIGURATION)
            encoder_model.cuda()
            encoder_model.eval()

            # Here we load the pre-trained encoder (not fine-tuned).
            if g_conf.FREEZE_ENCODER:
                if encoder_params is not None:
                    encoder_checkpoint = torch.load(
                        os.path.join('_logs', encoder_params['encoder_folder'],
                                     encoder_params['encoder_exp'], 'checkpoints',
                                     str(encoder_params['encoder_checkpoint']) + '.pth'))
                    print("Encoder model ",
                          str(encoder_params['encoder_checkpoint']),
                          "loaded from ",
                          os.path.join('_logs', encoder_params['encoder_folder'],
                                       encoder_params['encoder_exp'], 'checkpoints'))
                    encoder_model.load_state_dict(encoder_checkpoint['state_dict'])
                    encoder_model.eval()
                    for param_ in encoder_model.parameters():
                        param_.requires_grad = False

        while not maximun_checkpoint_reach(latest, g_conf.TEST_SCHEDULE):
            latest = get_next_checkpoint_2(g_conf.TEST_SCHEDULE, summary_file)
            if os.path.exists(os.path.join('_logs', exp_batch, g_conf.EXPERIMENT_NAME,
                                           'checkpoints', str(latest) + '.pth')):
                checkpoint = torch.load(
                    os.path.join('_logs', exp_batch, g_conf.EXPERIMENT_NAME,
                                 'checkpoints', str(latest) + '.pth'))
                checkpoint_iteration = checkpoint['iteration']
                model.load_state_dict(checkpoint['state_dict'])
                print("Validation checkpoint ", checkpoint_iteration)
                model.eval()
                for param_ in model.parameters():
                    param_.requires_grad = False

                # Here we load the fine-tuned encoder.
                if not g_conf.FREEZE_ENCODER and \
                        g_conf.MODEL_TYPE not in ['one-step-affordances']:
                    encoder_checkpoint = torch.load(
                        os.path.join('_logs', exp_batch, g_conf.EXPERIMENT_NAME,
                                     'checkpoints', str(latest) + '_encoder.pth'))
                    print("FINE-TUNED encoder model ",
                          str(latest) + '_encoder.pth',
                          "loaded from ",
                          os.path.join('_logs', exp_batch, g_conf.EXPERIMENT_NAME,
                                       'checkpoints'))
                    encoder_model.load_state_dict(encoder_checkpoint['state_dict'])
                    encoder_model.eval()
                    for param_ in encoder_model.parameters():
                        param_.requires_grad = False

                accumulated_mae_ra = 0
                accumulated_pedestrian_TP = 0
                accumulated_pedestrian_TN = 0
                accumulated_pedestrian_FN = 0
                accumulated_pedestrian_FP = 0

                accumulated_red_tl_TP = 0
                accumulated_red_tl_TN = 0
                accumulated_red_tl_FP = 0
                accumulated_red_tl_FN = 0

                accumulated_vehicle_stop_TP = 0
                accumulated_vehicle_stop_TN = 0
                accumulated_vehicle_stop_FP = 0
                accumulated_vehicle_stop_FN = 0

                iteration_on_checkpoint = 0

                for data in data_loader:
                    if g_conf.MODEL_TYPE in ['one-step-affordances']:
                        c_output, r_output, layers = model.forward_outputs(
                            torch.squeeze(data['rgb'].cuda()),
                            dataset.extract_inputs(data).cuda(),
                            dataset.extract_commands(data).cuda())

                    elif g_conf.MODEL_TYPE in ['separate-affordances']:
                        if g_conf.ENCODER_MODEL_TYPE in ['action_prediction', 'stdim',
                                                         'ETEDIM', 'FIMBC',
                                                         'one-step-affordances']:
                            e, layers = encoder_model.forward_encoder(
                                torch.squeeze(data['rgb'].cuda()),
                                dataset.extract_inputs(data).cuda(),
                                torch.squeeze(dataset.extract_commands(data).cuda()))
                            c_output, r_output = model.forward_test(e)

                        elif g_conf.ENCODER_MODEL_TYPE in ['ETE', 'ETE_inverse_model',
                                                           'forward', 'ETE_stdim']:
                            e, layers = encoder_model.forward_encoder(
                                torch.squeeze(data['rgb'].cuda()),
                                dataset.extract_inputs(data).cuda(),
                                torch.squeeze(dataset.extract_commands(data).cuda()))
                            c_output, r_output = model.forward_test(e)

                    if plot_attentions:
                        attentions_path = os.path.join(
                            '_logs', exp_batch, g_conf.EXPERIMENT_NAME,
                            g_conf.PROCESS_NAME + '_attentions_' + str(latest))
                        write_attentions(torch.squeeze(data['rgb']), layers,
                                         iteration_on_checkpoint, attentions_path)

                    # Accuracy = (TP+TN)/(TP+TN+FP+FN)
                    # F1-score = 2*TP / (2*TP + FN + FP)
                    classification_gt = dataset.extract_affordances_targets(
                        data, 'classification')
                    regression_gt = dataset.extract_affordances_targets(
                        data, 'regression')

                    # Pedestrian affordance (ground-truth column 0, output head 0).
                    TP = 0
                    FN = 0
                    FP = 0
                    TN = 0
                    for i in range(classification_gt.shape[0]):
                        if classification_gt[i, 0] == (
                                c_output[0][i, 0] < c_output[0][i, 1]).type(
                                    torch.FloatTensor) == 1:
                            TP += 1
                        elif classification_gt[i, 0] == 1 and classification_gt[i, 0] != (
                                c_output[0][i, 0] < c_output[0][i, 1]).type(torch.FloatTensor):
                            FN += 1
                        elif classification_gt[i, 0] == 0 and classification_gt[i, 0] != (
                                c_output[0][i, 0] < c_output[0][i, 1]).type(torch.FloatTensor):
                            FP += 1
                        if classification_gt[i, 0] == (
                                c_output[0][i, 0] < c_output[0][i, 1]).type(
                                    torch.FloatTensor) == 0:
                            TN += 1

                    accumulated_pedestrian_TP += TP
                    accumulated_pedestrian_TN += TN
                    accumulated_pedestrian_FP += FP
                    accumulated_pedestrian_FN += FN

                    # Red traffic light affordance (ground-truth column 1, output head 1).
                    TP = 0
                    FN = 0
                    FP = 0
                    TN = 0
                    for i in range(classification_gt.shape[0]):
                        if classification_gt[i, 1] == (
                                c_output[1][i, 0] < c_output[1][i, 1]).type(
                                    torch.FloatTensor) == 1:
                            TP += 1
                        elif classification_gt[i, 1] == 1 and classification_gt[i, 1] != (
                                c_output[1][i, 0] < c_output[1][i, 1]).type(torch.FloatTensor):
                            FN += 1
                        elif classification_gt[i, 1] == 0 and classification_gt[i, 1] != (
                                c_output[1][i, 0] < c_output[1][i, 1]).type(torch.FloatTensor):
                            FP += 1
                        if classification_gt[i, 1] == (
                                c_output[1][i, 0] < c_output[1][i, 1]).type(
                                    torch.FloatTensor) == 0:
                            TN += 1

                    accumulated_red_tl_TP += TP
                    accumulated_red_tl_TN += TN
                    accumulated_red_tl_FP += FP
                    accumulated_red_tl_FN += FN

                    # Vehicle-stop affordance (ground-truth column 2, output head 2).
                    TP = 0
                    FN = 0
                    FP = 0
                    TN = 0
                    for i in range(classification_gt.shape[0]):
                        if classification_gt[i, 2] == (
                                c_output[2][i, 0] < c_output[2][i, 1]).type(
                                    torch.FloatTensor) == 1:
                            TP += 1
                        elif classification_gt[i, 2] == 1 and classification_gt[i, 2] != \
                                (c_output[2][i, 0] < c_output[2][i, 1]).type(torch.FloatTensor):
                            FN += 1
                        elif classification_gt[i, 2] == 0 and classification_gt[i, 2] != \
                                (c_output[2][i, 0] < c_output[2][i, 1]).type(torch.FloatTensor):
                            FP += 1
                        if classification_gt[i, 2] == (
                                c_output[2][i, 0] < c_output[2][i, 1]).type(
                                    torch.FloatTensor) == 0:
                            TN += 1

                    accumulated_vehicle_stop_TP += TP
                    accumulated_vehicle_stop_TN += TN
                    accumulated_vehicle_stop_FP += FP
                    accumulated_vehicle_stop_FN += FN

                    # If the data was normalized during training, we need to transform
                    # it back to its original units.
                    write_regular_output(checkpoint_iteration,
                                         torch.squeeze(r_output[0]),
                                         regression_gt[:, 0])
                    mae_ra = torch.abs(
                        regression_gt[:, 0] -
                        torch.squeeze(r_output[0]).type(torch.FloatTensor)).numpy()
                    accumulated_mae_ra += np.sum(mae_ra)

                    if iteration_on_checkpoint % 100 == 0:
                        print("Validation iteration: %d [%d/%d] on Checkpoint %d " %
                              (iteration_on_checkpoint, iteration_on_checkpoint,
                               len(data_loader), checkpoint_iteration))

                    iteration_on_checkpoint += 1

                # A better analysis is also needed here.
                # TODO: divide into curves and other cases.
                MAE_relative_angle = accumulated_mae_ra / len(dataset)

                csv_outfile = open(summary_file, 'a')
                csv_outfile.write(
                    "%s, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f" %
                    (checkpoint_iteration,
                     accumulated_pedestrian_TP, accumulated_pedestrian_FP,
                     accumulated_pedestrian_FN, accumulated_pedestrian_TN,
                     accumulated_vehicle_stop_TP, accumulated_vehicle_stop_FP,
                     accumulated_vehicle_stop_FN, accumulated_vehicle_stop_TN,
                     accumulated_red_tl_TP, accumulated_red_tl_FP,
                     accumulated_red_tl_FN, accumulated_red_tl_TN,
                     MAE_relative_angle))
                csv_outfile.write("\n")
                csv_outfile.close()

            else:
                print('The checkpoint you want to validate is not yet ready ',
                      str(latest))

        coil_logger.add_message('Finished', {})
        print('VALIDATION FINISHED !!')
        print('  Validation results saved in ==> ', summary_file)

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})
        # We erase the output that was left unfinished when the process stopped.
        if latest is not None:
            coil_logger.erase_csv(latest)

    except RuntimeError as e:
        if latest is not None:
            coil_logger.erase_csv(latest)
        coil_logger.add_message('Error', {'Message': str(e)})

    except:
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something Happened'})
        # We erase the output that was left unfinished when the process stopped.
        if latest is not None:
            coil_logger.erase_csv(latest)
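
# The three nearly identical TP/FP/FN/TN loops above differ only in the
# ground-truth column and output head; below is a hedged refactoring sketch of
# that logic as a single helper. binary_confusion is a name introduced here
# for illustration, not a function of the codebase; like the loops above, it
# assumes the positive class is predicted whenever logit 1 exceeds logit 0 and
# that the ground truth is 0/1-valued.
import torch

def binary_confusion(gt_column, head_logits):
    # gt_column: (batch,) float tensor of 0/1 ground truth for one affordance.
    # head_logits: (batch, 2) tensor; prediction is 1 iff logits[:, 0] < logits[:, 1].
    pred = (head_logits[:, 0] < head_logits[:, 1]).float()
    tp = int(((gt_column == 1) & (pred == 1)).sum())
    fp = int(((gt_column == 0) & (pred == 1)).sum())
    fn = int(((gt_column == 1) & (pred == 0)).sum())
    tn = int(((gt_column == 0) & (pred == 0)).sum())
    return tp, fp, fn, tn

# Example, with the metrics from the comments above:
# Accuracy = (TP+TN)/(TP+TN+FP+FN), F1 = 2*TP / (2*TP + FN + FP).
#   gt = torch.tensor([1., 1., 0., 0.])
#   logits = torch.tensor([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7], [0.9, 0.1]])
#   binary_confusion(gt, logits)   # -> (1, 1, 1, 1)
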
def execute(gpu, exp_batch, exp_alias, dataset_name, suppress_output):
    latest = None
    try:
        # We set the visible cuda devices
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu

        # At this point the log file with the correct naming is created.
        merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
        # The validation dataset is always fully loaded, so we fix a very high number of hours.
        g_conf.NUMBER_OF_HOURS = 10000
        set_type_of_process('validation', dataset_name)

        if not os.path.exists('_output_logs'):
            os.mkdir('_output_logs')

        if suppress_output:
            sys.stdout = open(os.path.join(
                '_output_logs', exp_alias + '_' + g_conf.PROCESS_NAME + '_'
                + str(os.getpid()) + ".out"), "a", buffering=1)
            sys.stderr = open(os.path.join(
                '_output_logs', exp_alias + '_err_' + g_conf.PROCESS_NAME + '_'
                + str(os.getpid()) + ".out"), "a", buffering=1)

        # Define the dataset.
        full_dataset = [os.path.join(os.environ["COIL_DATASET_PATH"], dataset_name)]
        augmenter = Augmenter(None)
        # Definition of the dataset to be used. The preload name is just the validation data name.
        dataset = CoILDataset(full_dataset, transform=augmenter,
                              preload_names=[dataset_name])

        # The data loader is the multi-threaded module from pytorch that releases
        # a number of workers to fetch all the data.
        data_loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=g_conf.BATCH_SIZE,
            shuffle=False,
            num_workers=g_conf.NUMBER_OF_LOADING_WORKERS,
            pin_memory=True)

        # Create the model.
        model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION)

        # The window used to keep track of the validation loss.
        l1_window = []
        # If we have already evaluated a checkpoint, recover the validation losses of
        # all previously evaluated checkpoints (the validation loss is used for
        # early stopping).
        latest = get_latest_evaluated_checkpoint()
        if latest is not None:
            l1_window = coil_logger.recover_loss_window(dataset_name, None)

        model.cuda()

        best_mse = 1000
        best_error = 1000
        best_mse_iter = 0
        best_error_iter = 0

        # Loop that validates all checkpoints as they are saved during training.
        while not maximun_checkpoint_reach(latest, g_conf.TEST_SCHEDULE):
            if is_next_checkpoint_ready(g_conf.TEST_SCHEDULE):
                with torch.no_grad():
                    # Get and load the latest checkpoint.
                    latest = get_next_checkpoint(g_conf.TEST_SCHEDULE)

                    checkpoint = torch.load(
                        os.path.join('_logs', exp_batch, exp_alias, 'checkpoints',
                                     str(latest) + '.pth'))
                    checkpoint_iteration = checkpoint['iteration']
                    print("Validation loaded ", checkpoint_iteration)

                    model.load_state_dict(checkpoint['state_dict'])
                    model.eval()

                    accumulated_mse = 0
                    accumulated_error = 0
                    iteration_on_checkpoint = 0
                    if g_conf.USE_REPRESENTATION_LOSS:
                        accumulated_perception_rep_mse = 0
                        accumulated_speed_rep_mse = 0
                        accumulated_intentions_rep_mse = 0
                        accumulated_rep_mse = 0
                        accumulated_perception_rep_error = 0
                        accumulated_speed_rep_error = 0
                        accumulated_intentions_rep_error = 0
                        accumulated_rep_error = 0

                    # Validation loop.
                    for data in data_loader:
                        # Compute the forward pass on a batch from the validation dataset.
                        controls = data['directions']

                        # Run the model forward and get the outputs. The first case
                        # corresponds to the squeeze network, the second to the driving
                        # model without mimicking losses, the last to the mimic network.
                        if "seg" in g_conf.SENSORS.keys():
                            output = model.forward_branch(
                                data, dataset.extract_inputs(data).cuda(), controls,
                                dataset.extract_intentions(data).cuda())
                        elif not g_conf.USE_REPRESENTATION_LOSS:
                            output = model.forward_branch(
                                data, dataset.extract_inputs(data).cuda(), controls)
                        else:
                            output, intermediate_reps = model.forward_branch(
                                data, dataset.extract_inputs(data).cuda(), controls)

                        write_regular_output(checkpoint_iteration, output)

                        # Compute the control loss on the current validation batch
                        # and accumulate it.
                        targets_to_use = dataset.extract_targets(data)
                        mse = torch.mean(
                            (output - targets_to_use.cuda())**2).data.tolist()
                        mean_error = torch.mean(
                            torch.abs(output - targets_to_use.cuda())).data.tolist()

                        accumulated_error += mean_error
                        accumulated_mse += mse

                        error = torch.abs(output - targets_to_use.cuda())

                        # Compute the mimicking losses on the current validation batch
                        # and accumulate them.
                        if g_conf.USE_REPRESENTATION_LOSS:
                            expert_reps = dataset.extract_representations(data)
                            # First the L1 losses (seg mask, speed, and intention
                            # mimicking losses).
                            if g_conf.USE_PERCEPTION_REP_LOSS:
                                perception_rep_loss = torch.sum(
                                    torch.abs(intermediate_reps[0] - expert_reps[0].cuda())
                                ).data.tolist() / (3 * output.shape[0])
                            else:
                                perception_rep_loss = 0
                            if g_conf.USE_SPEED_REP_LOSS:
                                speed_rep_loss = torch.sum(
                                    torch.abs(intermediate_reps[1] - expert_reps[1].cuda())
                                ).data.tolist() / (3 * output.shape[0])
                            else:
                                speed_rep_loss = 0
                            if g_conf.USE_INTENTION_REP_LOSS:
                                intentions_rep_loss = torch.sum(
                                    torch.abs(intermediate_reps[2] - expert_reps[2].cuda())
                                ).data.tolist() / (3 * output.shape[0])
                            else:
                                intentions_rep_loss = 0
                            rep_error = g_conf.REP_LOSS_WEIGHT * (
                                perception_rep_loss + speed_rep_loss + intentions_rep_loss)
                            accumulated_perception_rep_error += perception_rep_loss
                            accumulated_speed_rep_error += speed_rep_loss
                            accumulated_intentions_rep_error += intentions_rep_loss
                            accumulated_rep_error += rep_error

                            # Now the L2 losses.
                            if g_conf.USE_PERCEPTION_REP_LOSS:
                                perception_rep_loss = torch.sum(
                                    (intermediate_reps[0] - expert_reps[0].cuda())**2
                                ).data.tolist() / (3 * output.shape[0])
                            else:
                                perception_rep_loss = 0
                            if g_conf.USE_SPEED_REP_LOSS:
                                speed_rep_loss = torch.sum(
                                    (intermediate_reps[1] - expert_reps[1].cuda())**2
                                ).data.tolist() / (3 * output.shape[0])
                            else:
                                speed_rep_loss = 0
                            if g_conf.USE_INTENTION_REP_LOSS:
                                intentions_rep_loss = torch.sum(
                                    (intermediate_reps[2] - expert_reps[2].cuda())**2
                                ).data.tolist() / (3 * output.shape[0])
                            else:
                                intentions_rep_loss = 0
                            rep_mse = g_conf.REP_LOSS_WEIGHT * (
                                perception_rep_loss + speed_rep_loss + intentions_rep_loss)
                            accumulated_perception_rep_mse += perception_rep_loss
                            accumulated_speed_rep_mse += speed_rep_loss
                            accumulated_intentions_rep_mse += intentions_rep_loss
                            accumulated_rep_mse += rep_mse

                        # Log a random position.
                        position = random.randint(0, len(output.data.tolist()) - 1)

                        # Logging.
                        if g_conf.USE_REPRESENTATION_LOSS:
                            total_mse = mse + rep_mse
                            total_error = mean_error + rep_error
                            coil_logger.add_message(
                                'Iterating', {
                                    'Checkpoint': latest,
                                    'Iteration': (str(iteration_on_checkpoint * 120) + '/'
                                                  + str(len(dataset))),
                                    'MeanError': mean_error,
                                    'MSE': mse,
                                    'RepMeanError': rep_error,
                                    'RepMSE': rep_mse,
                                    'MeanTotalError': total_error,
                                    'TotalMSE': total_mse,
                                    'Output': output[position].data.tolist(),
                                    'GroundTruth': targets_to_use[position].data.tolist(),
                                    'Error': error[position].data.tolist(),
                                    'Inputs': dataset.extract_inputs(data)[position].data.tolist()
                                }, latest)
                        else:
                            coil_logger.add_message(
                                'Iterating', {
                                    'Checkpoint': latest,
                                    'Iteration': (str(iteration_on_checkpoint * 120) + '/'
                                                  + str(len(dataset))),
                                    'MeanError': mean_error,
                                    'MSE': mse,
                                    'Output': output[position].data.tolist(),
                                    'GroundTruth': targets_to_use[position].data.tolist(),
                                    'Error': error[position].data.tolist(),
                                    'Inputs': dataset.extract_inputs(data)[position].data.tolist()
                                }, latest)
                        iteration_on_checkpoint += 1

                        if g_conf.USE_REPRESENTATION_LOSS:
                            print("Iteration %d on Checkpoint %d : Error %f" %
                                  (iteration_on_checkpoint, checkpoint_iteration,
                                   total_error))
                        else:
                            print("Iteration %d on Checkpoint %d : Error %f" %
                                  (iteration_on_checkpoint, checkpoint_iteration,
                                   mean_error))

                    """
                        ######## Finish a round of validation, write results, wait for the next ########
                    """

                    # Compute the average L1 and L2 losses over the whole round of
                    # validation and log them.
                    checkpoint_average_mse = accumulated_mse / len(data_loader)
                    checkpoint_average_error = accumulated_error / len(data_loader)
                    coil_logger.add_scalar('L2 Loss', checkpoint_average_mse,
                                           latest, True)
                    coil_logger.add_scalar('Loss', checkpoint_average_error,
                                           latest, True)

                    if g_conf.USE_REPRESENTATION_LOSS:
                        checkpoint_average_perception_rep_mse = \
                            accumulated_perception_rep_mse / len(data_loader)
                        checkpoint_average_speed_rep_mse = \
                            accumulated_speed_rep_mse / len(data_loader)
                        checkpoint_average_intentions_rep_mse = \
                            accumulated_intentions_rep_mse / len(data_loader)
                        checkpoint_average_rep_mse = \
                            accumulated_rep_mse / len(data_loader)
                        checkpoint_average_total_mse = \
                            checkpoint_average_mse + checkpoint_average_rep_mse

                        checkpoint_average_perception_rep_error = \
                            accumulated_perception_rep_error / len(data_loader)
                        checkpoint_average_speed_rep_error = \
                            accumulated_speed_rep_error / len(data_loader)
                        checkpoint_average_intentions_rep_error = \
                            accumulated_intentions_rep_error / len(data_loader)
                        checkpoint_average_rep_error = \
                            accumulated_rep_error / len(data_loader)
                        checkpoint_average_total_error = \
                            checkpoint_average_error + checkpoint_average_rep_error

                        # Log the L1/L2 loss terms.
                        coil_logger.add_scalar(
                            'Perception Rep Loss',
                            checkpoint_average_perception_rep_mse, latest, True)
                        coil_logger.add_scalar(
                            'Speed Rep Loss',
                            checkpoint_average_speed_rep_mse, latest, True)
                        coil_logger.add_scalar(
                            'Intentions Rep Loss',
                            checkpoint_average_intentions_rep_mse, latest, True)
                        coil_logger.add_scalar(
                            'Overall Rep Loss',
                            checkpoint_average_rep_mse, latest, True)
                        coil_logger.add_scalar(
                            'Total L2 Loss',
                            checkpoint_average_total_mse, latest, True)
                        coil_logger.add_scalar(
                            'Perception Rep Error',
                            checkpoint_average_perception_rep_error, latest, True)
                        coil_logger.add_scalar(
                            'Speed Rep Error',
                            checkpoint_average_speed_rep_error, latest, True)
                        coil_logger.add_scalar(
                            'Intentions Rep Error',
                            checkpoint_average_intentions_rep_error, latest, True)
                        coil_logger.add_scalar(
                            'Total Rep Error',
                            checkpoint_average_rep_error, latest, True)
                        coil_logger.add_scalar(
                            'Total Loss',
                            checkpoint_average_total_error, latest, True)
                    else:
                        checkpoint_average_total_mse = checkpoint_average_mse
                        checkpoint_average_total_error = checkpoint_average_error

                    if checkpoint_average_total_mse < best_mse:
                        best_mse = checkpoint_average_total_mse
                        best_mse_iter = latest

                    if checkpoint_average_total_error < best_error:
                        best_error = checkpoint_average_total_error
                        best_error_iter = latest

                    # Write the validation results to the log / terminal.
                    if g_conf.USE_REPRESENTATION_LOSS:
                        coil_logger.add_message(
                            'Iterating', {
                                'Summary': {
                                    'Control Error': checkpoint_average_error,
                                    'Control Loss': checkpoint_average_mse,
                                    'Rep Error': checkpoint_average_rep_error,
                                    'Rep Loss': checkpoint_average_rep_mse,
                                    'Error': checkpoint_average_total_error,
                                    'Loss': checkpoint_average_total_mse,
                                    'BestError': best_error,
                                    'BestMSE': best_mse,
                                    'BestMSECheckpoint': best_mse_iter,
                                    'BestErrorCheckpoint': best_error_iter
                                },
                                'Checkpoint': latest
                            }, latest)
                    else:
                        coil_logger.add_message(
                            'Iterating', {
                                'Summary': {
                                    'Error': checkpoint_average_error,
                                    'Loss': checkpoint_average_mse,
                                    'BestError': best_error,
                                    'BestMSE': best_mse,
                                    'BestMSECheckpoint': best_mse_iter,
                                    'BestErrorCheckpoint': best_error_iter
                                },
                                'Checkpoint': latest
                            }, latest)

                    # Save the validation loss history (used for early stopping).
                    l1_window.append(checkpoint_average_total_error)
                    coil_logger.write_on_error_csv(dataset_name,
                                                   checkpoint_average_total_error)

                    # Early stopping.
                    if g_conf.FINISH_ON_VALIDATION_STALE is not None:
                        if dlib.count_steps_without_decrease(l1_window) > 3 and \
                                dlib.count_steps_without_decrease_robust(l1_window) > 3:
                            coil_logger.write_stop(dataset_name, latest)
                            break

            else:
                latest = get_latest_evaluated_checkpoint()
                time.sleep(1)

                coil_logger.add_message('Loading', {'Message': 'Waiting Checkpoint'})
                print("Waiting for the next Validation")

        coil_logger.add_message('Finished', {})

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})
        # We erase the output that was left unfinished when the process stopped.
        if latest is not None:
            coil_logger.erase_csv(latest)

    except RuntimeError as e:
        if latest is not None:
            coil_logger.erase_csv(latest)
        coil_logger.add_message('Error', {'Message': str(e)})

    except:
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something Happened'})
        # We erase the output that was left unfinished due to the process stop.
        if latest is not None:
            coil_logger.erase_csv(latest)
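
# A sketch of the early-stopping test used in the loops above, factored into a
# helper so it can be tried on a toy loss window. dlib's
# count_steps_without_decrease estimates, for a noisy time series, how many of
# the most recent steps show no real decrease; the _robust variant first drops
# the largest values (by default the top 10%) as outliers. The patience of 3
# matches the threshold in the code above; the toy numbers are made up for
# illustration.
import dlib

def validation_is_stale(loss_window, patience=3):
    return (dlib.count_steps_without_decrease(loss_window) > patience and
            dlib.count_steps_without_decrease_robust(loss_window) > patience)

# Example:
#   validation_is_stale([1.00, 0.80, 0.75, 0.74, 0.76, 0.75, 0.77, 0.76])
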
def execute(gpu, exp_batch, exp_alias, drive_conditions, params):
    """
    Main loop function. Executes driving benchmarks for the specified checkpoints.

    Args:
        gpu: the gpu to run the benchmark on
        exp_batch: the experiment batch (configuration folder) name
        exp_alias: the experiment alias (yaml file) name
        drive_conditions: the driving conditions, formatted as '<ExperimentSet>_<Town>'
        params: dictionary of driving parameters

    Returns:

    """
    try:
        print("Running ", __file__, " On GPU ", gpu, "of experiment name ", exp_alias)

        os.environ["CUDA_VISIBLE_DEVICES"] = gpu

        if not os.path.exists('_output_logs'):
            os.mkdir('_output_logs')

        merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))

        exp_set_name, town_name = drive_conditions.split('_')

        experiment_suite_module = __import__(
            'drive.suites.' + camelcase_to_snakecase(exp_set_name) + '_suite',
            fromlist=[exp_set_name])
        experiment_suite_module = getattr(experiment_suite_module, exp_set_name)
        experiment_set = experiment_suite_module()

        set_type_of_process('drive', drive_conditions)

        if params['suppress_output']:
            sys.stdout = open(os.path.join(
                '_output_logs', g_conf.PROCESS_NAME + '_'
                + str(os.getpid()) + ".out"), "a", buffering=1)
            sys.stderr = open(os.path.join(
                '_output_logs', exp_alias + '_err_' + g_conf.PROCESS_NAME + '_'
                + str(os.getpid()) + ".out"), "a", buffering=1)

        coil_logger.add_message(
            'Loading', {'Poses': experiment_set.build_experiments()[0].poses})

        if g_conf.USE_ORACLE:
            control_filename = 'control_output_auto'
        else:
            control_filename = 'control_output'

        """
            ##### Preparing the output files that will contain the driving summary #####
        """
        experiment_list = experiment_set.build_experiments()
        # Get all the uniquely named tasks.
        task_list = unique([experiment.task_name for experiment in experiment_list])

        # Now actually run the driving benchmark.
        latest = get_latest_evaluated_checkpoint(control_filename + '_' + task_list[0])

        if latest is None:
            # When nothing has been tested yet, get_latest returns None; we fix that here.
            latest = 0

        # The used tasks are hardcoded; this needs to be improved.
        file_base = os.path.join('_logs', exp_batch, exp_alias,
                                 g_conf.PROCESS_NAME + '_csv', control_filename)

        for i in range(len(task_list)):
            # Write the header of the summary file used for the final conclusion.
            write_header_control_summary(file_base, task_list[i])

        """
            ###### Run a single driving benchmark specified by the checkpoint where validation is stale ######
        """

        if g_conf.FINISH_ON_VALIDATION_STALE is not None:

            while validation_stale_point(g_conf.FINISH_ON_VALIDATION_STALE) is None:
                time.sleep(0.1)

            validation_state_iteration = validation_stale_point(
                g_conf.FINISH_ON_VALIDATION_STALE)

            driving_benchmark(validation_state_iteration, gpu, town_name,
                              experiment_set, exp_batch, exp_alias, params,
                              control_filename, task_list)

        else:
            """
                ##### Main Loop: run a benchmark for each specified checkpoint on the "Test Configuration" #####
            """
            while not maximun_checkpoint_reach(latest, g_conf.TEST_SCHEDULE):
                # Get the correct checkpoint.
                # We check readiness for one task name; all of them are ready at
                # the same time.
                if is_next_checkpoint_ready(g_conf.TEST_SCHEDULE,
                                            control_filename + '_' + task_list[0]):

                    latest = get_next_checkpoint(g_conf.TEST_SCHEDULE,
                                                 control_filename + '_' + task_list[0])

                    driving_benchmark(latest, gpu, town_name, experiment_set,
                                      exp_batch, exp_alias, params,
                                      control_filename, task_list)

                else:
                    time.sleep(0.1)

        coil_logger.add_message('Finished', {})

    except KeyboardInterrupt:
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Killed By User'})

    except:
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something happened'})
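
# A minimal sketch of the dynamic suite import performed above, using importlib
# (the modern equivalent of the __import__ call in the function). It assumes,
# as the code above does, that a suite class named e.g. 'CorlTraining' (a
# hypothetical name here) lives in drive/suites/corl_training_suite.py and that
# camelcase_to_snakecase (imported by this module) performs the name mapping.
import importlib

def load_experiment_suite(exp_set_name):
    module = importlib.import_module(
        'drive.suites.' + camelcase_to_snakecase(exp_set_name) + '_suite')
    return getattr(module, exp_set_name)

# Example:
#   experiment_set = load_experiment_suite('CorlTraining')()
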
def execute(gpu, exp_batch, exp_alias, dataset_name, validation_set=False):
    latest = None
    # We set the visible cuda devices
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu

    g_conf.immutable(False)
    # At this point the log file with the correct naming is created.
    merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
    # If using the validation dataset, fix a very high number of hours.
    if validation_set:
        g_conf.NUMBER_OF_HOURS = 10000
    g_conf.immutable(True)

    # Define the dataset.
    full_dataset = [os.path.join(os.environ["COIL_DATASET_PATH"], dataset_name)]
    augmenter = Augmenter(None)

    if validation_set:
        # Definition of the dataset to be used. The preload name is just the validation data name.
        dataset = CoILDataset(full_dataset, transform=augmenter,
                              preload_names=[dataset_name])
    else:
        dataset = CoILDataset(full_dataset, transform=augmenter,
                              preload_names=[str(g_conf.NUMBER_OF_HOURS)
                                             + 'hours_' + dataset_name],
                              train_dataset=True)

    # The data loader is the multi-threaded module from pytorch that releases
    # a number of workers to fetch all the data.
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=g_conf.BATCH_SIZE,
        shuffle=False,
        num_workers=g_conf.NUMBER_OF_LOADING_WORKERS,
        pin_memory=True)

    # Define the model.
    model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION)

    """
        ###### Run a single driving benchmark specified by the checkpoint where validation is stale ######
    """

    if g_conf.FINISH_ON_VALIDATION_STALE is not None:

        while validation_stale_point(g_conf.FINISH_ON_VALIDATION_STALE) is None:
            time.sleep(0.1)

        validation_state_iteration = validation_stale_point(
            g_conf.FINISH_ON_VALIDATION_STALE)

        checkpoint = torch.load(
            os.path.join('_logs', exp_batch, exp_alias, 'checkpoints',
                         str(validation_state_iteration) + '.pth'))
        print("Validation loaded ", validation_state_iteration)
    else:
        """
            ##### Main Loop: run a benchmark for each specified checkpoint on the "Test Configuration" #####
        """
        while not maximun_checkpoint_reach(latest, g_conf.TEST_SCHEDULE):
            # Get the correct checkpoint.
            # We check readiness for one task name; all of them are ready at the
            # same time.
            # NOTE: control_filename and task_list are assumed to be defined at
            # module scope; they are not created inside this function.
            if is_next_checkpoint_ready(g_conf.TEST_SCHEDULE,
                                        control_filename + '_' + task_list[0]):

                latest = get_next_checkpoint(g_conf.TEST_SCHEDULE,
                                             control_filename + '_' + task_list[0])

                checkpoint = torch.load(
                    os.path.join('_logs', exp_batch, exp_alias, 'checkpoints',
                                 str(latest) + '.pth'))
                print("Validation loaded ", latest)
            else:
                time.sleep(0.1)

    # Load the model and set it up for evaluation.
    model.load_state_dict(checkpoint['state_dict'])
    model.cuda()
    model.eval()

    first_iter = True
    for data in data_loader:

        # Compute the forward pass on a batch from the dataset and get the
        # intermediate representations of the squeeze network.
        if "seg" in g_conf.SENSORS.keys():
            perception_rep, speed_rep, intentions_rep = \
                model.get_intermediate_representations(
                    data,
                    dataset.extract_inputs(data).cuda(),
                    dataset.extract_intentions(data).cuda())
            perception_rep = perception_rep.data.cpu()
            speed_rep = speed_rep.data.cpu()
            intentions_rep = intentions_rep.data.cpu()

            if first_iter:
                perception_rep_all = perception_rep
                speed_rep_all = speed_rep
                intentions_rep_all = intentions_rep
            else:
                perception_rep_all = torch.cat(
                    [perception_rep_all, perception_rep], 0)
                speed_rep_all = torch.cat([speed_rep_all, speed_rep], 0)
                intentions_rep_all = torch.cat(
                    [intentions_rep_all, intentions_rep], 0)
            first_iter = False

    # Save the intermediate representations.
    perception_rep_all = perception_rep_all.tolist()
    speed_rep_all = speed_rep_all.tolist()
    intentions_rep_all = intentions_rep_all.tolist()
    np.save(
        os.path.join(
            '_preloads',
            exp_batch + '_' + exp_alias + '_' + dataset_name + '_representations'),
        [perception_rep_all, speed_rep_all, intentions_rep_all])
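
# A hedged sketch of reading back the representations saved above. np.save
# appends '.npy' to a path given without an extension, and because the payload
# is a Python list of differently shaped lists it is stored as a pickled
# object array, so np.load needs allow_pickle=True. The path pieces mirror the
# save call above; load_representations is a name introduced here for
# illustration, not a function of the codebase.
import os
import numpy as np

def load_representations(exp_batch, exp_alias, dataset_name):
    path = os.path.join(
        '_preloads',
        exp_batch + '_' + exp_alias + '_' + dataset_name + '_representations.npy')
    perception, speed, intentions = np.load(path, allow_pickle=True)
    return perception, speed, intentions
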