Example #1
import argparse

# Note: get_validation_datasets, get_driving_environments and the printer
# module are assumed to be imported from the surrounding project.

if __name__ == '__main__':
    argparser = argparse.ArgumentParser(description=__doc__)

    argparser.add_argument(
        '--check-status',
        action='store_true',
        dest='check_status',
    )
    argparser.add_argument('--folder', default='eccv', type=str)
    argparser.add_argument('--erase-experiments',
                           nargs='+',
                           dest='erase_experiments',
                           type=str)

    args = argparser.parse_args()

    # Note: this is effectively a fixed parameter: how much GPU capacity a
    # validation, a train, and a drive process each occupy.

    if args.check_status:
        validation_datasets = get_validation_datasets(args.folder)
        drive_environments = get_driving_environments(args.folder)
        printer.plot_folder_summaries(args.folder,
                                      True,
                                      validation_datasets,
                                      drive_environments,
                                      verbose=False)

    if args.erase_experiments:
        pass  # Erasing experiments is left unimplemented in this example.
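
For quick reference, the parser above can be exercised by handing parse_args an explicit argument list; a minimal sketch (the '--folder' value matches the default from the example, the rest is illustrative):

args = argparser.parse_args(['--check-status', '--folder', 'eccv'])
assert args.check_status and args.folder == 'eccv'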
Example #2
import heapq
import os
import time

# Note: execute_train, execute_validation, execute_drive, get_gpu_resources,
# allocate_gpu_resources, mount_experiment_heap, and the monitorer and printer
# modules are assumed to be imported from the surrounding project.


def folder_execute(params=None):
    """
    Execute a folder of experiments. It will execute the training and
    all the selected evaluations for each of the models present in the folder.

    Args:
        params: a dictionary containing:
            gpus: the GPU numbers that are going to be allocated for the experiments
            gpu_value: the "value" of each GPU; depending on this value, more or
                       fewer experiments will be allocated per GPU
            folder: the folder where all the experiment configuration files are
            validation_datasets: the validation datasets on which each experiment
                                 will be validated
            driving_environments: the driving environments where the models are
                                  going to be tested
    """

    folder = params['folder']
    allocated_gpus = params['gpus']
    validation_datasets = params['validation_datasets']
    driving_environments = params['driving_environments']
    allocation_parameters = params['allocation_parameters']

    experiments_list = os.listdir(os.path.join('configs', folder))
    # Strip the file extension to get the experiment names.
    experiments_list = [
        experiment.split('.')[-2] for experiment in experiments_list
    ]

    # Start each allocated GPU with its full resource capacity.
    allocated_gpus = {
        gpu: allocation_parameters['gpu_value']
        for gpu in allocated_gpus
    }

    executing_processes = []

    free_gpus, resources_on_most_free_gpu, executing_processes = get_gpu_resources(
        allocated_gpus, executing_processes, allocation_parameters)

    # tasks_queue is a priority queue of tasks to be executed. The priority is
    # always train, then test, then validation.
    tasks_queue = mount_experiment_heap(folder, experiments_list,
                                        params['is_training'], [], [],
                                        validation_datasets,
                                        driving_environments)

    # No process is executing right now.

    while True:
        # If a task is not done or executing, it goes back onto the list.
        # Keep scheduling while the freest GPU can still fit the cheapest task.

        while resources_on_most_free_gpu >= min([allocation_parameters['train_cost'],
                                                 allocation_parameters['validation_cost'],
                                                 allocation_parameters['drive_cost']]) \
                and tasks_queue != []:
            # Pop the highest-priority task from the heap.
            popped_thing = heapq.heappop(tasks_queue)
            process_specs = popped_thing[2]  # Index 2 holds the process-spec dict.

            # Get the train status; it determines whether a validation or drive
            # process can be scheduled.
            train_status = monitorer.get_status(folder,
                                                process_specs['experiment'],
                                                'train')[0]
            # ADD TRAIN TO EXECUTE
            if process_specs['type'] == 'train' and resources_on_most_free_gpu >= \
                    allocation_parameters['train_cost']:
                free_gpus, resources_on_most_free_gpu, gpu_number = allocate_gpu_resources(
                    free_gpus, allocation_parameters['train_cost'])

                execute_train(gpu_number, process_specs['folder'],
                              process_specs['experiment'],
                              params['number_of_workers'])
                process_specs.update({'gpu': gpu_number})

                executing_processes.append(process_specs)
            # ADD DRIVE TO EXECUTE
            elif process_specs['type'] == 'drive' and resources_on_most_free_gpu >= \
                    allocation_parameters['drive_cost'] \
                    and train_status in ('Iterating', 'Loading', 'Finished'):
                print(process_specs['type'])
                free_gpus, resources_on_most_free_gpu, gpu_number = allocate_gpu_resources(
                    free_gpus, allocation_parameters['drive_cost'])
                execute_drive(gpu_number, process_specs['folder'],
                              process_specs['experiment'],
                              process_specs['environment'],
                              params['driving_parameters'])
                process_specs.update({'gpu': gpu_number})
                executing_processes.append(process_specs)
            # ADD VALIDATION TO EXECUTE
            elif process_specs['type'] == 'validation' and resources_on_most_free_gpu >= \
                    allocation_parameters['validation_cost'] \
                    and train_status in ('Iterating', 'Loading', 'Finished'):
                free_gpus, resources_on_most_free_gpu, gpu_number = allocate_gpu_resources(
                    free_gpus, allocation_parameters['validation_cost'])
                execute_validation(gpu_number, process_specs['folder'],
                                   process_specs['experiment'],
                                   process_specs['dataset'])
                process_specs.update({'gpu': gpu_number})
                executing_processes.append(process_specs)

        tasks_queue = mount_experiment_heap(folder, experiments_list,
                                            params['is_training'],
                                            executing_processes, tasks_queue,
                                            validation_datasets,
                                            driving_environments, False)

        printer.plot_folder_summaries(folder, params['is_training'],
                                      validation_datasets,
                                      driving_environments)
        # Check the allocated processes and see which ones have finished.

        if len(tasks_queue) == 0 and len(executing_processes) == 0:
            break

        free_gpus, resources_on_most_free_gpu, executing_processes = get_gpu_resources(
            allocated_gpus, executing_processes, allocation_parameters)

        time.sleep(10)

    print("ALL EXPERIMENTS EXECUTED")
Example #3
import heapq
import os
import time

# Note: execute_train, execute_validation, execute_drive, get_gpu_resources,
# allocate_gpu_resources, mount_experiment_heap, and the monitorer and printer
# modules are assumed to be imported from the surrounding project.


def folder_execute(params=None):
    """
    Execute a folder of experiments. In this mode the training software keeps
    all the experiments in the folder running, and it forks a process to run
    the monitor over the training logs.

    Args:
        params: a dictionary of execution parameters (folder, gpus,
                validation_datasets, driving_environments,
                allocation_parameters, is_training, no_screen)
    """

    folder = params['folder']
    allocated_gpus = params['gpus']
    validation_datasets = params['validation_datasets']
    driving_environments = params['driving_environments']
    allocation_parameters = params['allocation_parameters']

    experiments_list = os.listdir(os.path.join('configs', folder))
    # Strip the file extension to get the experiment names.
    experiments_list = [experiment.split('.')[-2] for experiment in experiments_list]

    # Each GPU has a maximum of 2 slots.

    allocated_gpus = {gpu: allocation_parameters['gpu_value'] for gpu in allocated_gpus}

    executing_processes = []

    free_gpus, resources_on_most_free_gpu, executing_processes = get_gpu_resources(allocated_gpus,
                                                                                   executing_processes,
                                                                                   allocation_parameters)

    # tasks_queue is a priority queue of tasks to be executed. The priority is
    # always train, then test, then validation.
    # TODO: change the priority to test the ones that have already been trained.
    tasks_queue = mount_experiment_heap(folder, experiments_list, params['is_training'],
                                        [], [],
                                        validation_datasets, driving_environments)

    # No process is executing right now.

    print(tasks_queue)

    # TODO: the while should go outside, so the monitorer process is independent of the type of execution

    while True:
        # If a task is not done or executing, it goes back onto the list.
        # Keep scheduling while the freest GPU can still fit the cheapest task.

        while resources_on_most_free_gpu >= min([allocation_parameters['train_cost'],
                                                 allocation_parameters['validation_cost'],
                                                 allocation_parameters['drive_cost']]) \
                and tasks_queue != []:
            # Pop the highest-priority task from the heap.
            popped_thing = heapq.heappop(tasks_queue)
            process_specs = popped_thing[2]  # Index 2 holds the process-spec dict.

            # Get the train status; it determines whether a validation or drive
            # process can be scheduled.
            train_status = monitorer.get_status(folder, process_specs['experiment'], 'train')[0]

            if process_specs['type'] == 'train' and resources_on_most_free_gpu >= \
                    allocation_parameters['train_cost']:
                free_gpus, resources_on_most_free_gpu, gpu_number = allocate_gpu_resources(
                    free_gpus,
                    allocation_parameters['train_cost'])

                execute_train(gpu_number, process_specs['folder'], process_specs['experiment'])
                process_specs.update({'gpu': gpu_number})

                executing_processes.append(process_specs)

            elif process_specs['type'] == 'validation' and resources_on_most_free_gpu >= \
                    allocation_parameters['validation_cost'] \
                    and train_status in ('Iterating', 'Loading', 'Finished'):
                free_gpus, resources_on_most_free_gpu, gpu_number = allocate_gpu_resources(
                    free_gpus, allocation_parameters['validation_cost'])
                execute_validation(gpu_number, process_specs['folder'], process_specs['experiment'],
                                   process_specs['dataset'])
                process_specs.update({'gpu': gpu_number})
                executing_processes.append(process_specs)

            elif process_specs['type'] == 'drive' and resources_on_most_free_gpu >= \
                    allocation_parameters['drive_cost'] \
                    and train_status in ('Iterating', 'Loading', 'Finished'):
                free_gpus, resources_on_most_free_gpu, gpu_number = allocate_gpu_resources(
                    free_gpus, allocation_parameters['drive_cost'])
                execute_drive(gpu_number, process_specs['folder'], process_specs['experiment'],
                              process_specs['environment'], no_screen=params['no_screen'])
                process_specs.update({'gpu': gpu_number})
                executing_processes.append(process_specs)

        tasks_queue = mount_experiment_heap(folder, experiments_list, params['is_training'],
                                            executing_processes, tasks_queue,
                                            validation_datasets, driving_environments, False)

        printer.plot_folder_summaries(folder,
                                      params['is_training'],
                                      validation_datasets,
                                      driving_environments)
        # Check the allocated processes and see which ones have finished.
        free_gpus, resources_on_most_free_gpu, executing_processes = get_gpu_resources(
            allocated_gpus,
            executing_processes,
            allocation_parameters)

        if len(tasks_queue) == 0 and len(executing_processes) == 0:
            break
        print ("Task queue", tasks_queue)
        print ("")
        print ("exec proc", executing_processes)
        print("resources", free_gpus)
        time.sleep(10)

    print("ALL EXPERIMENTS EXECUTED")