Example No. 1
def test_check_data_in_sync_raises():
    with pytest.raises(ValueError):
        params = {
            'data': blueno.DataConfig(**{
                'data_dir': '/home/lzhu7/elvo-analysis/data/'
                            'processed-standard/arrays/',
                'labels_path': '/home/lzhu7/elvo-analysis/data/'
                               'processed-standard/labels.csv',
                'index_col': 'Anon ID',
                'label_col': 'occlusion_exists',
                'gcs_url': 'gs://elvos/processed/processed',
            }),

            'val_split': 0.2,
            'seed': 0,
            'batch_size': 8,
            'max_epochs': 1,

            'generator': blueno.GeneratorConfig(
                generator_callable=generators.luke.standard_generators),

            'model': blueno.ModelConfig(**{
                # The callable must take in **kwargs as an argument
                'model_callable': small_model,
                'dropout_rate1': 0.8,
                'dropout_rate2': 0.7,
                'optimizer': keras.optimizers.Adam(lr=1e-4),
                'loss': keras.losses.categorical_crossentropy,
            }),
        }
        params = blueno.ParamConfig(**params)

        bluenot.check_data_in_sync(params)
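
The model callables passed to blueno.ModelConfig above are assumed to build and return a Keras model while accepting arbitrary keyword arguments; a minimal sketch of such a callable follows (the architecture is invented for illustration and is not the repository's small_model):

import keras

def small_model_sketch(input_shape=(220, 220, 3), num_classes=2, **kwargs):
    # Accepts **kwargs so extra config values (e.g. dropout_rate1) can be
    # passed through without raising a TypeError.
    model = keras.models.Sequential([
        keras.layers.Flatten(input_shape=input_shape),
        keras.layers.Dense(num_classes, activation='softmax'),
    ])
    return model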
Example No. 2
def test_prepare_data_correct_dims():
    params = {
        'data': blueno.DataConfig(**{
            'data_dir': '/home/lzhu7/elvo-analysis/data/'
                        'processed-standard/arrays/',
            'labels_path': '/home/lzhu7/elvo-analysis/data/'
                           'processed-standard/labels.csv',
            'index_col': 'Anon ID',
            'label_col': 'occlusion_exists',
            'gcs_url': 'gs://elvos/processed/processed-standard',
        }),

        'val_split': 0.2,
        'seed': 0,
        'batch_size': 8,

        'generator': blueno.GeneratorConfig(
            generator_callable=lambda: None),

        'model': blueno.ModelConfig(**{
            # The callable must take in **kwargs as an argument
            'model_callable': small_model,
            'dropout_rate1': 0.8,
            'dropout_rate2': 0.7,
            'optimizer': keras.optimizers.Adam(lr=1e-4),
            'loss': keras.losses.categorical_crossentropy,
        }),
    }
    params = blueno.ParamConfig(**params)
    _, _, y_train, y_test, _, _ = preprocessing.prepare_data(
        params, train_test_val=False)
    assert y_train.ndim == 2
    assert y_test.ndim == 2
Example No. 3
def test_start_job_log():
    x_train = np.random.uniform(0, 255, (100, 220, 220, 3))
    y_train = np.random.randint(0, 2, (100, 5))
    x_valid = np.random.uniform(0, 255, (20, 220, 220, 3))
    y_valid = np.random.randint(0, 2, (20, 5))
    params = {
        'data': blueno.DataConfig(**{
            'data_dir': '/home/lzhu7/elvo-analysis/data/'
                        'processed-standard/arrays/',
            'labels_path': '/home/lzhu7/elvo-analysis/data/'
                           'processed-standard/labels.csv',
            'index_col': 'Anon ID',
            'label_col': 'Location of occlusions on CTA (Matt verified)',
            'gcs_url': 'gs://elvos/processed/processed-standard',
        }),

        'val_split': 0.2,
        'seed': 0,
        'batch_size': 8,
        'max_epochs': 1,

        'generator': blueno.GeneratorConfig(
            generator_callable=generators.luke.standard_generators),

        'model': blueno.ModelConfig(**{
            # The callable must take in **kwargs as an argument
            'model_callable': small_model,
            'dropout_rate1': 0.8,
            'dropout_rate2': 0.7,
            'optimizer': keras.optimizers.Adam(lr=1e-4),
            'loss': keras.losses.categorical_crossentropy,
        }),
    }
    params = blueno.ParamConfig(**params)
    bluenot.start_job(x_train, y_train, x_valid, y_valid, job_name='test_job',
                      username='******', params=params,
                      log_dir='/tmp/')
    for filepath in glob.glob('/tmp/test_job*'):
        os.remove(filepath)
Example No. 4
def run_web_gpu1708_job(data_name: str, batch_size: int, val_split: float,
                        max_epochs: int, job_name: str, author_name: str):
    blueno_home = pathlib.Path('/home/lzhu7/elvo-analysis')

    data_dir = blueno_home / 'data'
    log_dir = blueno_home / 'logs'

    param_config = blueno.ParamConfig(
        data=blueno.DataConfig(
            data_dir=str(data_dir / data_name / 'arrays'),
            labels_path=str(data_dir / data_name / 'labels.csv'),
            index_col='Anon ID',
            label_col='occlusion_exists',
            gcs_url=f'gs://elvos/processed/{data_name}',
        ),
        generator=blueno.GeneratorConfig(
            generator_callable=generators.luke.standard_generators, ),
        model=blueno.ModelConfig(
            model_callable=models.luke.resnet,
            optimizer=keras.optimizers.Adam(lr=1e-5),
            loss=keras.losses.categorical_crossentropy,
        ),
        batch_size=int(batch_size),
        seed=0,
        val_split=float(val_split),
        early_stopping=False,
        max_epochs=int(max_epochs),
        job_name=job_name,
    )

    logging.info('training web job {}'.format(param_config))

    bluenot.hyperoptimize(
        [param_config],
        author_name,
        num_gpus=1,
        gpu_offset=3,
        log_dir=str(log_dir),
    )
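
A hypothetical invocation of the job defined above; the data set name matches the processed-standard data used elsewhere on this page, while the job name and author are placeholders:

run_web_gpu1708_job(data_name='processed-standard',
                    batch_size=8,
                    val_split=0.2,
                    max_epochs=1,
                    job_name='example-web-job',
                    author_name='example-author')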
Example No. 5
def load_training_xy(data_root):
    # Minimal params to generate the correct training-validation data
    # split
    params = blueno.ParamConfig(
        data=blueno.DataConfig(
            data_dir='{}/arrays'.format(data_root),
            labels_path='{}/labels.csv'.format(data_root),
            index_col='Anon ID',
            label_col='occlusion_exists',
            gcs_url='gs://elvos/processed/processed-new-training-2'),
        generator=blueno.GeneratorConfig(
            generator_callable=standard_generators),
        model=blueno.ModelConfig(model_callable=None,
                                 optimizer=None,
                                 loss=categorical_crossentropy,
                                 dropout_rate1=None,
                                 dropout_rate2=None),
        batch_size=None,
        seed=0,
        val_split=0.1,
    )
    arrays = prepare_data(params, train_test_val=False)
    return arrays[0], arrays[2]
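
A hypothetical call, assuming data_root points at a directory containing arrays/ and labels.csv (the path mirrors the layout used by the other examples on this page):

x_train, y_train = load_training_xy(
    '/home/lzhu7/elvo-analysis/data/processed-new-training-2')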
Example No. 6
def test_prepare_and_job():
    params = {
        'data': blueno.DataConfig(**{
            'data_dir': '/home/lzhu7/elvo-analysis/data/'
                        'processed-standard/arrays/',
            'labels_path': '/home/lzhu7/elvo-analysis/data/'
                           'processed-standard/labels.csv',
            'index_col': 'Anon ID',
            'label_col': 'occlusion_exists',
            'gcs_url': 'gs://elvos/processed/processed-standard',
        }),

        'val_split': 0.2,
        'seed': 0,
        'batch_size': 8,
        'max_epochs': 1,

        'generator': blueno.GeneratorConfig(
            generator_callable=generators.luke.standard_generators),

        'model': blueno.ModelConfig(**{
            # The callable must take in **kwargs as an argument
            'model_callable': small_model,
            'dropout_rate1': 0.8,
            'dropout_rate2': 0.7,
            'optimizer': keras.optimizers.Adam(lr=1e-4),
            'loss': keras.losses.categorical_crossentropy,
        }),
    }

    params = blueno.ParamConfig(**params)
    x_train, x_valid, y_train, y_valid, _, _ = \
        bluenot.preprocessing.prepare_data(params, train_test_val=False)

    bluenot.start_job(x_train, y_train, x_valid, y_valid,
                      job_name='test_prepare_and_job', username='******',
                      params=params)
Example No. 7
def test_prepare_data_matching_indices():
    params = {
        'data': blueno.DataConfig(**{
            'data_dir': '/home/lzhu7/elvo-analysis/data/'
                        'processed-standard/arrays/',
            'labels_path': '/home/lzhu7/elvo-analysis/data/'
                           'processed-standard/labels.csv',
            'index_col': 'Anon ID',
            'label_col': 'occlusion_exists',
            'gcs_url': 'gs://elvos/processed/processed-standard'
        }),

        'val_split': 0.2,
        'seed': 0,
        'batch_size': 8,
        'max_epochs': 1,

        'generator': blueno.GeneratorConfig(
            generator_callable=lambda: None),

        'model': blueno.ModelConfig(**{
            # The callable must take in **kwargs as an argument
            'model_callable': small_model,
            'dropout_rate1': 0.8,
            'dropout_rate2': 0.7,
            'optimizer': keras.optimizers.Adam(lr=1e-4),
            'loss': keras.losses.categorical_crossentropy,
        }),
    }
    params = blueno.ParamConfig(**params)
    _, _, y_train, y_test, id_train, id_test = preprocessing.prepare_data(
        params, train_test_val=False)
    for i, id_ in enumerate(id_test):
        if id_ == '068WBWCQGW5JHBYV':
            assert y_test[i][0] == 1
        elif id_ == 'FBGMN3O08GW5GG91':
            assert y_test[i][1] == 1
Example No. 8
def simple_ensemble(model_blob_names: List[str],
                    data_dir: str,
                    labels_path: str,
                    loss: Callable,
                    seed: int,
                    val_split: float,
                    train_test_val: bool,
                    sort: bool):
    """Creates an ensemble from the list of model_urls.

    DO NOT mix models in compat/ with those in sorted_models/.

    :param model_blob_names: a list of model blob names, like
        compat/
    :param data_dir: the path to the data used by ALL models
    :param labels_path: the path to the labels used by ALL models
    :param loss: a loss function like keras.losses.categorical_crossentropy,
        used by ALL models
    :param seed: the seed of ALL of the models
    :param val_split: the val split of ALL of the models
    :param train_test_val: True if train_test_val split was used on the models
    :param sort: set to True if you are loading from sorted_models/, False
        if loading from compat/
    :return:
    """

    # Set the params variable from the function arguments,
    # we'll need this to load the data as x_train, y_train, ...
    params = blueno.ParamConfig(
        data=blueno.DataConfig(
            # TODO: Generalize to work for all users
            data_dir=data_dir,
            labels_path=labels_path,
            index_col='Anon ID',
            label_col='occlusion_exists',
            gcs_url='',
        ),
        generator=None,
        model=blueno.ModelConfig(
            model_callable=None,
            optimizer=None,
            # TODO: Some may use a different loss
            loss=loss,
        ),
        batch_size=None,
        seed=seed,
        val_split=val_split
    )

    x_train, x_valid, y_train, y_valid, _, _ = prepare_data(
        params, train_test_val=train_test_val, sort=sort)
    datagen = ImageDataGenerator(featurewise_center=True,
                                 featurewise_std_normalization=True)
    datagen.fit(x_train)

    client = storage.Client(project='elvo-198322')
    bucket = storage.Bucket(client, name='elvos')

    # This is a copy of the ensemble_models function, using
    # model names instead.
    models = []
    time1 = time.time()
    for i, blob_name in enumerate(model_blob_names):
        # Here we load and evaluate each individual model
        # so we can be sure that our data, validation split, and seed
        # are correct
        blob = bucket.get_blob(blob_name)
        if blob is None:
            raise ValueError(f'Blob {blob_name} does not exist')
        model_filepath = f'{i}.hdf5'

        print(f'downloading model {blob_name}')
        time2 = time.time()
        blob.download_to_filename(model_filepath)
        time3 = time.time()
        print(f'seconds to download: {time3 - time2}')

        print(f'loading model {blob_name}')
        model: keras.Model
        model = load_model(model_filepath, compile=True)
        os.remove(model_filepath)
        time4 = time.time()
        print(f'seconds to load: {time4 - time3}')
        # Used to check the model
        evaluate_model(model, datagen, x_valid, y_valid)

        model.name = f'model_{i}'
        models.append(model)

    # Finally we ensemble and evaluate the models here
    print('using models {}'.format(models))
    model_input = layers.Input(shape=models[0].input_shape[1:])
    ensemble = ensemble_models(models, model_input)

    evaluate_model(ensemble, datagen, x_valid, y_valid)
    time7 = time.time()
    print(f'seconds per ensemble: {time7 - time1}', flush=True)
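
A hypothetical call to simple_ensemble; the model blob names are placeholders, while the data paths, loss, and split mirror values used elsewhere on this page:

simple_ensemble(
    model_blob_names=['sorted_models/model_a.hdf5',
                      'sorted_models/model_b.hdf5'],
    data_dir='/home/lzhu7/elvo-analysis/data/processed-standard/arrays/',
    labels_path='/home/lzhu7/elvo-analysis/data/processed-standard/labels.csv',
    loss=keras.losses.categorical_crossentropy,
    seed=0,
    val_split=0.1,
    train_test_val=False,
    sort=True)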
Example No. 9
        ],
        'loss': [
            keras.losses.categorical_crossentropy,
        ],
        'freeze': [False],
    }))

model_list = [blueno.ModelConfig(**m) for m in model_list]

PARAM_GRID = model_selection.ParameterGrid({
    'data': [
        blueno.DataConfig(
            data_dir=str(
                pathlib.Path(DATA_DIR) / 'processed-new-training-2/arrays/'),
            labels_path=str(
                pathlib.Path(DATA_DIR) /
                'processed-new-training-2/labels.csv'),
            index_col='Anon ID',
            label_col='occlusion_exists',
            gcs_url='gs://elvos/processed/processed-new-training-2')
    ],
    'generator': [
        blueno.GeneratorConfig(
            generator_callable=generators.luke.standard_generators,
            rotation_range=30)
    ],
    'model':
    model_list,
    'batch_size': [5],
    'seed': [0],
    'val_split': [0.1],  # So we run the grid 16 times
Example No. 10
    }))

model_list = [blueno.ModelConfig(**m) for m in model_list]

data_list = list(
    model_selection.ParameterGrid({
        'data_dir':
        [str(pathlib.Path(DATA_DIR) / 'processed-lower' / 'arrays')],
        'labels_path':
        [str(pathlib.Path(DATA_DIR) / 'processed-lower' / 'labels.csv')],
        'index_col': ['Anon ID'],
        'label_col': ['occlusion_exists'],
        'gcs_url': ['gs://elvos/processed/processed-lower'],
    }))

data_list = [blueno.DataConfig(**d) for d in data_list]

PARAM_GRID = model_selection.ParameterGrid({
    'data':
    data_list,
    'generator': [
        blueno.GeneratorConfig(
            generator_callable=generators.luke.standard_generators,
            rotation_range=30)
    ],
    'model':
    model_list,
    'batch_size': [8],
    'seed': [0, 1, 2, 3, 4, 5],
    'val_split': [0.1, 0.2, 0.3],
    'reduce_lr': [True, False],