Exemplo n.º 1
0
async def register_sample(client: blueno.PlatformClient,
                          dataset_name: str,
                          sample: Dict,
                          split_name: str,
                          sample_no: int):
    """
    Save one sample's image to the filesystem store and register it.

    The value under ``sample['image']`` is written with ``numpy.save`` to
    ``FILESYSTEM_STORE_ROOT/data/{dataset_name}/{sample_name}.npy`` and the
    sample is then registered through ``client`` with the data URL
    ``file://data/{dataset_name}/{sample_name}.npy`` and image type '2D'.

    The sample name comes from the first file-name-like key found in
    ``sample`` (with '/' replaced by '-'); if there is none it falls back
    to ``{split_name}-{sample_no}``. The label is the decoded
    ``sample['label']`` when present, otherwise ``None``. Every key other
    than 'image' is decoded into the registered 'feature' info; features
    whose decoding raises ``TypeError`` are logged and left out.
    """
    started_at = time.time()

    name_keys = ('filename', 'file_name',
                 'image/filename', 'image/file_name')
    # The sample's own iteration order decides which candidate key wins.
    name_key = next((k for k in sample if k in name_keys), None)
    if name_key is None:
        sample_name = f'{split_name}-{sample_no}'
    else:
        sample_name = decode(sample[name_key]).replace('/', '-')

    logging.info(f'registering sample {sample_name}')

    image_array = sample['image']
    label = decode(sample['label']) if 'label' in sample else None

    feature_info = {}
    for feature_name, raw_value in sample.items():
        if feature_name == 'image':
            continue
        try:
            feature_info[feature_name] = decode(raw_value)
        except TypeError as err:
            # Best effort: an undecodable feature is reported and skipped.
            logging.info(f"Failed to decode feature '{feature_name}': {err}")

    rel_sample_path = f'data/{dataset_name}/{sample_name}.npy'
    target_path = pathlib.Path(FILESYSTEM_STORE_ROOT) / rel_sample_path
    target_path.parent.mkdir(parents=True, exist_ok=True)
    numpy.save(target_path, image_array)

    other_info = {
        'feature': feature_info,
    }
    client.register_sample(
        sample_name,
        dataset_name,
        data_url=f'file://{rel_sample_path}',
        image_type='2D',
        label=label,
        split=split_name,
        other_info=other_info,
    )

    elapsed = time.time() - started_at
    logging.info(f'registered sample {sample_name} in {elapsed} seconds')
Exemplo n.º 2
0
def test_register_sample_validate(client: PlatformClient):
    """
    Check that a failed validated registration leaves no sample behind.

    Registering with ``validate=True`` and the data URL used here is
    expected to raise ``PlatformError``; afterwards the dataset must
    still list zero samples.
    """
    dataset_name = 'blueno::test_register_sample_validate'
    data_url = 'gs://elvo-platform/test/register_validate/no-data.xzx'

    client.create_dataset(dataset_name)

    with pytest.raises(PlatformError):
        client.register_sample('test-sample',
                               dataset_name,
                               data_url=data_url,
                               validate=True,
                               split='training')

    # The rejected registration must not have created a sample.
    remaining = client.list_samples(dataset_name)
    assert len(remaining) == 0
Exemplo n.º 3
0
def register():
    """
    Register every '.npy' blob under 'elvo/raw/numpy/' as a 3D sample.

    Creates DATASET_NAME on the platform, lists the blobs of the
    'elvo-platform' bucket with that prefix and registers each one under
    its file name without the '.npy' extension, using its gs:// URL.
    Prints the outcome and elapsed time of each registration.
    """
    platform_client = PlatformClient(API_SERVER, EMAIL, PASSWORD)
    platform_client.create_dataset(
        DATASET_NAME, description="Raw singlephase ELVO scans in NPY form.")

    bucket = storage.Client().get_bucket('elvo-platform')
    suffix = '.npy'

    blob: storage.Blob
    for blob in bucket.list_blobs(prefix='elvo/raw/numpy/'):
        blob_name = blob.name
        if not blob_name.endswith(suffix):
            continue

        # Last path component of the blob name, minus the extension.
        filename = blob_name.rpartition('/')[2]
        sample_name = filename[:-len(suffix)]
        gcs_url = f'gs://{bucket.name}/{blob_name}'
        print(f"Registering sample={sample_name} with url={gcs_url}",
              flush=True)

        started_at = time.time()
        success = platform_client.register_sample(sample_name,
                                                  DATASET_NAME,
                                                  gcs_url,
                                                  image_type='3D')
        elapsed = time.time() - started_at

        # A falsy return value is reported as "already exists".
        if success:
            outcome = f"Registered {sample_name}"
        else:
            outcome = f"Found {sample_name} exists"
        print(f"{outcome} in {elapsed} seconds")
Exemplo n.º 4
0
def load():
    """
    Register every '.npz' file found under the multiphase ELVO directory.

    Creates DATASET_NAME, walks the dataset directory tree and registers
    each '.npz' file as a 3D sample. The sample name is the file name up
    to its first '.', and the label is 'positive' for file names starting
    with 'P' and 'negative' otherwise. Prints the outcome and elapsed
    time of each registration.
    """
    client = PlatformClient(API_SERVER, EMAIL, PASSWORD)

    client.create_dataset(DATASET_NAME,
                          description="Raw multiphase ELVO scans in NPZ form.")

    dataset_root = '/research/rih-cs/datasets/elvo-multiphase/v1.0'

    filenames: List[str]
    for _dirpath, _dirnames, filenames in os.walk(dataset_root):
        for filename in filenames:
            if not filename.endswith('.npz'):
                continue

            sample_name = filename.partition('.')[0]
            if filename.startswith('P'):
                label = 'positive'
            else:
                label = 'negative'
            # The URL uses only DATA_PREFIX and the bare file name; the
            # directory the file was found in is not part of it.
            data_url = f'{DATA_PREFIX}/{filename}'
            print(f"Registering sample={sample_name} with"
                  f" label={label} and url={data_url}", flush=True)

            started_at = time.time()
            success = client.register_sample(
                sample_name,
                DATASET_NAME,
                data_url=data_url,
                image_type='3D',
                label=label,
            )
            elapsed = time.time() - started_at

            # A falsy return value is reported as "already exists".
            if success:
                print(f"Registered {filename} in {elapsed} seconds")
            else:
                print(f"Found {filename} exists in {elapsed} seconds")
Exemplo n.º 5
0
def test_crud_samples(client: PlatformClient):
    """
    Exercise registering, listing and deleting samples in one dataset.

    Covers: a first registration succeeding, a repeated registration of
    the same name being rejected without changing the stored split, a
    second distinct sample being added, and deletion emptying the
    dataset before the dataset itself is removed.
    """
    dataset = 'blueno::test_crud_samples'
    first_sample, second_sample, _spare_sample = (
        'smaple1',
        'snapple2',
        'water3',
    )
    no_data_url = 'file://test/crud_samples/no-data.xzx'

    client.create_dataset(dataset)

    # Registering without validation is expected to succeed.
    created = client.register_sample(
        first_sample,
        dataset,
        data_url=no_data_url,
        validate=False,
        split='training')
    assert created
    assert len(client.list_samples(dataset)) == 1

    # Registering the same name again is expected to be rejected ...
    duplicate = client.register_sample(
        first_sample,
        dataset,
        data_url=no_data_url,
        validate=False,
        split='test')
    assert not duplicate
    listed_samples = client.list_samples(dataset)
    assert len(listed_samples) == 1
    # ... and must leave the originally stored 'info' untouched.
    assert listed_samples[0]['info']['split'] == 'training'

    # A second, differently named sample is expected to be accepted.
    created = client.register_sample(
        name=second_sample,
        dataset=dataset,
        data_url='file://test/crud_samples/with-data.txt',
        validate=False,
        split='training')
    assert created
    listed_samples = client.list_samples(dataset)
    assert len(listed_samples) == 2

    # Deleting both samples must leave the dataset empty.
    for sample_name in (first_sample, second_sample):
        client.delete_sample(sample_name, dataset)
    assert len(client.list_samples(dataset)) == 0

    client.delete_dataset(dataset)
Exemplo n.º 6
0
def register_mnist_az():
    """
    Register every '.png' file under 'mnist_png' in the 'mnist-az' dataset.

    For each PNG, the label is the last '/'-separated component of its
    directory and the split is the second component. The sample name is
    the file name up to its first '.', followed by '-' and the split.
    The data URL is 'az://ml-platform/' plus the directory with
    'mnist_png' replaced by 'data', plus the file name. Samples are
    registered with validate=False; the outcome and elapsed time of each
    registration are printed.
    """
    client = PlatformClient(API_SERVER, EMAIL, PASSWORD)
    dataset_name = 'mnist-az'

    client.create_dataset(dataset_name,
                          description="MNIST on Azure in PNG form.")

    filenames: List[str]
    for dirpath, _dirnames, filenames in os.walk('mnist_png'):
        for filename in filenames:
            if not filename.endswith('.png'):
                continue

            # Split the directory only for actual PNG files, so that
            # directories without PNGs are never indexed.
            dir_parts = dirpath.split('/')
            label = dir_parts[-1]
            split = dir_parts[1]

            started_at = time.time()
            stem = filename.partition('.')[0]
            sample_name = f"{stem}-{split}"
            remote_dir = dirpath.replace('mnist_png', 'data')
            data_url = f'az://ml-platform/{remote_dir}/{filename}'
            print(
                f"Registering {sample_name} with label {label},"
                f" split {split}, and data_url {data_url}",
                flush=True)

            registered = client.register_sample(
                sample_name,
                dataset_name,
                data_url=data_url,
                validate=False,
                label=label,
                split=split,
            )
            elapsed = time.time() - started_at

            # A falsy return value is reported as "already exists".
            status = "REGISTERED" if registered else "ALREADY EXISTS"
            print(f"{status}: processed {filename}"
                  f" in {elapsed} seconds")