def test_records_sample(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, 100 + 1):
        key = 'dir{}/subdir{}/key{}'.format(*[i] * 3)
        value = 'value{}'.format(i)
        store.store[key] = value
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)

    records_1 = dataset.records(spark_context,
                                decode=lambda x: x,
                                sample=0.1,
                                seed=None).collect()
    assert len(records_1) == 10

    records_2 = dataset.records(spark_context,
                                decode=lambda x: x,
                                sample=0.1,
                                seed=None).collect()

    # The sampling seed is different, so we have two different samples.
    assert records_1 != records_2

    records_1 = dataset.records(spark_context, decode=lambda x: x,
                                sample=0.1).collect()
    records_2 = dataset.records(spark_context, decode=lambda x: x,
                                sample=0.1).collect()

    # Same seed, same sample.
    assert records_1 == records_2
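The examples on this page exercise moztelemetry's Dataset against an InMemoryStore and omit their module-level setup. A minimal header that should cover the names they use is sketched below; the exact import paths (moztelemetry.dataset, moztelemetry.store) and the spark_context / data_dir pytest fixtures are assumptions based on the identifiers in the snippets, not something shown on this page.

import json
import os
from concurrent import futures

import pytest

import moztelemetry.dataset
from moztelemetry.dataset import Dataset
from moztelemetry.store import InMemoryStore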
Example #2
@pytest.fixture  # decorator assumed; this helper reads as a pytest fixture
def dataset():
    bucket_name = 'test_bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = json.dumps({'foo': 1})
    store.store['dir2/subdir2/key2'] = json.dumps({'foo': 2})
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    return dataset
Example #3
def test_scan_multiple_where_params(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir1/another-dir/key2'] = 'value2'
    dataset = (Dataset(bucket_name, ['dim1', 'dim2'], store=store)
               .where(dim1='dir1', dim2='subdir1'))
    summaries = dataset.summaries(spark_context)
    expected_key = 'dir1/subdir1/key1'
    assert summaries == [{'key': expected_key, 'size': len(store.store[expected_key])}]
Example #4
def test_scan_multiple_where_params(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir1/another-dir/key2'] = 'value2'
    dataset = (Dataset(bucket_name, ['dim1', 'dim2'], store=store)
               .where(dim1='dir1', dim2='subdir1'))
    summaries = dataset.summaries(spark_context)
    expected_key = 'dir1/subdir1/key1'
    assert summaries == [{'key': expected_key, 'size': len(store.store[expected_key])}]
Example #5
def test_records_limit_and_sample(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, 100 + 1):
        key = 'dir{}/subdir{}/key{}'.format(*[i] * 3)
        value = 'value{}'.format(i)
        store.store[key] = value
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    records = dataset.records(spark_context, decode=lambda x: x, limit=5, sample=0.9)
    assert records.count() == 5
Example #6
def test_records(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    records = dataset.records(spark_context, decode=lambda x: x)
    records = sorted(records.collect())

    assert records == [b'value1', b'value2']
Example #7
def test_records(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    records = dataset.records(spark_context, decode=lambda x: x)
    records = records.collect()

    assert records == [b'value1', b'value2']
Example #8
def test_records_limit(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, 100 + 1):
        key = 'dir{}/subdir{}/key{}'.format(*[i] * 3)
        value = 'value{}'.format(i)
        store.store[key] = value
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    records = dataset.records(spark_context, decode=lambda x: x, limit=5)
    assert records.count() == 5
Example #9
def test_scan_with_prefix():
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['prefix1/dir1/subdir1/key1'] = 'value1'
    store.store['prefix2/dir2/another-dir/key2'] = 'value2'
    dataset = Dataset(bucket_name, ['dim1', 'dim2'],
                      clauses={'dim1': lambda x: x == 'dir1'}, store=store)
    with futures.ProcessPoolExecutor(1) as executor:
        folders = dataset._scan(['dim1', 'dim2'], ['prefix1/'], dataset.clauses, executor)
    assert list(folders) == ['prefix1/dir1/']
Example #10
def test_records_summaries(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store, max_concurrency=1)
    records = dataset.records(spark_context, decode=lambda x: x,
                              summaries=[{'key': 'dir1/subdir1/key1', 'size': len('value1')}])
    records = records.collect()

    assert records == [b'value1']
Example #11
def test_summaries_with_limit(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store, max_concurrency=1)
    summaries = dataset.summaries(spark_context, 1)

    assert len(summaries) == 1

    assert summaries[0]['key'] in store.store
Example #12
def test_scan_no_clause():
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    key = 'dir1/dir2/key1'
    value = 'value1'
    store.store[key] = value

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    with futures.ProcessPoolExecutor(1) as executor:
        folders = dataset._scan(['dim1', 'subdir'], ['prefix'], {}, executor)
    assert list(folders) == ['prefix']
Example #13
def test_scan_with_clause():
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/another-dir/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'],
                      clauses={'dim1': lambda x: x == 'dir1'}, store=store)
    with futures.ProcessPoolExecutor(1) as executor:
        folders = dataset._scan(['dim1', 'dim2'], [''], dataset.clauses, executor)
    assert list(folders) == ['dir1/']
Example #14
def test_summaries_with_limit():
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    summaries = list(dataset._summaries(1))

    assert len(summaries) == 1

    assert summaries[0]['key'] in store.store
Example #15
def test_summaries_with_limit(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    summaries = dataset.summaries(spark_context, 1)

    assert len(summaries) == 1

    assert summaries[0]['key'] in store.store
Example #16
def test_records_print_output(spark_context, capsys):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, 100 + 1):
        key = 'dir{}/subdir{}/key{}'.format(*[i] * 3)
        value = 'value{}'.format(i)
        store.store[key] = value
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    dataset.records(spark_context, decode=lambda x: x)
    out, err = capsys.readouterr()
    assert out.rstrip() == "fetching 0.00066MB in 100 files..."
Example #17
def test_records_summaries(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    records = dataset.records(spark_context, decode=lambda x: x,
                              summaries=[{'key': 'dir1/subdir1/key1', 'size': len('value1')}])
    records = records.collect()

    assert records == ['value1']
Example #18
def test_records_print_output(spark_context, capsys):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, 100 + 1):
        key = 'dir{}/subdir{}/key{}'.format(*[i] * 3)
        value = 'value{}'.format(i)
        store.store[key] = value
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    dataset.records(spark_context, decode=lambda x: x)
    out, err = capsys.readouterr()
    assert out.rstrip() == "fetching 0.00066MB in 100 files..."
Example #19
def test_scan_no_clause():
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    key = 'dir1/dir2/key1'
    value = 'value1'
    store.store[key] = value

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    with futures.ProcessPoolExecutor(1) as executor:
        folders = dataset._scan(['dim1', 'subdir'], ['prefix'], {}, executor)
    assert list(folders) == ['prefix']
Example #20
def test_records_object(spark_context):
    expect = {"uid": 1}

    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['key'] = json.dumps(expect)

    ds = Dataset(bucket_name, None, store=store, max_concurrency=1)
    row = ds.records(spark_context, decode=decode).first()

    assert isinstance(row, dict)
    assert row == expect
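The decode helper used by the JSON-based examples (here and in the selection tests below) is not shown on this page. Judging from the identity-decode tests above, where each record comes back as the raw stored line, decode appears to receive a file-like object per key and should return an iterable of decoded records. A minimal hypothetical stand-in under that assumption:

import json

def decode(fileobj):
    # Hypothetical stand-in, not the repository's actual helper:
    # read the whole payload, parse it as one JSON document, and
    # yield it as a single record.
    payload = fileobj.read()
    if isinstance(payload, bytes):
        payload = payload.decode('utf-8')
    yield json.loads(payload)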
Example #21
def test_sanitized_dimensions(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir_1/subdir1/key1'] = 'value1'
    store.store['dir_1/subdir2/key2'] = 'value2'
    store.store['dir_2/subdir3/key3'] = 'value3'
    store.store['dir_3/subdir4/key4'] = 'value4'

    dataset = (Dataset(bucket_name, ['dim1', 'dim2'], store=store, max_concurrency=1)
               .where(dim1="dir-1"))

    summaries = dataset.summaries(spark_context)
    assert len(summaries) == 2
Example #22
def test_records_selection(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    key = 'dir1/subdir1/key1'
    value = '{"a": {"b": { "c": "value"}}}'
    store.store[key] = value
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store).select(field='a.b.c')
    records = dataset.records(spark_context, decode=decode)
    assert records.collect() == [{'field': 'value'}]

    # Check that concatenating `select`s works as expected
    records = dataset.select(field2='a.b').records(spark_context, decode=decode)
    assert records.collect() == [{'field': 'value', 'field2': {'c': 'value'}}]
Example #23
def test_records_selection(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    key = 'dir1/subdir1/key1'
    value = '{"a": {"b": { "c": "value"}}}'
    store.store[key] = value
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store).select(field='a.b.c')
    records = dataset.records(spark_context, decode=decode)
    assert records.collect() == [{'field': 'value'}]

    # Check that concatenating `select`s works as expected
    records = dataset.select(field2='a.b').records(spark_context, decode=decode)
    assert records.collect() == [{'field': 'value', 'field2': {'c': 'value'}}]
Example #24
def test_summaries(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)

    summaries = dataset.summaries(spark_context)
    assert len(summaries) == 2

    for item in summaries:
        assert item['key'] in store.store
        assert item['size'] == len(store.store[item['key']])
Example #25
def test_summaries():
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)

    summaries = dataset._summaries()
    assert len(list(summaries)) == 2

    for item in summaries:
        assert item['key'] in store.store
        assert item['size'] == len(store.store[item['key']])
Example #26
def test_summaries(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store, max_concurrency=1)

    summaries = dataset.summaries(spark_context)
    assert len(summaries) == 2

    for item in summaries:
        assert item['key'] in store.store
        assert item['size'] == len(store.store[item['key']])
Example #27
def test_prefix_slash(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['a/b/dir1/subdir1/key1'] = 'value1'
    store.store['a/b/dir2/subdir2/key2'] = 'value2'
    store.store['x/b/dir3/subdir3/key3'] = 'value3'
    store.store['a/c/dir4/subdir4/key4'] = 'value4'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store, prefix='a/b', max_concurrency=1)

    summaries = dataset.summaries(spark_context)
    assert len(summaries) == 2

    for item in summaries:
        assert item['key'] in store.store
        assert item['size'] == len(store.store[item['key']])

    # be sure "where" still works
    summaries_filtered = dataset.where(dim1='dir1').summaries(spark_context)
    assert len(summaries_filtered) == 1
    assert summaries_filtered[0]['key'] == 'a/b/dir1/subdir1/key1'
Example #28
def test_store(monkeypatch, data_dir):
    with open(os.path.join(data_dir, 'schema.json')) as s:
        schema = json.loads(s.read())
    dimensions = [f['field_name'] for f in schema['dimensions']]
    dataset = Dataset('test-bucket', dimensions, InMemoryStore('test-bucket'))

    @staticmethod
    def from_source(source_name):
        return dataset

    monkeypatch.setattr(Dataset, 'from_source', from_source)

    return dataset.store
Example #29
def test_records_many_groups(spark_context, monkeypatch):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, spark_context.defaultParallelism + 2):
        store.store['dir1/subdir1/key{}'.format(i)] = 'value{}'.format(i)
    # create one group per item
    monkeypatch.setattr(moztelemetry.dataset, '_group_by_size_greedy',
                        lambda x, _: [[y] for y in x])
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    records = dataset.records(spark_context, decode=lambda x: x)
    records = records.collect()

    assert records == ['value{}'.format(i) for i in range(1, spark_context.defaultParallelism + 2)]
Example #30
def test_records_sample(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, 100 + 1):
        key = 'dir{}/subdir{}/key{}'.format(*[i] * 3)
        value = 'value{}'.format(i)
        store.store[key] = value
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)

    records_1 = dataset.records(spark_context, decode=lambda x: x, sample=0.1, seed=None).collect()
    assert len(records_1) == 10

    records_2 = dataset.records(spark_context, decode=lambda x: x, sample=0.1, seed=None).collect()

    # The sampling seed is different, so we have two different samples.
    assert sorted(records_1) != sorted(records_2)

    records_1 = dataset.records(spark_context, decode=lambda x: x, sample=0.1).collect()
    records_2 = dataset.records(spark_context, decode=lambda x: x, sample=0.1).collect()

    # Same seed, same sample.
    assert sorted(records_1) == sorted(records_2)
Example #31
def test_sanitized_dimensions(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir_1/subdir1/key1'] = 'value1'
    store.store['dir_1/subdir2/key2'] = 'value2'
    store.store['dir_2/subdir3/key3'] = 'value3'
    store.store['dir_3/subdir4/key4'] = 'value4'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store).where(dim1="dir-1")

    summaries = dataset.summaries(spark_context)
    assert len(summaries) == 2
Example #32
def test_prefix_slash(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['a/b/dir1/subdir1/key1'] = 'value1'
    store.store['a/b/dir2/subdir2/key2'] = 'value2'
    store.store['x/b/dir3/subdir3/key3'] = 'value3'
    store.store['a/c/dir4/subdir4/key4'] = 'value4'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store, prefix='a/b')

    summaries = dataset.summaries(spark_context)
    assert len(summaries) == 2

    for item in summaries:
        assert item['key'] in store.store
        assert item['size'] == len(store.store[item['key']])

    # be sure "where" still works
    summaries_filtered = dataset.where(dim1='dir1').summaries(spark_context)
    assert len(summaries_filtered) == 1
    assert summaries_filtered[0]['key'] == 'a/b/dir1/subdir1/key1'