import json
import os
from concurrent import futures

import moztelemetry.dataset
from moztelemetry.dataset import Dataset
# InMemoryStore is assumed to live in moztelemetry.store, as in the upstream
# python_moztelemetry test suite.
from moztelemetry.store import InMemoryStore


def test_records_sample(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, 100 + 1):
        key = 'dir{}/subdir{}/key{}'.format(*[i] * 3)
        value = 'value{}'.format(i)
        store.store[key] = value

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)

    records_1 = dataset.records(spark_context, decode=lambda x: x, sample=0.1, seed=None).collect()
    assert len(records_1) == 10

    records_2 = dataset.records(spark_context, decode=lambda x: x, sample=0.1, seed=None).collect()
    # The sampling seed is different, so we have two different samples.
    assert sorted(records_1) != sorted(records_2)

    records_1 = dataset.records(spark_context, decode=lambda x: x, sample=0.1).collect()
    records_2 = dataset.records(spark_context, decode=lambda x: x, sample=0.1).collect()
    # Same seed, same sample.
    assert sorted(records_1) == sorted(records_2)

def dataset():
    bucket_name = 'test_bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = json.dumps({'foo': 1})
    store.store['dir2/subdir2/key2'] = json.dumps({'foo': 2})
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    return dataset

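# Several tests below (test_records_object, test_records_selection) rely on a
# module-level `decode` helper that is not shown in this file. A minimal
# sketch, assuming stored values arrive as UTF-8 encoded JSON (str or bytes):
def decode(obj):
    if isinstance(obj, bytes):
        obj = obj.decode('utf-8')
    return json.loads(obj)
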
def test_scan_multiple_where_params(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir1/another-dir/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store).where(dim1='dir1', dim2='subdir1')
    summaries = dataset.summaries(spark_context)

    expected_key = 'dir1/subdir1/key1'
    assert summaries == [{'key': expected_key, 'size': len(store.store[expected_key])}]

def test_records_limit_and_sample(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, 100 + 1):
        key = 'dir{}/subdir{}/key{}'.format(*[i] * 3)
        value = 'value{}'.format(i)
        store.store[key] = value

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    records = dataset.records(spark_context, decode=lambda x: x, limit=5, sample=0.9)

    assert records.count() == 5

def test_records(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    records = dataset.records(spark_context, decode=lambda x: x)
    records = sorted(records.collect())

    assert records == [b'value1', b'value2']

def test_records_limit(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, 100 + 1):
        key = 'dir{}/subdir{}/key{}'.format(*[i] * 3)
        value = 'value{}'.format(i)
        store.store[key] = value

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    records = dataset.records(spark_context, decode=lambda x: x, limit=5)

    assert records.count() == 5

def test_scan_with_prefix():
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['prefix1/dir1/subdir1/key1'] = 'value1'
    store.store['prefix2/dir2/another-dir/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'],
                      clauses={'dim1': lambda x: x == 'dir1'}, store=store)
    with futures.ProcessPoolExecutor(1) as executor:
        folders = dataset._scan(['dim1', 'dim2'], ['prefix1/'], dataset.clauses, executor)
    assert list(folders) == ['prefix1/dir1/']

def test_records_summaries(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store, max_concurrency=1)
    records = dataset.records(spark_context, decode=lambda x: x,
                              summaries=[{'key': 'dir1/subdir1/key1', 'size': len('value1')}])
    records = records.collect()

    assert records == [b'value1']

def test_summaries_with_limit(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store, max_concurrency=1)
    summaries = dataset.summaries(spark_context, 1)

    assert len(summaries) == 1
    assert summaries[0]['key'] in store.store

def test_scan_no_clause():
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    key = 'dir1/dir2/key1'
    value = 'value1'
    store.store[key] = value

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    with futures.ProcessPoolExecutor(1) as executor:
        folders = dataset._scan(['dim1', 'subdir'], ['prefix'], {}, executor)
    assert list(folders) == ['prefix']

def test_scan_with_clause():
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/another-dir/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'],
                      clauses={'dim1': lambda x: x == 'dir1'}, store=store)
    with futures.ProcessPoolExecutor(1) as executor:
        folders = dataset._scan(['dim1', 'dim2'], [''], dataset.clauses, executor)
    assert list(folders) == ['dir1/']

def test_summaries_with_limit_internal():
    # Same scenario as test_summaries_with_limit above, but exercises the
    # private _summaries helper directly, without a Spark context. Renamed so
    # it does not shadow the test above.
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    summaries = list(dataset._summaries(1))

    assert len(summaries) == 1
    assert summaries[0]['key'] in store.store

def test_records_print_output(spark_context, capsys):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, 100 + 1):
        key = 'dir{}/subdir{}/key{}'.format(*[i] * 3)
        value = 'value{}'.format(i)
        store.store[key] = value

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    dataset.records(spark_context, decode=lambda x: x)

    out, err = capsys.readouterr()
    # 'value1' .. 'value100' add up to 692 bytes, i.e. 692 / 2**20 ≈ 0.00066 MB.
    assert out.rstrip() == "fetching 0.00066MB in 100 files..."

def test_records_object(spark_context):
    expect = {"uid": 1}
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['key'] = json.dumps(expect)

    ds = Dataset(bucket_name, None, store=store, max_concurrency=1)
    row = ds.records(spark_context, decode=decode).first()

    assert isinstance(row, dict)
    assert row == expect

def test_sanitized_dimensions(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir_1/subdir1/key1'] = 'value1'
    store.store['dir_1/subdir2/key2'] = 'value2'
    store.store['dir_2/subdir3/key3'] = 'value3'
    store.store['dir_3/subdir4/key4'] = 'value4'

    dataset = (Dataset(bucket_name, ['dim1', 'dim2'], store=store, max_concurrency=1)
               .where(dim1="dir-1"))
    summaries = dataset.summaries(spark_context)

    assert len(summaries) == 2

def test_records_selection(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    key = 'dir1/subdir1/key1'
    value = '{"a": {"b": { "c": "value"}}}'
    store.store[key] = value

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store).select(field='a.b.c')
    records = dataset.records(spark_context, decode=decode)
    assert records.collect() == [{'field': 'value'}]

    # Check that concatenating `select`s works as expected
    records = dataset.select(field2='a.b').records(spark_context, decode=decode)
    assert records.collect() == [{'field': 'value', 'field2': {'c': 'value'}}]

def test_summaries_internal():
    # Same scenario as test_summaries below, but exercises the private
    # _summaries helper directly, without a Spark context. Renamed so it does
    # not collide with the test below.
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    # Materialize once: re-iterating a generator after len(list(...)) would
    # silently skip the per-item assertions.
    summaries = list(dataset._summaries())

    assert len(summaries) == 2
    for item in summaries:
        assert item['key'] in store.store
        assert item['size'] == len(store.store[item['key']])

def test_summaries(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store, max_concurrency=1)
    summaries = dataset.summaries(spark_context)

    assert len(summaries) == 2
    for item in summaries:
        assert item['key'] in store.store
        assert item['size'] == len(store.store[item['key']])

def test_prefix_slash(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['a/b/dir1/subdir1/key1'] = 'value1'
    store.store['a/b/dir2/subdir2/key2'] = 'value2'
    store.store['x/b/dir3/subdir3/key3'] = 'value3'
    store.store['a/c/dir4/subdir4/key4'] = 'value4'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store, prefix='a/b',
                      max_concurrency=1)
    summaries = dataset.summaries(spark_context)

    assert len(summaries) == 2
    for item in summaries:
        assert item['key'] in store.store
        assert item['size'] == len(store.store[item['key']])

    # be sure "where" still works
    summaries_filtered = dataset.where(dim1='dir1').summaries(spark_context)
    assert len(summaries_filtered) == 1
    assert summaries_filtered[0]['key'] == 'a/b/dir1/subdir1/key1'

def test_store(monkeypatch, data_dir):
    with open(os.path.join(data_dir, 'schema.json')) as s:
        schema = json.loads(s.read())
    dimensions = [f['field_name'] for f in schema['dimensions']]
    dataset = Dataset('test-bucket', dimensions, InMemoryStore('test-bucket'))

    @staticmethod
    def from_source(source_name):
        return dataset

    monkeypatch.setattr(Dataset, 'from_source', from_source)

    return dataset.store

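# For reference, a minimal schema.json consistent with how test_store reads it
# above (only the fields it accesses are assumed; the real fixture file may
# carry additional metadata per dimension):
#
#     {
#         "dimensions": [
#             {"field_name": "dim1"},
#             {"field_name": "dim2"}
#         ]
#     }
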
def test_records_many_groups(spark_context, monkeypatch):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, spark_context.defaultParallelism + 2):
        store.store['dir1/subdir1/key{}'.format(i)] = 'value{}'.format(i)

    # create one group per item
    monkeypatch.setattr(moztelemetry.dataset, '_group_by_size_greedy',
                        lambda x, _: [[y] for y in x])

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    records = dataset.records(spark_context, decode=lambda x: x)
    records = records.collect()

    assert records == ['value{}'.format(i)
                       for i in range(1, spark_context.defaultParallelism + 2)]
