def test_timespan_too_big(random_metadata):
    url = 's3://foo/blapp'
    random_metadata['start'] = 0
    random_metadata['end'] = (DatalakeRecord.MAXIMUM_BUCKET_SPAN + 1) * \
        DatalakeRecord.TIME_BUCKET_SIZE_IN_MS
    with pytest.raises(UnsupportedTimeRange):
        DatalakeRecord.list_from_metadata(url, random_metadata)

def test_timespan_too_big(s3_file_from_metadata, random_metadata):
    url = 's3://foo/blapp'
    s3_file_from_metadata(url, random_metadata)
    random_metadata['start'] = 0
    random_metadata['end'] = (DatalakeRecord.MAXIMUM_BUCKET_SPAN + 1) * \
        DatalakeRecord.TIME_BUCKET_SIZE_IN_MS
    with pytest.raises(UnsupportedTimeRange):
        DatalakeRecord.list_from_metadata(url, random_metadata)

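# A minimal sketch of the span check the two tests above exercise. The bucket
# size and maximum span values below are assumptions for illustration, not the
# library's constants, and the real validation inside DatalakeRecord may
# differ; it reuses the UnsupportedTimeRange exception the tests expect.
def _validate_bucket_span(start, end,
                          bucket_size_in_ms=300000,   # assumed bucket width
                          maximum_bucket_span=30):    # assumed maximum
    # Reject time ranges that would touch more buckets than the maximum.
    num_buckets = (end - start) / bucket_size_in_ms
    if num_buckets > maximum_bucket_span:
        raise UnsupportedTimeRange(
            'span {}-{} covers too many time buckets'.format(start, end))
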
def maker(**kwargs):
    m = random_metadata()
    m.update(**kwargs)
    key = '/'.join([str(v) for v in kwargs.values()])
    url = 's3://datalake-test/' + key
    s3_file_from_metadata(url, m)
    return DatalakeRecord.list_from_metadata(url, m)

def test_get_time_buckets_misaligned():
    # Test for regression on bug when querying over x buckets for a timeframe
    # (end - start) of < x buckets (i.e. end of B0 to start of B2)
    start = DatalakeRecord.TIME_BUCKET_SIZE_IN_MS * 4 / 5
    end = DatalakeRecord.TIME_BUCKET_SIZE_IN_MS * 11 / 5
    buckets = DatalakeRecord.get_time_buckets(start, end)
    assert buckets == [0, 1, 2]

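# The assertion above relies on buckets being integer indices of fixed-width
# time windows: a range that starts late in bucket 0 and ends early in bucket 2
# must still return all three buckets. A hypothetical sketch of that bucketing,
# assuming get_time_buckets simply enumerates every bucket index touched by the
# range; the constant and implementation here are illustrative, not the
# library's code.
TIME_BUCKET_SIZE_IN_MS = 24 * 60 * 60 * 1000  # assumed one-day buckets

def get_time_buckets(start, end):
    first = int(start // TIME_BUCKET_SIZE_IN_MS)
    last = int(end // TIME_BUCKET_SIZE_IN_MS)
    return list(range(first, last + 1))

# start at 4/5 of bucket 0, end at 11/5 (inside bucket 2): three buckets are
# touched even though end - start is less than two full bucket widths.
assert get_time_buckets(0.8 * TIME_BUCKET_SIZE_IN_MS,
                        2.2 * TIME_BUCKET_SIZE_IN_MS) == [0, 1, 2]
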
def test_list_from_s3_url(s3_file_from_metadata, random_metadata):
    url = 's3://foo/bar'
    s3_file_from_metadata(url, random_metadata)
    records = DatalakeRecord.list_from_url(url)
    assert len(records) >= 1
    for r in records:
        assert r['metadata'] == random_metadata

def test_list_from_metadata(s3_file_from_metadata, random_metadata):
    url = 's3://foo/baz'
    s3_file_from_metadata(url, random_metadata)
    records = DatalakeRecord.list_from_metadata(url, random_metadata)
    assert len(records) >= 1
    for r in records:
        assert r['metadata'] == random_metadata

def test_no_end(random_metadata):
    url = 's3://foo/baz'
    del random_metadata['end']
    records = DatalakeRecord.list_from_metadata(url, random_metadata)
    assert len(records) >= 1
    for r in records:
        assert r['metadata'] == random_metadata

def test_no_end_exclusion(table_maker, querier):
    m = random_metadata()
    del m['end']
    url = 's3://datalake-test/' + m['id']
    records = DatalakeRecord.list_from_metadata(url, m)
    table_maker(records)
    results = querier.query_by_time(m['start'] + 1, m['start'] + 2, m['what'])
    assert len(results) == 0

def test_no_end_exclusion(table_maker, querier, s3_file_from_metadata):
    m = random_metadata()
    del m['end']
    url = 's3://datalake-test/' + m['id']
    s3_file_from_metadata(url, m)
    records = DatalakeRecord.list_from_metadata(url, m)
    table_maker(records)
    results = querier.query_by_time(m['start'] + 1, m['start'] + 2, m['what'])
    assert len(results) == 0

def test_no_end(random_metadata, s3_file_from_metadata):
    url = 's3://foo/baz'
    del random_metadata['end']
    expected_metadata = random_metadata.copy()
    expected_metadata['end'] = None
    s3_file_from_metadata(url, random_metadata)
    records = DatalakeRecord.list_from_metadata(url, random_metadata)
    assert len(records) >= 1
    for r in records:
        assert r['metadata'] == expected_metadata

def query_by_time(self, start, end, what, where=None, cursor=None):
    results = []
    buckets = DatalakeRecord.get_time_buckets(start, end)
    if cursor:
        current_bucket = cursor['current_time_bucket']
        i = buckets.index(current_bucket)
        buckets = buckets[i:]
    for b in buckets:
        cursor = self._query_time_bucket(b, results, start, end, what,
                                         where, cursor)
    return QueryResults(results, cursor)

def test_record_size_and_create_time(s3_file_maker, random_metadata):
    url = 's3://foo/bar'
    now = int(time.time() * 1000.0)
    # s3 create times have a 1s resolution. So we just tolerate 2x that to
    # ensure the test passes reasonably.
    max_tolerable_delta = 2000
    s3_file_maker('foo', 'bar', 'thissongisjust23byteslong', random_metadata)
    records = DatalakeRecord.list_from_url(url)
    assert len(records) >= 1
    for r in records:
        assert r['metadata'] == random_metadata
        assert abs(r['create_time'] - now) <= max_tolerable_delta
        assert r['size'] == 25

def test_null_end(table_maker, querier):
    m = {
        "start": 1461023640000,
        "what": "file",
        "version": 0,
        "end": None,
        "work_id": None,
        "path": "/home/foo/file",
        "where": "somehost",
        "id": "fedcba09876543210",
        "hash": "0123456789abcdef",
    }
    url = 's3://datalake-test/' + m['id']
    records = DatalakeRecord.list_from_metadata(url, m)
    table_maker(records)
    results = querier.query_by_time(1461023630000, 1461023650000, 'file')
    assert len(results) == 1

def query_by_time(self, start, end, what, where=None, cursor=None):
    results = []
    buckets = DatalakeRecord.get_time_buckets(start, end)
    if cursor:
        current_bucket = cursor['current_time_bucket']
        i = buckets.index(current_bucket)
        buckets = buckets[i:]
    for b in buckets:
        cursor = self._query_time_bucket(b, results, start, end, what,
                                         where, cursor)
    if cursor and \
            cursor.current_time_bucket and \
            cursor.current_time_bucket > buckets[-1]:
        # this is a corner case. It means that the next query would take us
        # into the next bucket, but the next bucket is beyond the time of
        # interest. Just clear the cursor in this case.
        cursor = None
    return QueryResults(results, cursor)

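# Hypothetical usage sketch of the cursor-based pagination implemented above:
# keep re-issuing the query with the returned cursor until it is cleared. It
# assumes QueryResults behaves like a list of records and exposes the cursor it
# was constructed with as a `.cursor` attribute; that attribute name and the
# helper below are illustrative assumptions, not the project's API.
def fetch_all_by_time(querier, start, end, what):
    records = []
    cursor = None
    while True:
        page = querier.query_by_time(start, end, what, cursor=cursor)
        records.extend(page)
        cursor = page.cursor
        if cursor is None:
            # query_by_time clears the cursor once the last relevant time
            # bucket has been exhausted, so we are done.
            return records
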
def ingest(self, url):
    '''ingest the metadata associated with the given url'''
    records = DatalakeRecord.list_from_url(url)
    for r in records:
        self.storage.store(r)

def datalake_records(self):
    if self['eventName'] not in self.EVENTS_WITH_RECORDS:
        return []
    return [dlr for dlr in DatalakeRecord.list_from_url(self.s3_url)]

def test_no_such_datalake_file_in_bucket(s3_bucket_maker):
    s3_bucket_maker('test-bucket')
    url = 's3://test-bucket/such/file'
    with pytest.raises(NoSuchDatalakeFile):
        DatalakeRecord.list_from_url(url)

def test_no_such_bucket(s3_connection):
    url = 's3://no/such/file'
    with pytest.raises(NoSuchDatalakeFile):
        DatalakeRecord.list_from_url(url)

def test_from_url_fails_without_boto():
    with pytest.raises(InsufficientConfiguration):
        DatalakeRecord.list_from_url('s3://foo/bar')

def test_no_metadata(s3_file_maker):
    url = 's3://foo/bar'
    s3_file_maker('foo', 'bar', 'the content', None)
    with pytest.raises(InvalidDatalakeMetadata):
        DatalakeRecord.list_from_url(url)

def maker(content, metadata):
    path = metadata['id'] + '/data'
    s3_file_maker('datalake-test', path, content, metadata)
    url = 's3://datalake-test/' + path
    records = DatalakeRecord.list_from_metadata(url, metadata)
    table_maker(records)

def create_test_records(bucket='datalake-test', **kwargs):
    m = random_metadata()
    m.update(**kwargs)
    url = 's3://' + bucket + '/' + '/'.join([str(v) for v in kwargs.values()])
    return DatalakeRecord.list_from_metadata(url, m)