Example #1
def test_timespan_too_big(s3_file_from_metadata, random_metadata):
    url = 's3://foo/blapp'
    s3_file_from_metadata(url, random_metadata)
    random_metadata['start'] = 0
    random_metadata['end'] = (DatalakeRecord.MAXIMUM_BUCKET_SPAN + 1) * \
        DatalakeRecord.TIME_BUCKET_SIZE_IN_MS
    with pytest.raises(UnsupportedTimeRange):
        DatalakeRecord.list_from_metadata(url, random_metadata)
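The failure above comes from list_from_metadata rejecting metadata whose timespan covers more than MAXIMUM_BUCKET_SPAN time buckets. A minimal standalone sketch of that validation follows; the constant values and the exception message are hypothetical, and the real check lives inside DatalakeRecord and may differ:

# Standalone sketch of the span check test_timespan_too_big exercises.
# Constant values here are hypothetical, chosen only for illustration.
TIME_BUCKET_SIZE_IN_MS = 24 * 60 * 60 * 1000
MAXIMUM_BUCKET_SPAN = 30


class UnsupportedTimeRange(Exception):
    pass


def validate_timespan(start_ms, end_ms):
    num_buckets = (end_ms - start_ms) / TIME_BUCKET_SIZE_IN_MS
    if num_buckets > MAXIMUM_BUCKET_SPAN:
        raise UnsupportedTimeRange('%d-%d spans too many buckets'
                                   % (start_ms, end_ms))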
Example #2
 def maker(**kwargs):
     m = random_metadata()
     m.update(**kwargs)
     key = '/'.join([str(v) for v in kwargs.values()])
     url = 's3://datalake-test/' + key
     s3_file_from_metadata(url, m)
     return DatalakeRecord.list_from_metadata(url, m)
Example #3
def test_get_time_buckets_misaligned():
    # Regression test: a query can touch x buckets even when the timeframe
    # (end - start) spans fewer than x full buckets (e.g. late in B0 through
    # early in B2). Floor division keeps the millisecond timestamps integral.
    start = DatalakeRecord.TIME_BUCKET_SIZE_IN_MS * 4 // 5
    end = DatalakeRecord.TIME_BUCKET_SIZE_IN_MS * 11 // 5
    buckets = DatalakeRecord.get_time_buckets(start, end)
    assert buckets == [0, 1, 2]
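For that assertion to hold, get_time_buckets must floor both endpoints to bucket indices and return the inclusive range between them. A standalone sketch of that assumed behavior, using a hypothetical bucket size; the real implementation inside DatalakeRecord may differ:

# Assumed bucketing behavior; the bucket size is hypothetical.
TIME_BUCKET_SIZE_IN_MS = 1000


def get_time_buckets(start, end):
    first = int(start // TIME_BUCKET_SIZE_IN_MS)
    last = int(end // TIME_BUCKET_SIZE_IN_MS)
    return list(range(first, last + 1))


# 0.8 of a bucket through 2.2 buckets touches buckets 0, 1, and 2.
assert get_time_buckets(800, 2200) == [0, 1, 2]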
Example #4
def test_list_from_metadata(s3_file_from_metadata, random_metadata):
    url = 's3://foo/baz'
    s3_file_from_metadata(url, random_metadata)
    records = DatalakeRecord.list_from_metadata(url, random_metadata)
    assert len(records) >= 1
    for r in records:
        assert r['metadata'] == random_metadata
Example #5
def test_no_end_exclusion(table_maker, querier, s3_file_from_metadata):
    m = random_metadata()
    del m['end']
    url = 's3://datalake-test/' + m['id']
    s3_file_from_metadata(url, m)
    records = DatalakeRecord.list_from_metadata(url, m)
    table_maker(records)
    results = querier.query_by_time(m['start'] + 1, m['start'] + 2, m['what'])
    assert len(results) == 0
Example #6
def test_no_end(random_metadata, s3_file_from_metadata):
    url = 's3://foo/baz'
    del random_metadata['end']
    expected_metadata = random_metadata.copy()
    expected_metadata['end'] = None
    s3_file_from_metadata(url, random_metadata)
    records = DatalakeRecord.list_from_metadata(url, random_metadata)
    assert len(records) >= 1
    for r in records:
        assert r['metadata'] == expected_metadata
Example #7
def test_no_end(table_maker, querier, s3_file_from_metadata):
    m = generate_random_metadata()
    del m['end']
    url = 's3://datalake-test/' + m['id']
    s3_file_from_metadata(url, m)
    records = DatalakeRecord.list_from_metadata(url, m)
    table_maker(records)
    results = querier.query_by_time(m['start'], m['start'] + 1, m['what'])
    assert len(results) == 1
    assert results[0]['metadata']['end'] is None
Example #8
def test_record_size_and_create_time(s3_file_maker, random_metadata):
    url = 's3://foo/bar'
    now = int(time.time() * 1000.0)

    # s3 create times have a 1s resolution, so tolerate 2x that to keep the
    # test from flaking.
    max_tolerable_delta = 2000

    s3_file_maker('foo', 'bar', 'thissongisjust23byteslong', random_metadata)
    records = DatalakeRecord.list_from_url(url)
    assert len(records) >= 1
    for r in records:
        assert r['metadata'] == random_metadata
        assert abs(r['create_time'] - now) <= max_tolerable_delta
        assert r['size'] == 25
Example #9
    def query_by_time(self, start, end, what, where=None, cursor=None):
        results = []
        buckets = DatalakeRecord.get_time_buckets(start, end)

        if cursor:
            current_bucket = cursor.current_time_bucket
            i = buckets.index(current_bucket)
            buckets = buckets[i:]

        for b in buckets:
            cursor = self._query_time_bucket(b, results, start, end, what,
                                             where, cursor)

        if cursor and \
           cursor.current_time_bucket and \
           cursor.current_time_bucket > buckets[-1]:
            # this is a corner case. It means that the next query would take us
            # into the next bucket, but the next bucket is beyond the time of
            # interest. Just clear the cursor in this case.
            cursor = None

        return QueryResults(results, cursor)
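Since query_by_time returns a QueryResults whose cursor is None once the final bucket has been drained, a caller can page through a long time range by feeding the cursor back in. A hedged sketch of that loop, assuming QueryResults is iterable and exposes its cursor as a .cursor attribute (neither is confirmed by the snippet above):

# Hypothetical pagination loop; only query_by_time and its arguments are
# taken from the snippet above, the rest is assumed.
def fetch_all(querier, start, end, what, where=None):
    records = []
    cursor = None
    while True:
        results = querier.query_by_time(start, end, what, where=where,
                                        cursor=cursor)
        records.extend(results)
        cursor = results.cursor
        if cursor is None:
            return records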
Example #10
 def maker(content, metadata):
     path = metadata['id'] + '/data'
     s3_file_maker('datalake-test', path, content, metadata)
     url = 's3://datalake-test/' + path
     records = DatalakeRecord.list_from_metadata(url, metadata)
     table_maker(records)
Example #11
 def datalake_records(self):
     if self['eventName'] not in self.EVENTS_WITH_RECORDS:
         return []
     return list(DatalakeRecord.list_from_url(self.s3_url))
Example #12
def test_no_metadata(s3_file_maker):
    url = 's3://foo/bar'
    s3_file_maker('foo', 'bar', 'the content', None)
    with pytest.raises(InvalidDatalakeMetadata):
        DatalakeRecord.list_from_url(url)
Example #13
def test_no_such_bucket(s3_connection):
    url = 's3://no/such/file'
    with pytest.raises(NoSuchDatalakeFile):
        DatalakeRecord.list_from_url(url)
Example #14
def test_no_such_datalake_file_in_bucket(s3_bucket_maker):
    s3_bucket_maker('test-bucket')
    url = 's3://test-bucket/such/file'
    with pytest.raises(NoSuchDatalakeFile):
        DatalakeRecord.list_from_url(url)
Example #15
def test_from_url_fails_without_boto():
    with pytest.raises(InsufficientConfiguration):
        DatalakeRecord.list_from_url('s3://foo/bar')
Example #16
 def ingest(self, url):
     '''ingest the metadata associated with the given url'''
     records = DatalakeRecord.list_from_url(url)
     for r in records:
         self.storage.store(r)
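End to end, ingestion is just list_from_url plus one store call per record. A toy harness that drives the method above with an in-memory storage stand-in; the Ingester class name and the storage interface are assumptions, not the library's API:

# Toy storage backend satisfying the store() interface used above; purely
# illustrative, not part of the datalake codebase.
class InMemoryStorage:
    def __init__(self):
        self.records = []

    def store(self, record):
        self.records.append(record)


class Ingester(object):
    '''hypothetical owner of the ingest() method shown above'''
    def __init__(self, storage):
        self.storage = storage

    def ingest(self, url):
        records = DatalakeRecord.list_from_url(url)
        for r in records:
            self.storage.store(r)


storage = InMemoryStorage()
ingester = Ingester(storage)
ingester.ingest('s3://datalake-test/some/file')  # hypothetical URL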