def test_unaligned_multibucket_queries(table_maker, querier):
    """Query a window spanning 3 buckets and get only the middle 3 records.

    Creates 5 records spread over 3 time buckets at quarter-bucket offsets,
    then queries the sub-window that covers records 2-4.

    Uses floor division (``//``) so the computed millisecond timestamps stay
    ints under Python 3; true division (``/``) would produce floats.
    """
    bucket = DatalakeRecord.TIME_BUCKET_SIZE_IN_MS
    records = []
    # Create 5 records spanning 3 buckets, of which we want the middle 3
    for quarters in (1, 3, 6, 9, 11):
        t = YEAR_2010 + bucket * quarters // 4
        records += create_test_records(start=t, end=t + 1, what='foo')
    table_maker(records)
    start = YEAR_2010 + bucket * 3 // 4
    end = YEAR_2010 + bucket * 9 // 4
    results = get_page(querier.query_by_time, [start, end, 'foo'])
    evaluate_time_based_results(results, 3)
def test_latest_happened_yesterday(table_maker, querier):
    """A record that landed a day ago is still found by query_latest."""
    start = int(time.time() * 1000) - _ONE_DAY_MS
    table_maker(create_test_records(start=start, end=None, what='tower',
                                    where='pisa'))
    result = querier.query_latest('tower', 'pisa')
    _validate_latest_result(result, what='tower', where='pisa')
def test_deduplicating_work_id_records(table_maker, querier):
    """A multi-bucket record appears only once in a work-id query."""
    two_buckets = 2 * DatalakeRecord.TIME_BUCKET_SIZE_IN_MS
    table_maker(create_test_records(start=YEAR_2010,
                                    end=YEAR_2010 + two_buckets,
                                    what='foo', work_id='job0'))
    results = querier.query_by_work_id('job0', 'foo')
    assert len(results) == 1
def test_paginate_work_id_records(table_maker, querier):
    """All 150 records sharing a work id come back across multiple pages."""
    records = []
    for _ in range(150):
        records += create_test_records(what='foo', work_id='job0',
                                       start=1456833600000,
                                       end=1456837200000)
    table_maker(records)
    results = get_multiple_pages(querier.query_by_work_id, ['job0', 'foo'])
    assert len(results) == 150
def test_query_by_time(table_maker, querier):
    """Only the one record overlapping the queried window is returned."""
    records = []
    for offset in range(0, 100, 10):
        first = YEAR_2010 + offset
        records += create_test_records(start=first, end=first + 9, what='foo')
    table_maker(records)
    results = querier.query_by_time(YEAR_2010, YEAR_2010 + 9, 'foo')
    assert len(results) == 1
    assert all_results_between(results, YEAR_2010, YEAR_2010 + 9)
def test_query_by_work_id(table_maker, querier):
    """Querying by work id returns only records carrying that id."""
    records = []
    for work_id in ('work0', 'work1'):
        records += create_test_records(work_id=work_id, what='foo')
    table_maker(records)
    results = querier.query_by_work_id('work0', 'foo')
    assert len(results) == 1
    assert all_results(results, work_id='work0')
def test_deduplicating_time_records(table_maker, querier):
    """A record that definitively spans two time buckets is returned only
    once when a time query covers all of its buckets."""
    two_buckets = 2 * DatalakeRecord.TIME_BUCKET_SIZE_IN_MS
    begin = YEAR_2010
    finish = begin + two_buckets
    table_maker(create_test_records(start=begin, end=finish, what='foo'))
    results = querier.query_by_time(begin, finish + two_buckets, 'foo')
    assert len(results) == 1
def test_paginate_many_records_single_time_bucket(table_maker, querier):
    """150 records packed into one time bucket paginate correctly.

    Floor division (``//``) keeps ``interval`` an int: under Python 3 ``/``
    yields a float and ``range()`` raises TypeError on a float step.
    """
    interval = DatalakeRecord.TIME_BUCKET_SIZE_IN_MS // 150
    very_end = YEAR_2010 + DatalakeRecord.TIME_BUCKET_SIZE_IN_MS
    records = []
    for start in range(YEAR_2010, very_end, interval):
        records += create_test_records(start=start, end=start + interval,
                                       what='foo')
    table_maker(records)
    results = get_multiple_pages(
        querier.query_by_time, [YEAR_2010, very_end, 'foo'])
    evaluate_time_based_results(results, 150)
def test_paginate_few_records_single_bucket_no_empty_page(table_maker,
                                                          querier):
    """A narrow query over a crowded bucket yields exactly one record.

    Fill one bucket with 2x MAX_RESULTS records, then query a window that
    covers only the last one. Floor division (``//``) keeps ``interval``
    an int so it is a valid ``range()`` step under Python 3.
    """
    interval = DatalakeRecord.TIME_BUCKET_SIZE_IN_MS // MAX_RESULTS // 2
    very_end = YEAR_2010 + DatalakeRecord.TIME_BUCKET_SIZE_IN_MS
    records = []
    for start in range(YEAR_2010, very_end, interval):
        records += create_test_records(start=start, end=start + interval,
                                       what='foo')
    table_maker(records)
    results = get_page(querier.query_by_time,
                       [very_end - interval + 1, very_end, 'foo'])
    evaluate_time_based_results(results, 1)
def test_query_by_time_with_where(table_maker, querier):
    """The optional ``where`` argument filters time-based queries."""
    records = []
    for n in range(4):
        records += create_test_records(start=YEAR_2010, end=YEAR_2010 + 10,
                                       what='foo',
                                       where='worker{}'.format(n))
    table_maker(records)
    results = querier.query_by_time(YEAR_2010, YEAR_2010 + 10, 'foo',
                                    where='worker2')
    assert len(results) == 1
    assert all_results(results, start=YEAR_2010, end=YEAR_2010 + 10,
                       where='worker2')
    assert all_results_between(results, YEAR_2010, YEAR_2010 + 10)
def test_latest_many_records_single_time_bucket(table_maker, querier):
    """The newest of 150 records in the current bucket wins query_latest.

    Floor division (``//``) keeps the bucket index and ``interval`` ints
    under Python 3; ``/`` would produce floats, breaking the bucket-start
    arithmetic and making ``range()`` raise TypeError.
    """
    now = int(time.time() * 1000)
    bucket_size = DatalakeRecord.TIME_BUCKET_SIZE_IN_MS
    # Align to the start of the bucket containing "now".
    start = (now // bucket_size) * bucket_size
    interval = bucket_size // 150
    very_end = start + bucket_size
    last_start = very_end - interval
    records = []
    for t in range(start, very_end, interval):
        records += create_test_records(start=t, end=t + interval,
                                       what='meow', where='tree')
    table_maker(records)
    result = querier.query_latest('meow', 'tree')
    _validate_latest_result(result, what='meow', where='tree',
                            start=last_start)
def maker(content, metadata):
    """Put the file content at ``<id>/data`` in the test bucket and register
    its records in the table."""
    s3_file_maker('datalake-test', metadata['id'] + '/data', content,
                  metadata)
    table_maker(create_test_records(**metadata))
def test_latest_happened_today(table_maker, querier):
    """A record created right now is returned by query_latest."""
    now = int(time.time() * 1000)
    table_maker(create_test_records(start=now, end=None, what='foo',
                                    where='boo'))
    result = querier.query_latest('foo', 'boo')
    _validate_latest_result(result, what='foo', where='boo')