def test_empty_stream(self):
    """Group the ``IncrementalKeyValueIterator(0, 1, 0)`` stream.

    Expects zero spills and a result iterator that reports no next element.
    """
    statement = GroupByStatement(
        max_num_files=10,
        max_hashmap_entries=1000,
        request_id="test_empty_stream",
    )
    # NOTE(review): the leading 0 is presumably the number of records, i.e.
    # an empty stream -- confirm against IncrementalKeyValueIterator.
    grouped = statement.groupBy(IncrementalKeyValueIterator(0, 1, 0))

    self.assertEqual(statement.spills, 0)
    self.assertEqual(grouped.hasNext(), False)
def test_stream_spills_on_disk(self):
    """Group a stream with ``max_hashmap_entries=300`` and expect 4 spills.

    The grouped result is then handed to ``compare_outputs`` together with
    a deep copy of the input taken before grouping.
    """
    statement = GroupByStatement(
        max_num_files=4,
        max_hashmap_entries=300,
        request_id="test_stream_spills_on_disk",
    )
    source = IncrementalKeyValueIterator(1000, 10, 7)
    # Snapshot the input before groupBy sees it (presumably because groupBy
    # consumes the iterator -- confirm).
    reference = copy.deepcopy(source)

    grouped = statement.groupBy(source)

    self.assertEqual(statement.spills, 4)
    self.compare_outputs(reference, grouped)
def test_low_memory(self):
    """Group a stream under ``max_memory=1024`` and check it still works.

    Expects at least one spill, at least one merge stage, and no more than
    1000 files, then hands the grouped result to ``compare_outputs``
    together with a deep copy of the input taken before grouping.
    """
    statement = GroupByStatement(max_memory=1024, request_id="test_low_memory")
    source = IncrementalKeyValueIterator(1000, 10, 7)
    # Snapshot the input before groupBy sees it (presumably because groupBy
    # consumes the iterator -- confirm).
    reference = copy.deepcopy(source)

    grouped = statement.groupBy(source)

    # Dedicated comparison assertions report both operands on failure,
    # unlike assertTrue(a > b), which only says "False is not true".
    self.assertGreater(statement.spills, 0)
    self.assertGreater(statement.num_merge_stages, 0)
    self.assertLessEqual(statement._num_files, 1000)
    self.compare_outputs(reference, grouped)
def test_large_stream(self):
    """Group the ``IncrementalKeyValueIterator(200000, 10, 7, 3, 2)`` stream.

    With ``max_hashmap_entries=10000`` and ``max_num_files=100`` the test
    expects exactly 20 spills and 20 files, then hands the grouped result
    to ``compare_outputs`` together with a deep copy of the input.
    """
    statement = GroupByStatement(
        max_num_files=100,
        max_hashmap_entries=10000,
        request_id="test_large_stream",
    )
    source = IncrementalKeyValueIterator(200000, 10, 7, 3, 2)
    # Snapshot the input before groupBy sees it (presumably because groupBy
    # consumes the iterator -- confirm).
    reference = copy.deepcopy(source)

    grouped = statement.groupBy(source)

    self.assertEqual(statement.spills, 20)
    self.assertEqual(statement._num_files, 20)
    self.compare_outputs(reference, grouped)
def test_stream_spills_on_disk_and_file_merges_required(self):
    """Group a stream with ``max_num_files=2`` and ``max_hashmap_entries=100``.

    Expects 10 spills, 3 merge stages and a final file count of 2, then
    hands the grouped result to ``compare_outputs`` together with a deep
    copy of the input taken before grouping.
    """
    statement = GroupByStatement(
        max_num_files=2,
        max_hashmap_entries=100,
        request_id="test_stream_spills_on_disk_and_file_merges_required",
    )
    source = IncrementalKeyValueIterator(1000, 10, 7)
    # Snapshot the input before groupBy sees it (presumably because groupBy
    # consumes the iterator -- confirm).
    reference = copy.deepcopy(source)

    grouped = statement.groupBy(source)

    self.assertEqual(statement.spills, 10)
    self.assertEqual(statement.num_merge_stages, 3)
    self.assertEqual(statement._num_files, 2)
    self.compare_outputs(reference, grouped)
def test_consecutive_calls(self):
    """Call ``groupBy`` ten times on one ``GroupByStatement`` instance.

    All calls are issued before any result is consumed. Each result
    iterator is then exhausted in call order, after which no directory
    named after that call's request id may exist.
    """
    num_calls = 10
    statement = GroupByStatement(max_num_files=2, max_hashmap_entries=1)

    # Pair every result iterator with the request id the statement holds
    # right after the call that produced it.
    # NOTE(review): no request_id is passed to the constructor, so the id is
    # presumably generated per groupBy call -- confirm against GroupByStatement.
    pending = []
    for _ in range(num_calls):
        grouped = statement.groupBy(IncrementalKeyValueIterator(10, 3, 3))
        pending.append((statement._request_id, grouped))

    for request_id, grouped in pending:
        # Exhaust iterator
        for _key, _value in grouped:
            pass
        # NOTE(review): presumably the request id doubles as the name of a
        # working directory that is removed once the iterator is drained.
        self.assertFalse(os.path.isdir(request_id))