def test_basic_crossjoin_no_conflicts(self):
    """Cross join two relations whose column names do not overlap.

    Asserts that the first output tuple exposes exactly the columns of the
    first left record plus the first right record, and that the number of
    output tuples equals ``len(left) * len(right)``.
    """
    left_data = [
        {'id': 1, 'property_1': 'hello'},
        {'id': 2, 'property_1': 'world'},
    ]
    right_data = [
        {'name': 1, 'property_2': 'bye'},
        {'name': 2, 'property_2': 'moon'},
    ]

    # Materialize the join output once so it can be indexed and counted.
    joined = list(_op.CrossJoin(
        _op.JSONScan(object_payload=left_data),
        _op.JSONScan(object_payload=right_data),
    ))

    expected_columns = sorted(list(left_data[0]) + list(right_data[0]))
    actual_columns = sorted(joined[0].keys())
    self.assertSequenceEqual(
        expected_columns,
        actual_columns,
        'expected cross join tuple to contain all columns from both left and right tuples')

    expected_count = len(left_data) * len(right_data)
    self.assertEqual(
        len(joined),
        expected_count,
        'expected cross join to contain left * right number of tuples')
def test_near_matches(self):
    """Run a nested-loops similarity join over species names.

    The left input holds eight ``species`` strings (exact names, case
    variants, synonyms and misspellings); the right input holds two records
    with ``name`` and ``synonyms``. Asserts that the first output tuple has
    the columns of both inputs and that one tuple is produced per left
    record.
    """
    species_values = [
        'Mus musculus',
        'Danio rerio',
        'mus musculus',
        'danio rerio',
        'Zebrafish',
        'Mouse',
        'Zbrafish',
        'Muse',
    ]
    # Ids are assigned 1..8 in the order listed above.
    left_data = [
        {'id': ident, 'species': species}
        for ident, species in enumerate(species_values, 1)
    ]
    right_data = [
        {'ID': 1, 'name': 'Mus musculus', 'synonyms': ['Mouse']},
        {'ID': 2, 'name': 'Danio rerio', 'synonyms': ['Zebrafish']},
    ]

    left_scan = _op.JSONScan(object_payload=left_data)
    right_scan = _op.JSONScan(object_payload=right_data)
    similarity = _opt.Similar(
        'species', 'name', 'synonyms', _util.edit_distance_fn, None)
    matches = list(
        _op.NestedLoopsSimilarityJoin(left_scan, right_scan, similarity))
    logger.debug(matches)

    expected_columns = sorted(list(left_data[0]) + list(right_data[0]))
    self.assertSequenceEqual(
        expected_columns,
        sorted(matches[0].keys()),
        'expected join to contain all columns from both left and right tuples')
    self.assertEqual(
        len(matches),
        len(left_data),
        "expected join to contain left's number of tuples")
def test_basic_crossjoin_w_conflicts(self):
    """Cross join two relations that share the column name ``foo``.

    Asserts that the first output tuple has as many columns as both inputs
    combined, that the output has ``len(left) * len(right)`` tuples, that
    every non-shared column is present under its own name, and that exactly
    two output columns have names ending in ``foo``.
    """
    conflict = 'foo'
    left_data = [
        {'id': 1, 'foo': 'a', 'property_1': 'hello'},
        {'id': 2, 'foo': 'b', 'property_1': 'world'},
    ]
    right_data = [
        {'name': 1, 'foo': 'x', 'property_2': 'bye'},
        {'name': 2, 'foo': 'y', 'property_2': 'moon'},
    ]

    joined = list(_op.CrossJoin(
        _op.JSONScan(object_payload=left_data),
        _op.JSONScan(object_payload=right_data),
    ))
    first = joined[0]

    self.assertEqual(
        len(first.keys()),
        len(left_data[0].keys()) + len(right_data[0].keys()),
        'expected the cross join columns to be as many of sum of left and right columns')
    self.assertEqual(
        len(joined),
        len(left_data) * len(right_data),
        'expected cross join to contain left * right number of tuples')

    # Left columns are checked before right columns, skipping the shared one.
    for source_record in (left_data[0], right_data[0]):
        for column in source_record:
            if column == conflict:
                continue
            self.assertIn(
                column,
                first,
                'expected non-conflicting column "%s" in cross join tuple' % column)

    conflict_variants = sum(1 for column in first if column.endswith(conflict))
    self.assertEqual(
        conflict_variants,
        2,
        'expected two variants of conflicting column "%s" in cross join tuple' % conflict)
def test_grouping_single_attr_no_nesting_w_distinct(self):
    """Group on ``name`` with a HashDistinct placed below the aggregation.

    ``self.data`` is replaced by ``generate_duplicate_data(self.data, 100)``
    before the plan is built; the aggregation is expected to yield exactly
    two tuples.
    """
    grouping_key = ('name', )
    self.data = self.generate_duplicate_data(self.data, 100)

    scan = _op.JSONScan(object_payload=self.data)
    # Distinct on the grouping key is injected ahead of the aggregation.
    deduplicated = _op.HashDistinct(scan, ('name', ))
    aggregation = _op.NestedLoopsSimilarityAggregation(
        deduplicated, grouping_key, (), _util.edit_distance_fn, None)

    groups = list(aggregation)
    self.assertEqual(len(groups), 2, "expected 2 groups/tuples")
def test_grouping_single_attr_no_nesting(self):
    """Group ``self.data`` on ``name`` with no nested attributes.

    Logs the output tuples and the operator description at debug level and
    expects exactly two tuples.
    """
    scan = _op.JSONScan(object_payload=self.data)
    aggregation = _op.NestedLoopsSimilarityAggregation(
        scan, ('name', ), (), _util.edit_distance_fn, None)

    groups = list(aggregation)
    logger.debug(groups)
    logger.debug(aggregation.description)

    self.assertEqual(len(groups), 2, "expected 2 groups/tuples")
def test_grouping_and_nesting_single_attrs(self):
    """Group on ``name`` while nesting the ``synonyms`` attribute.

    Each record in ``self.data`` first gets a ``synonyms`` entry copied from
    its ``name`` (the records are modified in place). Expects two output
    tuples whose ``synonyms`` lengths add up to ``len(self.data)``.
    """
    # Give every raw record a synonyms value so there is something to nest.
    for record in self.data:
        record['synonyms'] = record['name']

    scan = _op.JSONScan(object_payload=self.data)
    aggregation = _op.NestedLoopsSimilarityAggregation(
        scan, ('name', ), ('synonyms', ), _util.edit_distance_fn, None)

    groups = list(aggregation)
    logger.debug(groups)
    logger.debug(aggregation.description)

    self.assertEqual(len(groups), 2, "expected 2 groups/tuples")
    input_count = len(self.data)
    nested_count = sum(len(group['synonyms']) for group in groups)
    self.assertEqual(
        input_count, nested_count, "expected all synonyms to be nested")
def test_grouping_and_nesting_single_attrs_w_distinct(self):
    """Group on ``name`` and nest ``synonyms`` with a HashDistinct below.

    Each record gets a ``synonyms`` entry copied from its ``name``, then
    ``self.data`` is replaced by ``generate_duplicate_data(self.data, 100)``.
    Expects two output tuples whose ``synonyms`` lengths add up to
    ``len(self.data) / 100``.
    """
    # Test data: add synonyms in place, then swap in the generated data set.
    for record in self.data:
        record['synonyms'] = record['name']
    multiplier = 100
    self.data = self.generate_duplicate_data(self.data, multiplier)

    # Physical plan: scan, distinct on (name, synonyms), then aggregation.
    scan = _op.JSONScan(object_payload=self.data)
    deduplicated = _op.HashDistinct(scan, ('name', 'synonyms'))
    aggregation = _op.NestedLoopsSimilarityAggregation(
        deduplicated, ('name', ), ('synonyms', ), _util.edit_distance_fn, None)
    groups = list(aggregation)

    # Assertions.
    self.assertEqual(len(groups), 2, "expected 2 groups/tuples")
    # True division is intentional: a non-integral quotient must not match.
    expected_nested = len(self.data) / multiplier
    actual_nested = sum(len(group['synonyms']) for group in groups)
    self.assertEqual(
        expected_nested, actual_nested, "expected all synonyms to be nested")
class TestSelect(unittest.TestCase):
    """Tests for the Select operator over a JSONScan of helper test data."""

    # Shared by all tests in this class; created once at class definition.
    _test_helper = TestHelper()
    _child = _op.JSONScan(object_payload=_test_helper.test_data)

    def _select(self, predicate):
        """Build a Select over the shared child and verify its description.

        Asserts that the Select's description equals the child's description
        and returns the Select operator for further checks.
        """
        selection = _op.Select(self._child, predicate)
        self.assertDictEqual(
            self._child.description,
            selection.description,
            "table definition should match source")
        return selection

    def test_select_eq_on_field_0(self):
        """Equality with 0 on the first helper field yields exactly one row."""
        helper = self._test_helper
        predicate = _opt.Comparison(helper.FIELDS[0], 'eq', 0)
        selection = self._select(predicate)
        self.assertEqual(
            1, count(selection),
            'incorrect number of rows returned by operator')

    def test_select_eq_on_field_1(self):
        """Equality on the second field with row 0's value yields >1 rows."""
        helper = self._test_helper
        field = helper.FIELDS[1]
        predicate = _opt.Comparison(field, 'eq', helper.test_data[0][field])
        selection = self._select(predicate)
        self.assertLess(
            1, count(selection),
            'incorrect number of rows returned by operator')

    def test_select_conjunction(self):
        """Conjunction of the two equality comparisons yields one row."""
        helper = self._test_helper
        on_first_field = _opt.Comparison(helper.FIELDS[0], 'eq', 0)
        second_field = helper.FIELDS[1]
        on_second_field = _opt.Comparison(
            second_field, 'eq', helper.test_data[0][second_field])
        predicate = _opt.Conjunction([on_first_field, on_second_field])
        selection = self._select(predicate)
        self.assertEqual(
            1, count(selection),
            'incorrect number of rows returned by operator')

    def test_select_disjunction(self):
        """``lt`` OR ``gt`` around a midpoint value yields all rows but one."""
        helper = self._test_helper
        assert helper.num_test_rows > 2
        midpoint = int(helper.num_test_rows / 2)
        predicate = _opt.Disjunction([
            _opt.Comparison(helper.FIELDS[0], 'lt', midpoint),
            _opt.Comparison(helper.FIELDS[0], 'gt', midpoint),
        ])
        selection = self._select(predicate)
        self.assertEqual(
            helper.num_test_rows - 1, count(selection),
            'incorrect number of rows returned by operator')
def setUp(self):
    """Create a JSONScan over ``payload`` and store it on ``self._op``."""
    scan = _op.JSONScan(object_payload=payload)
    self._op = scan
class TestProjection(unittest.TestCase):
    """Tests for the Project operator over a JSONScan of ``payload``."""

    # Shared by all tests in this class; created once at class definition.
    _child = _op.JSONScan(object_payload=payload)

    def test_simple_projection_description(self):
        """Projecting one attribute yields a one-column description."""
        attributes = ('property_1', )
        description = _op.Project(self._child, attributes).description
        self.assertIsNotNone(description, 'description is None')
        column_definitions = description['column_definitions']
        self.assertIsNotNone(column_definitions, 'column_definitions is None')
        self.assertEqual(
            len(description['column_definitions']),
            len(attributes),
            'incorrect number of columns in description')

    def test_simple_projection_iter(self):
        """Iterating a projection yields one dict per payload row."""
        attributes = ('property_1', )
        projected = _op.Project(self._child, attributes)
        iterator = iter(projected)
        self.assertIsNotNone(iterator, 'must return an iterable')
        produced = list(iterator)
        self.assertEqual(
            len(produced), len(payload),
            'did not return correct number of rows')
        first_row = produced[0]
        self.assertTrue(
            isinstance(first_row, dict), 'row is not a dictionary')
        self.assertEqual(
            len(produced[0].keys()), len(attributes),
            'did not project correct number of attributes')

    def test_project_all_attributes(self):
        """AllAttributes projects as many columns as payload[0] has keys."""
        attributes = (_opt.AllAttributes(), )
        description = _op.Project(self._child, attributes).description
        self.assertEqual(
            len(description['column_definitions']),
            len(payload[0].keys()),
            'did not project all attributes')

    def test_project_and_rename_same_attribute_twice(self):
        """Alias ``property_1`` twice, as ``name`` and as ``synonyms``.

        Both aliases must appear in the first output tuple and in the
        description's column names; ``RID`` must not be in the tuple.
        """
        aliases = (
            _opt.AttributeAlias(name='property_1', alias='name'),
            _opt.AttributeAlias(name='property_1', alias='synonyms'),
        )
        projected = _op.Project(self._child, aliases)
        first_row = list(projected)[0]
        for alias in ('name', 'synonyms'):
            self.assertIn(alias, first_row)
        self.assertNotIn('RID', first_row)

        column_names = [
            definition['name']
            for definition in projected.description['column_definitions']
        ]
        logger.debug(column_names)
        for alias in ['name', 'synonyms']:
            self.assertIn(
                alias, column_names,
                "column missing in projected relation's description")

    def test_project_introspect_RID(self):
        """The introspected key column is named ``<table_name>_RID``."""
        attributes = (
            _opt.IntrospectionFunction(_util.introspect_key_fn),
            'property_1',
        )
        projected = _op.Project(self._child, attributes)
        renamed_rid = self._child.description['table_name'] + "_RID"
        # Kept as a list so every column definition is inspected.
        name_matches = [
            definition['name'] == renamed_rid
            for definition in projected.description['column_definitions']
        ]
        self.assertTrue(
            any(name_matches), "'RID' not renamed to '%s'" % renamed_rid)

    def test_project_preserve_unique_on_rid(self):
        """Projecting ``RID`` keeps a key whose only column is ``RID``."""
        projected = _op.Project(self._child, ('RID', ))
        unique_column_sets = [
            key['unique_columns'] for key in projected.description['keys']
        ]
        # Kept as a list so every key is inspected, not just up to a match.
        rid_only_flags = [
            len(columns) == 1 and columns[0] == 'RID'
            for columns in unique_column_sets
        ]
        self.assertTrue(
            any(rid_only_flags),
            'could not find a key defined on (RID) when RID was projected from child relation'
        )