def test_cross_join():
  dataset = DataSet()
  dataset.add_adapter(EmployeeAdapter())

  q = Query(dataset, JoinOp(LoadOp('employees'), LoadOp('employees')))
  evaluate = compile(q)

  eq_(len(list(evaluate(dict(dataset=dataset)))), 9)
def test_replace_views():
  dataset = DataSet()
  adapter = dataset.add_adapter(MockAdapter())

  no_managers = SelectionOp(
    LoadOp('bogus'),
    IsOp(Var('manager_id'), NullConst())
  )

  dataset.create_view('no_managers', no_managers)

  view = AliasOp('no_managers', no_managers)

  compare(
    replace_views(LoadOp('no_managers'), dataset),
    view
  )

  compare(
    replace_views(
      JoinOp(LoadOp('no_managers'), LoadOp('no_managers')),
      dataset
    ),
    JoinOp(view, view)
  )
def test_function_in_from():
  dataset = DataSet()
  dataset.add_adapter(EmployeeAdapter())

  q = Query(
    dataset,
    ProjectionOp(
      Function('flatten', LoadOp('employees'), StringConst('roles')),
      Var('manager_id'),
      Var('roles')
    )
  )
  evaluate = compile(q)

  assert_sequence_equal(
    list(evaluate(dict(dataset=dataset))),
    [(1234, 'sales'), (1234, 'marketing')]
  )

  q = Query(
    dataset,
    Function('flatten', LoadOp('employees'), StringConst('roles'))
  )
  evaluate = compile(q)

  assert_sequence_equal(
    list(evaluate(dict(dataset=dataset))),
    [
      [8901, 'Mark Markty', date(2010, 3, 1), 1234, 'sales'],
      [8901, 'Mark Markty', date(2010, 3, 1), 1234, 'marketing']
    ]
  )
def test_selection():
  dataset = DataSet()
  dataset.add_adapter(EmployeeAdapter())

  q = Query(
    dataset,
    SelectionOp(LoadOp('employees'), EqOp(Var('manager_id'), NullConst()))
  )
  evaluate = compile(q)

  assert_sequence_equal(
    list(evaluate(dict(dataset=dataset))),
    [
      (1234, 'Tom Tompson', date(2009, 1, 17), None, ()),
    ]
  )

  q = Query(
    dataset,
    SelectionOp(LoadOp('employees'), NotOp(EqOp(Var('manager_id'), NullConst())))
  )
  evaluate = compile(q)

  assert_sequence_equal(
    list(evaluate(dict(dataset=dataset))),
    [
      (4567, 'Sally Sanders', date(2010, 2, 24), 1234, ()),
      (8901, 'Mark Markty', date(2010, 3, 1), 1234, ('sales', 'marketing'))
    ]
  )
def test_self_join_with_projection():
  dataset = DataSet()
  dataset.add_adapter(EmployeeAdapter())

  q = Query(
    dataset,
    ProjectionOp(
      JoinOp(
        AliasOp('manager', LoadOp('employees')),
        AliasOp('employee', LoadOp('employees')),
        EqOp(Var('manager.employee_id'), Var('employee.manager_id'))
      ),
      SelectAllExpr('employee'),
      RenameOp('manager', Var('manager.full_name'))
    )
  )
  evaluate = compile(q)

  assert_sequence_equal(
    list(evaluate(dict(dataset=dataset))),
    [
      (4567, 'Sally Sanders', date(2010, 2, 24), 1234, (), 'Tom Tompson'),
      (8901, 'Mark Markty', date(2010, 3, 1), 1234, ('sales', 'marketing'), 'Tom Tompson')
    ]
  )
def test_decorator_function_calls():
  dataset = DataSet()
  dataset.add_adapter(EmployeeAdapter())

  @dataset.function(returns=dict(name="initials", type="STRING"))
  def initials(name):
    if name:
      return ''.join([p[0] for p in name.split()])
    else:
      return None

  q = Query(
    dataset,
    ProjectionOp(LoadOp('employees'), Function('initials', Var('full_name')))
  )
  evaluate = compile(q)

  assert_sequence_equal(
    list(evaluate(dict(dataset=dataset))),
    [('TT',), ('SS',), ('MM',)]
  )
def test_query_builder():
  dataset = DataSet()
  adapter = MockAdapter()
  dataset.add_adapter(adapter)

  query = dataset.select('x')
  eq_(isinstance(query, QueryBuilder), True)
  eq_(query.dataset, dataset)
  eq_(query.column_exps, 'x')
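# A sketch for orientation, not part of the original suite: once a source
# relation is supplied with frm(), the builder chain above is expected to
# describe the same kind of op tree the other tests build by hand. test_views
# below asserts exactly this shape (wrapped in an AliasOp) for
# dataset.select('x').frm('bogus'); the constant name here is hypothetical.
EXPECTED_SELECT_X_FROM_BOGUS = ProjectionOp(LoadOp('bogus'), Var('x'))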
def test_aggregation_whole_table():
  dataset = DataSet()
  dataset.add_adapter(EmployeeAdapter())

  q = Query(dataset, ProjectionOp(LoadOp('employees'), Function('count')))
  evaluate = compile(q)

  assert_sequence_equal(
    list(evaluate(dict(dataset=dataset))),
    [(3,)]
  )
def test_paramgetter():
  dataset = DataSet()
  dataset.add_adapter(EmployeeAdapter())

  q = Query(dataset, ProjectionOp(LoadOp(''), ParamGetterOp(0)))
  evaluate = compile(q)

  assert_sequence_equal(
    list(evaluate(dict(dataset=dataset, params=('foo',)))),
    [('foo',)]
  )
def test_limit():
  dataset = DataSet()
  dataset.add_adapter(EmployeeAdapter())

  q = Query(dataset, SliceOp(LoadOp('employees'), 1))
  evaluate = compile(q)

  assert_sequence_equal(
    list(evaluate(dict(dataset=dataset))),
    [
      (1234, 'Tom Tompson', date(2009, 1, 17), None, ()),
    ]
  )
def test_offset_and_limit():
  dataset = DataSet()
  dataset.add_adapter(EmployeeAdapter())

  q = Query(dataset, SliceOp(LoadOp('employees'), 1, 2))
  evaluate = compile(q)

  assert_sequence_equal(
    list(evaluate(dict(dataset=dataset))),
    [
      (4567, 'Sally Sanders', date(2010, 2, 24), 1234, ()),
    ]
  )
def test_views():
  dataset = DataSet()
  adapter = dataset.add_adapter(MockAdapter())

  # create a view off of an existing table
  dataset.select('x').frm('bogus').create_view('only_x')

  view = dataset.get_view('only_x')
  eq_(view, AliasOp('only_x', ProjectionOp(LoadOp('bogus'), Var('x'))))

  # create a view off of a view
  dataset.select('x').frm('only_x').create_view('only_x_from_x')

  view = dataset.get_view('only_x_from_x')
  compare(
    view,
    # Todo: implement a query optimizer that eliminates
    # redundant projection ops like the one we see below
    AliasOp(
      'only_x_from_x',
      ProjectionOp(
        AliasOp('only_x', ProjectionOp(LoadOp('bogus'), Var('x'))),
        Var('x')
      )
    )
  )
def test_projection():
  dataset = DataSet()
  dataset.add_adapter(EmployeeAdapter())

  q = Query(dataset, ProjectionOp(LoadOp('employees'), Var('full_name')))
  evaluate = compile(q)

  assert_sequence_equal(
    list(evaluate(dict(dataset=dataset))),
    [('Tom Tompson',), ('Sally Sanders',), ('Mark Markty',)]
  )
def test_offset():
  dataset = DataSet()
  dataset.add_adapter(EmployeeAdapter())

  q = Query(dataset, SliceOp(LoadOp('employees'), 1, None))
  evaluate = compile(q)

  assert_sequence_equal(
    list(evaluate(dict(dataset=dataset))),
    [
      (4567, 'Sally Sanders', date(2010, 2, 24), 1234, ()),
      (8901, 'Mark Markty', date(2010, 3, 1), 1234, ('sales', 'marketing')),
    ]
  )
def test_order_by_asc():
  dataset = DataSet()
  dataset.add_adapter(EmployeeAdapter())

  q = Query(dataset, OrderByOp(LoadOp('employees'), Asc(Var('employee_id'))))
  evaluate = compile(q)

  assert_sequence_equal(
    list(evaluate(dict(dataset=dataset))),
    [
      (1234, 'Tom Tompson', date(2009, 1, 17), None, ()),
      (4567, 'Sally Sanders', date(2010, 2, 24), 1234, ()),
      (8901, 'Mark Markty', date(2010, 3, 1), 1234, ('sales', 'marketing')),
    ]
  )
def test_get_relation():
  dataset = DataSet()
  adapter = MockAdapter()
  dataset.add_adapter(adapter)

  s_table = adapter.get_relation('bogus')
  table = dataset.get_relation('bogus')
  eq_(table, s_table)

  assert_sequence_equal(dataset.relations, [('bogus', s_table)])
def test_addition():
  dataset = DataSet()
  dataset.add_adapter(EmployeeAdapter())

  q = Query(
    dataset,
    ProjectionOp(LoadOp('employees'), AddOp(Var('employee_id'), NumberConst(1)))
  )
  evaluate = compile(q)

  assert_sequence_equal(
    list(evaluate(dict(dataset=dataset))),
    [(1235,), (4568,), (8902,)]
  )
def test_aggregation_on_column():
  dataset = DataSet()
  dataset.add_adapter(EmployeeAdapter())

  q = Query(
    dataset,
    GroupByOp(
      ProjectionOp(LoadOp('employees'), Var('manager_id'), Function('count')),
      Var('manager_id')
    )
  )
  evaluate = compile(q)

  assert_sequence_equal(
    list(evaluate(dict(dataset=dataset))),
    [(None, 1), (1234, 2)]
  )
def test_replace_view_within_a_view():
  dataset = DataSet()
  adapter = dataset.add_adapter(MockAdapter())

  dataset.create_view('view1', LoadOp('bogus'))
  dataset.create_view('view2', LoadOp('view1'))
  dataset.create_view(
    'view3',
    SelectionOp(LoadOp('view2'), IsOp(Var('x'), NullConst()))
  )

  v1 = replace_views(LoadOp('view3'), dataset)

  compare(
    v1,
    AliasOp(
      'view3',
      SelectionOp(
        AliasOp('view2', AliasOp('view1', LoadOp('bogus'))),
        IsOp(Var('x'), NullConst())
      )
    )
  )
def test_self_join():
  dataset = DataSet()
  dataset.add_adapter(EmployeeAdapter())

  q = Query(
    dataset,
    JoinOp(
      AliasOp('employee', LoadOp('employees')),
      AliasOp('manager', LoadOp('employees')),
      EqOp(Var('manager.employee_id'), Var('employee.manager_id'))
    )
  )
  evaluate = compile(q)

  assert_sequence_equal(
    list(evaluate(dict(dataset=dataset))),
    [
      (4567, 'Sally Sanders', date(2010, 2, 24), 1234, (),
       1234, 'Tom Tompson', date(2009, 1, 17), None, ()),
      (8901, 'Mark Markty', date(2010, 3, 1), 1234, ('sales', 'marketing'),
       1234, 'Tom Tompson', date(2009, 1, 17), None, ())
    ]
  )
def test_sample_db(self):
  sample_path = os.path.join(os.path.dirname(__file__), "sample.discodb")

  dataset = DataSet()
  server = DiscoDBServer(docs=sample_path)

  results = list(server.evaluate(LoadOp('docs')))
  eq_(len(results), 150)
def test_discodb_server(self):
  dataset = DataSet()
  server = DiscoDBServer(users=self.db)

  # return all records from the users table
  results = server.evaluate(LoadOp('users'))

  assert_sequence_equal(
    list(results),
    [
      ('John', 2),
      ('Bob', 1),
    ]
  )
def test_get_schema():
  class Adapter(object):
    def has(self, name):
      return name == 'computed'

    def evaluate(self, loc):
      return loc.replace(Function('myschema'))

  dataset = DataSet()
  dataset.add_adapter(Adapter())

  # Todo: figure out why I have to invoke this decorator here
  @dataset.function(
    returns=lambda: Schema([dict(name='field', type='string')])
  )
  def myschema(ctx):
    pass

  schema = dataset.get_schema('computed')
  eq_(schema, Schema([dict(name='field', type='string')]))
def test_compiler():
  adapter = MockAdapter()

  # stub compiler that ignores the query and returns a canned Table
  def compile(query):
    return lambda ctx, *params: Table(
      adapter,
      'results!',
      schema=dict(fields=[dict(name="?column?", type="INTEGER")])
    )

  dataset = DataSet()
  dataset.add_adapter(adapter)
  dataset.set_compiler(compile)

  # executing the query should run through the custom compiler
  query = dataset.frm('bogus').query
  table = dataset.execute(query)
def test_projection_wo_relation():
  """
  Equivalent to a SELECT statement without a FROM clause in SQL adapters.

  select 1;

  | col1 |
  +------+
  |    1 |
  +------+
  """
  dataset = DataSet()

  q = Query(dataset, ProjectionOp(LoadOp(''), NumberConst(1)))
  evaluate = compile(q)

  assert_sequence_equal(
    list(evaluate(dict(dataset=dataset))),
    [(1,)]
  )
def mock_data_set():
  dataset = DataSet()
  dataset.add_adapter(MockAdapter())
  dataset.add_adapter(EmployeeAdapter())
  return dataset
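# For reference: the employee rows the EmployeeAdapter fixture is assumed to
# expose, reconstructed from the expected results above (test_offset,
# test_order_by_asc). The adapter itself is defined elsewhere in the suite;
# this literal and its name are only an illustration. Columns, in order:
# employee_id, full_name, a date column (name not shown in these tests),
# manager_id, roles.
ASSUMED_EMPLOYEE_ROWS = [
  (1234, 'Tom Tompson',   date(2009, 1, 17), None, ()),
  (4567, 'Sally Sanders', date(2010, 2, 24), 1234, ()),
  (8901, 'Mark Markty',   date(2010, 3, 1),  1234, ('sales', 'marketing')),
]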
def init(**tables):
  """
  Return a DataSet backed by the discodb tables passed as keyword arguments.
  """
  dataset = DataSet()
  dataset.add_server(DiscoDBServer(**tables))

  dataset.add_server(FileServer(
    common_crawl=dict(
      root_dir=join(dirname(__file__), 'data'),
      pattern="sample.arc.gz",
      decode="application/x-arc",
      #description="Raw documents from http://commoncrawl.org"
    )
  ))

  dataset.add_server(FileServer(
    top_sites=dict(
      #description="Top Sites as reported by Alexa",
      root_dir=join(dirname(__file__), 'data'),
      pattern="alexa-top1m-{date}.csv",
      decode="auto",
      schema=dict(
        fields=[
          dict(name="date", type="DATE"),
          dict(name="rank", type="STRING"),
          dict(name="site", type="STRING")
        ]
      )
    )
  ))

  dataset.frm('top_sites').limit(10).create_view('top_10')

  dataset.create_view(
    'outbound_links',
    "select link_to, count() "
    "from flatten(docs, 'link_to') "
    "group by link_to order by count desc"
  )

  dataset.create_view(
    'scripts',
    "select scripts as script, count() "
    "from flatten(docs, 'scripts') "
    "group by script order by count desc"
  )

  dataset.create_view(
    'servers',
    "select headers_value as server_name, count() "
    "from flatten(docs, 'headers') "
    "where headers_name = 'Server' "
    "group by server_name order by count desc"
  )

  return dataset
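# Usage sketch (an assumption, mirroring the frm()/.query/execute() patterns
# used in test_compiler above; the 'sample.discodb' path and function name are
# hypothetical):
def _init_usage_sketch():
  # init() forwards keyword args to DiscoDBServer, so 'docs' becomes a table
  ds = init(docs='sample.discodb')
  # build a query against the 'top_10' view and hand it to the dataset to run
  return ds.execute(ds.frm('top_10').query)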
def test_query():
  dataset = DataSet()
  dataset.query('select 1').execute()