def init(**tables):
    """
    Build and return a DataSet with its servers and views registered.

    The keyword arguments in ``tables`` are forwarded unchanged to
    ``DiscoDBServer``. Two ``FileServer`` instances are added as well,
    one declaring ``common_crawl`` and one declaring ``top_sites``; both
    use the ``data`` directory next to this module as ``root_dir``.

    Views created, in order: ``top_10`` (first 10 rows of ``top_sites``),
    ``outbound_links``, ``scripts`` and ``servers``.
    """
    # Both file-backed tables read from the same directory.
    data_dir = join(dirname(__file__), 'data')

    crawl_table = {
        'root_dir': data_dir,
        'pattern': "sample.arc.gz",
        'decode': "application/x-arc",
        # description (disabled): "Raw documents from http://commoncrawl.org"
    }

    sites_table = {
        # description (disabled): "Top Sites as reported by Alexa"
        'root_dir': data_dir,
        'pattern': "alexa-top1m-{date}.csv",
        'decode': "auto",
        'schema': {
            'fields': [
                {'name': "date", 'type': "DATE"},
                {'name': "rank", 'type': "STRING"},
                {'name': "site", 'type': "STRING"},
            ],
        },
    }

    # Aggregate views over the flattened ``docs`` table, as (name, SQL) pairs.
    view_queries = (
        (
            'outbound_links',
            "select link_to, count() "
            "from flatten(docs, 'link_to') "
            "group by link_to order by count desc",
        ),
        (
            'scripts',
            "select scripts as script, count() "
            "from flatten(docs, 'scripts') "
            "group by script order by count desc",
        ),
        (
            'servers',
            "select headers_value as server_name, count() "
            "from flatten(docs, 'headers') "
            "where headers_name = 'Server' "
            "group by server_name order by count desc",
        ),
    )

    dataset = DataSet()
    dataset.add_server(DiscoDBServer(**tables))
    dataset.add_server(FileServer(common_crawl=crawl_table))
    dataset.add_server(FileServer(top_sites=sites_table))

    top_sites = dataset.frm('top_sites')
    top_sites.limit(10).create_view('top_10')

    for view_name, sql in view_queries:
        dataset.create_view(view_name, sql)

    return dataset
def test_complier():
    """
    Execute a query on a DataSet whose compiler is replaced by a stub.

    The stub ignores the query and yields a callable that returns a
    ``Table`` named ``'results!'`` bound to the mock adapter. The test
    makes no assertions; it passes as long as nothing raises.
    """
    mock_adapter = MockAdapter()

    def stub_compiler(query):
        # The compiled "plan" disregards its context and parameters and
        # builds a fresh single-column schema on every invocation.
        def produce_table(ctx, *params):
            result_schema = {
                'fields': [{'name': "?column?", 'type': "INTEGER"}],
            }
            return Table(mock_adapter, 'results!', schema=result_schema)

        return produce_table

    ds = DataSet()
    ds.add_adapter(mock_adapter)
    ds.set_compiler(stub_compiler)

    table = ds.execute(ds.frm('bogus').query)
def test_complier():
    """
    Smoke-test ``DataSet.execute`` with a custom compiler installed.

    A fake compiler is registered that maps any query to a callable
    returning a ``Table`` called ``'results!'`` on the mock adapter.
    The query for the ``'bogus'`` table is then executed. Nothing is
    asserted about the result.
    """
    adapter = MockAdapter()

    def fake_compile(query):
        # ``query`` is deliberately unused: every query compiles to the
        # same canned single-column result.
        return lambda ctx, *params: Table(
            adapter,
            'results!',
            schema={"fields": [{"name": "?column?", "type": "INTEGER"}]},
        )

    dataset = DataSet()
    dataset.add_adapter(adapter)
    dataset.set_compiler(fake_compile)

    bogus_query = dataset.frm('bogus').query
    dataset.execute(bogus_query)