def test_query_field_from_path():
    """
    A SelectionOp whose predicate only touches fields parsed from the
    directory structure is expected to be pushed below the 'decode' call,
    so the file list is filtered before any file is opened or decoded.
    """
    employees_config = {
        "root_dir": "/",
        "pattern": "{department}",
        "filename_column": "path",
        "decode": "auto",
        "schema": TEST_SCHEMA,
    }
    adapter = DirAdapter(employees=employees_config)

    query = SelectionOp(
        LoadOp('employees'),
        EqOp(Var('department'), Const('sales')),
    )
    start = query_zipper(query).leftmost_descendant()
    result = adapter.evaluate(start)

    relation = adapter.get_relation('employees')
    actual = result.root()

    # Build the expected tree from the inside out.
    file_listing = Function('files', Const(relation.root_dir))
    path_fields = Function(
        'extract_path',
        file_listing,
        Const(relation.root_dir + "{department}"),
    )
    filtered_paths = SelectionOp(
        path_fields,
        EqOp(Var('department'), Const('sales')),
    )
    expected = Function(
        'decode',
        filtered_paths,
        Const('auto'),
        Const(TEST_SCHEMA),
        Const('path'),
    )

    compare(actual, expected)
def test_query_field_in_payload():
    """
    A predicate on a field that lives inside the file payload is expected to
    stay outside the 'decode' call: the LoadOp becomes
    SelectionOp(Function('decode', Function('extract_path', Function('files'))), ...).
    """
    employees_config = {
        "root_dir": "/",
        "pattern": "{department}",
        "filename_column": "path",
        "decode": "auto",
        "schema": TEST_SCHEMA,
    }
    adapter = DirAdapter(employees=employees_config)

    query = SelectionOp(
        LoadOp('employees'),
        GeOp(Var('salary'), Const(40000)),
    )
    start = query_zipper(query).leftmost_descendant()
    result = adapter.evaluate(start)

    relation = adapter.get_relation('employees')
    actual = result.root()

    # Build the expected tree from the inside out.
    file_listing = Function('files', Const(relation.root_dir))
    path_fields = Function(
        'extract_path',
        file_listing,
        Const(relation.root_dir + "{department}"),
    )
    decoded = Function(
        'decode',
        path_fields,
        Const('auto'),
        Const(TEST_SCHEMA),
        Const('path'),
    )
    expected = SelectionOp(decoded, GeOp(Var('salary'), Const(40000)))

    compare(actual, expected)
def compare(op1, op2):
    """
    Walk two operation trees in lockstep, in post-order, and check that each
    pair of visited nodes matches.

    A pair matches when ``compare_relation`` accepts it or the two nodes are
    equal with ``==``. The first mismatching pair raises
    ``NodeDiffException(n1, n2)``. When either traversal reaches its end the
    other one must be at its end as well (checked with ``assert``).
    """
    left = query_zipper(op1).leftmost_descendant()
    right = query_zipper(op2).leftmost_descendant()

    while True:
        left_node = left.node()
        right_node = right.node()

        matches = compare_relation(left_node, right_node) or left_node == right_node
        if not matches:
            raise NodeDiffException(left_node, right_node)

        left_done = left.at_end()
        right_done = right.at_end()
        if not (left_done or right_done):
            # Neither traversal is finished: step both cursors forward.
            left = left.postorder_next()
            right = right.postorder_next()
            continue

        # If either traversal is finished, both must be.
        assert left_done == right_done
        return
def test_query_field_from_path_and_contents():
    """
    A SelectionOp that references both a field parsed from the directory
    structure and a field from the file contents is expected to be split:
    the path predicate filters the file list before decoding, and the
    content predicate is applied to the decoded result.
    """
    schema = {
        "fields": [
            {"type": 'STRING', "name": 'department'},
            {"type": 'INTEGER', "name": 'id'},
            {"type": 'STRING', "name": 'full_name'},
            {"type": 'INTEGER', "name": 'salary'},
            {"type": 'INTEGER', "name": 'manager_id'},
        ],
    }
    employees_config = {
        "root_dir": "/",
        "pattern": "{department}",
        "filename_column": "path",
        "decode": "auto",
        "schema": schema,
    }
    adapter = DirAdapter(employees=employees_config)

    predicate = And(
        EqOp(Var('department'), Const('sales')),
        GeOp(Var('salary'), Const(40000)),
    )
    query = SelectionOp(LoadOp('employees'), predicate)
    start = query_zipper(query).leftmost_descendant()
    result = adapter.evaluate(start)

    relation = adapter.get_relation('employees')
    actual = result.root()

    # Build the expected tree from the inside out.
    file_listing = Function('files', Const(relation.root_dir))
    path_fields = Function(
        'extract_path',
        file_listing,
        Const(relation.root_dir + "{department}"),
    )
    filtered_paths = SelectionOp(
        path_fields,
        EqOp(Var('department'), Const('sales')),
    )
    decoded = Function(
        'decode',
        filtered_paths,
        Const('auto'),
        Const(TEST_SCHEMA),
        Const('path'),
    )
    expected = SelectionOp(decoded, GeOp(Var('salary'), Const(40000)))

    compare(actual, expected)
def test_evaluate():
    """
    Evaluating a bare LoadOp against a DirAdapter relation is expected to
    yield an 'extract_path' call wrapped around a 'files' listing of the
    relation's root directory.
    """
    songs_config = {
        "root_dir": path,
        "pattern": "{artist}/{album}/{track}.{ext}",
        "filename_column": "path",
    }
    adapter = DirAdapter(songs=songs_config)
    relation = adapter.get_relation('songs')

    start = query_zipper(LoadOp('songs')).leftmost_descendant()
    result = adapter.evaluate(start)
    actual = result.root()

    file_listing = Function('files', Const(relation.root_dir))
    expected = Function(
        'extract_path',
        file_listing,
        Const(path + "/{artist}/{album}/{track}.{ext}"),
    )

    compare(actual, expected)
# NOTE(review): another `test_evaluate` appears earlier in this view; if both
# definitions live in the same module, this one shadows the earlier one and
# the earlier test never runs -- confirm and rename if so.
def test_evaluate():
    """
    Evaluating a bare LoadOp against an S3Adapter relation is expected to
    yield an 's3_keys' call on the relation's bucket name and prefix.
    """
    adapter = S3Adapter(
        logs=dict(
            bucket="aws-publicdatasets",
            anon=True,
            prefix="/common-crawl/",
            pattern="{timestamp}/{server}",
        )
    )
    relation = adapter.get_relation('logs')

    op = LoadOp('logs')
    loc = query_zipper(op).leftmost_descendant()
    res = adapter.evaluate(loc)

    # The leftover `pdb.set_trace()` breakpoint was removed here: it stopped
    # the test at an interactive prompt, which breaks unattended test runs.
    eq_(
        res.root(),
        Function('s3_keys', Const(relation.bucket_name), Const(relation.prefix)),
    )