async def test_filter_iterator_interrupt():
    expression = "?p = <http://schema.org/eligibleRegion>"
    iterator, card = hdtDoc.search(triple['subject'], triple['predicate'], triple['object'])
    scan = ProjectionIterator(ScanIterator(iterator, triple, card))
    iterator = FilterIterator(scan, expression)
    (results, saved, done, _) = await engine.execute(iterator, 10e-7, 2)
    assert len(results) <= 4
    for b in results:
        assert b['?p'] == 'http://schema.org/eligibleRegion'
        assert b['?o'] in [
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country0',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country1',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country4',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country9'
        ]
    tmp = len(results)
    reloaded = load(saved.SerializeToString(), DummyDataset(hdtDoc, 'watdiv100'))
    (results, saved, done, _) = await engine.execute(reloaded, 10e7)
    assert len(results) + tmp == 4
    for b in results:
        assert b['?p'] == 'http://schema.org/eligibleRegion'
        assert b['?o'] in [
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country0',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country1',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country4',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country9'
        ]
    assert done
async def test_filter_iterator_interrupt():
    context = { 'quantum': 10e-7, 'max_results': 10e7 }
    expression = "?p = <http://schema.org/eligibleRegion>"
    scan = ProjectionIterator(ScanIterator(hdtDoc, triple, context), context)
    iterator = FilterIterator(scan, expression, context)
    (results, saved, done, _) = await engine.execute(iterator, context)
    assert len(results) <= 4
    for b in results:
        assert b['?p'] == 'http://schema.org/eligibleRegion'
        assert b['?o'] in [
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country0',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country1',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country4',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country9'
        ]
    tmp = len(results)
    context['quantum'] = 10e7
    reloaded = load(saved.SerializeToString(), DummyDataset(hdtDoc, 'watdiv100'), context)
    (results, saved, done, _) = await engine.execute(reloaded, context)
    assert len(results) + tmp == 4
    for b in results:
        assert b['?p'] == 'http://schema.org/eligibleRegion'
        assert b['?o'] in [
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country0',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country1',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country4',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country9'
        ]
    assert done
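# --- Editor's illustration (plain Python, not the SaGe API) ---
# A minimal, self-contained sketch of the suspend/resume contract exercised by
# the two interrupt tests above: run under a budget, save the position, then
# resume from the saved state. The function name and the dict-based "saved
# state" are made up for illustration only.
def toy_execute(source, saved_state, budget):
    # consume at most `budget` items, starting from the saved position
    results = []
    while saved_state['pos'] < len(source) and len(results) < budget:
        results.append(source[saved_state['pos']])
        saved_state['pos'] += 1
    done = saved_state['pos'] == len(source)
    return results, saved_state, done

toy_data = ['Country0', 'Country1', 'Country4', 'Country9']
first, state, done = toy_execute(toy_data, {'pos': 0}, budget=2)   # interrupted
rest, state, done = toy_execute(toy_data, state, budget=10)        # resumed
assert len(first) + len(rest) == 4 and done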
async def test_operation_filter_iterator():
    context = { 'quantum': 10e7, 'max_results': 10e7 }
    expression = "10 = 5 * 2"
    scan = ProjectionIterator(ScanIterator(hdtDoc, triple, context), context)
    iterator = FilterIterator(scan, expression, context)
    (results, saved, done, _) = await engine.execute(iterator, context)
    assert len(results) == 9
def build_query_plan(query, dataset, default_graph, saved_plan=None):
    """Build a pipeline of iterators used to evaluate a query"""
    cardinalities = []
    if saved_plan is not None:
        return load(saved_plan, dataset), []
    root = None
    if query['type'] == 'union':
        root, cardinalities = build_union_plan(query['union'], dataset, default_graph)
    elif query['type'] == 'bgp':
        root, cardinalities = build_join_plan(query['bgp'], dataset, default_graph)
    else:
        raise Exception('Unknown query type found during query optimization')
    # apply (possible) filter clause(s)
    if 'filters' in query and len(query['filters']) > 0:
        # exclude empty strings
        filters = list(filter(lambda x: len(x) > 0, query['filters']))
        if len(filters) > 0:
            # reduce all filters into a single conjunctive expression
            expression = reduce(lambda x, y: "({}) && ({})".format(x, y), filters)
            root = FilterIterator(root, expression)
    return root, cardinalities
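# --- Editor's illustration ---
# A standalone sketch of the conjunctive reduction performed above: several
# FILTER expressions are folded into a single expression evaluated by one
# FilterIterator. The filter strings below are made up for illustration.
from functools import reduce

toy_filters = ["?price > 10", "?price < 100", "isLiteral(?label)"]
conjunction = reduce(lambda x, y: "({}) && ({})".format(x, y), toy_filters)
assert conjunction == "((?price > 10) && (?price < 100)) && (isLiteral(?label))"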
async def test_function_filter_iterator():
    expression = '?p = <http://purl.org/goodrelations/price> && isLiteral(?o) && !isNumeric(?o)'
    iterator, card = hdtDoc.search(triple['subject'], triple['predicate'], triple['object'])
    scan = ProjectionIterator(ScanIterator(iterator, triple, card))
    iterator = FilterIterator(scan, expression)
    (results, saved, done, _) = await engine.execute(iterator, math.inf)
    assert len(results) == 1
async def test_operation_filter_iterator():
    expression = "10 = 5 * 2"
    iterator, card = hdtDoc.search(triple['subject'], triple['predicate'], triple['object'])
    scan = ProjectionIterator(ScanIterator(iterator, triple, card))
    iterator = FilterIterator(scan, expression)
    (results, saved, done, _) = await engine.execute(iterator, math.inf)
    assert len(results) == 9
def load_filter(saved_plan, dataset):
    """Load a FilterIterator from a protobuf serialization"""
    sourceField = saved_plan.WhichOneof('source')
    source = load(getattr(saved_plan, sourceField), dataset)
    mu = None
    if len(saved_plan.mu) > 0:
        mu = saved_plan.mu
    return FilterIterator(source, saved_plan.expression, mu=mu)
async def test_and_or_filter_iterator():
    context = { 'quantum': 10e7, 'max_results': 10e7 }
    expression = "?p = <http://schema.org/eligibleRegion> && (?o = <http://db.uwaterloo.ca/~galuc/wsdbm/Country0> || ?o = <http://db.uwaterloo.ca/~galuc/wsdbm/Country9>)"
    scan = ProjectionIterator(ScanIterator(hdtDoc, triple, context), context)
    iterator = FilterIterator(scan, expression, context)
    (results, saved, done, _) = await engine.execute(iterator, context)
    assert len(results) == 2
    for b in results:
        assert b['?p'] == 'http://schema.org/eligibleRegion'
        assert b['?o'] in [
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country0',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country9'
        ]
async def test_and_or_filter_iterator():
    expression = "?p = <http://schema.org/eligibleRegion> && (?o = <http://db.uwaterloo.ca/~galuc/wsdbm/Country0> || ?o = <http://db.uwaterloo.ca/~galuc/wsdbm/Country9>)"
    iterator, card = hdtDoc.search(triple['subject'], triple['predicate'], triple['object'])
    scan = ProjectionIterator(ScanIterator(iterator, triple, card))
    iterator = FilterIterator(scan, expression)
    (results, saved, done, _) = await engine.execute(iterator, math.inf)
    assert len(results) == 2
    for b in results:
        assert b['?p'] == 'http://schema.org/eligibleRegion'
        assert b['?o'] in [
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country0',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country9'
        ]
def load_filter(saved_plan: SavedFilterIterator, dataset: Dataset, context: dict) -> PreemptableIterator:
    """Load a FilterIterator from a protobuf serialization.

    Args:
      * saved_plan: Saved query execution plan.
      * dataset: RDF dataset used to execute the plan.
      * context: Information about the query execution.

    Returns:
      The pipeline of iterators used to continue query execution.
    """
    sourceField = saved_plan.WhichOneof('source')
    source = load(getattr(saved_plan, sourceField), dataset, context)
    return FilterIterator(source, saved_plan.expression, context)
def load_filter(saved_plan: SavedFilterIterator, dataset: Dataset) -> PreemptableIterator:
    """Load a FilterIterator from a protobuf serialization.

    Args:
      * saved_plan: Saved query execution plan.
      * dataset: RDF dataset used to execute the plan.

    Returns:
      The pipeline of iterators used to continue query execution.
    """
    sourceField = saved_plan.WhichOneof('source')
    source = load(getattr(saved_plan, sourceField), dataset)
    mu = None
    if len(saved_plan.mu) > 0:
        mu = saved_plan.mu
    return FilterIterator(source, saved_plan.expression, mu=mu)
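# --- Editor's illustration (plain Python, not protobuf) ---
# A toy sketch of the oneof-based dispatch used by the load_filter variants
# above: a saved FilterIterator stores exactly one child plan under a 'source'
# oneof, and the loader reloads whichever field is set before wrapping it in a
# new FilterIterator. The dict stand-in below mimics protobuf's WhichOneof for
# illustration only.
def toy_which_oneof(saved_plan, oneof_name):
    # return the name of the single field that is set in the oneof
    return next(name for name, value in saved_plan[oneof_name].items() if value is not None)

toy_saved_plan = {'source': {'scan': {'triple': '?s ?p ?o'}, 'join': None}}
assert toy_which_oneof(toy_saved_plan, 'source') == 'scan'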
def build_left_join_tree(
    bgp: List[Dict[str, str]],
    dataset: Dataset,
    default_graph: str,
    as_of: Optional[datetime] = None
) -> Tuple[PreemptableIterator, List[str], Dict[str, str]]:
    """Build a left-linear join tree from a Basic Graph Pattern.

    Args:
      * bgp: Basic Graph Pattern used to build the join tree.
      * dataset: RDF dataset on which the BGP is evaluated.
      * default_graph: URI of the default graph used for BGP evaluation.
      * as_of: A timestamp used to perform all reads against a consistent version of the dataset. If `None`, use the latest version of the dataset, which does not guarantee snapshot isolation.

    Returns:
      A tuple (`iterator`, `query_vars`, `cardinalities`) where:
      * `iterator` is the root of the left-linear join tree.
      * `query_vars` is the list of all SPARQL variables found in the BGP.
      * `cardinalities` is the list of estimated cardinalities of all triple patterns in the BGP.
    """
    # gather metadata about triple patterns
    triples = []
    cardinalities = []
    # an empty BGP is uncommon, but happens in queries like INSERT WHERE { BIND ... }
    if len(bgp) == 0:
        return EmptyIterator(), [], []
    # analyze each triple pattern in the BGP
    for triple in bgp:
        # select the graph used to evaluate the pattern
        graph_uri = triple['graph'] if 'graph' in triple and len(triple['graph']) > 0 else default_graph
        triple['graph'] = graph_uri
        # get iterator and statistics about the pattern
        if dataset.has_graph(graph_uri):
            it, c = dataset.get_graph(graph_uri).search(triple['subject'], triple['predicate'], triple['object'], as_of=as_of)
        else:
            it, c = EmptyIterator(), 0
        triples += [{'triple': triple, 'cardinality': c, 'iterator': it}]
        cardinalities += [{'triple': triple, 'cardinality': c}]
    # sort triples by ascending cardinality
    triples = sorted(triples, key=lambda v: v['cardinality'])
    # start the pipeline with a Scan over the most selective pattern
    pattern = triples.pop(0)
    query_vars = get_vars(pattern['triple'])
    # add an equality filter if the pattern has several variables that bind to the same value
    # example: ?s rdf:type ?s => Filter(Scan(?s rdf:type ?s_2), ?s == ?s_2)
    eq_expr, new_pattern = equality_variables(pattern['triple']['subject'], pattern['triple']['predicate'], pattern['triple']['object'])
    if eq_expr is not None:
        # copy the pattern with rewritten values
        triple = pattern['triple'].copy()
        triple["subject"] = new_pattern[0]
        triple["predicate"] = new_pattern[1]
        triple["object"] = new_pattern[2]
        # build a pipeline with an Index Scan followed by an equality filter
        pipeline = ScanIterator(pattern['iterator'], triple, pattern['cardinality'])
        pipeline = FilterIterator(pipeline, eq_expr)
        # update query variables
        query_vars = query_vars | get_vars(triple)
    else:
        pipeline = ScanIterator(pattern['iterator'], pattern['triple'], pattern['cardinality'])
    # build the left-linear tree of joins
    while len(triples) > 0:
        pattern, pos, query_vars = find_connected_pattern(query_vars, triples)
        # no connected pattern = disconnected BGP => pick the first remaining pattern in the BGP
        if pattern is None:
            pattern = triples[0]
            query_vars = query_vars | get_vars(pattern['triple'])
            pos = 0
        graph_uri = pattern['triple']['graph']
        pipeline = IndexJoinIterator(pipeline, pattern['triple'], dataset.get_graph(graph_uri), as_of=as_of)
        triples.pop(pos)
    return pipeline, query_vars, cardinalities
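# --- Editor's illustration ---
# A standalone sketch of the join-ordering heuristic used above: triple
# patterns are sorted by ascending estimated cardinality, so the most
# selective pattern seeds the left-linear pipeline. The patterns and
# cardinality values below are made up for illustration.
toy_triples = [
    {'triple': ('?s', 'foaf:name', '?n'), 'cardinality': 5000},
    {'triple': ('?s', 'rdf:type', 'foaf:Person'), 'cardinality': 120},
    {'triple': ('?s', 'foaf:knows', '?o'), 'cardinality': 800},
]
toy_triples = sorted(toy_triples, key=lambda v: v['cardinality'])
seed = toy_triples.pop(0)  # the most selective pattern starts the pipeline
assert seed['triple'] == ('?s', 'rdf:type', 'foaf:Person')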
def parse_query_node(node, dataset, current_graphs, server_url, cardinalities, renaming_map=None):
    """Recursively parse a node of the query logical plan to build a preemptable physical query execution plan.

    Args:
      * node - Node of the logical plan to parse (in rdflib format)
      * dataset - RDF dataset used to execute the query
      * current_graphs - List of IRIs of the current RDF graphs queried
      * server_url - URL of the SaGe server
      * cardinalities - Map<triple, integer> used to track triple pattern cardinalities
    """
    if node.name == 'SelectQuery':
        # in case of a FROM clause, set the new default graphs used
        graphs = current_graphs
        if node.datasetClause is not None:
            graphs = [format_graph_uri(format_term(graph_iri.default), server_url) for graph_iri in node.datasetClause]
        return parse_query_node(node.p, dataset, graphs, server_url, cardinalities)
    elif node.name == 'Project':
        query_vars = list(map(lambda t: t.n3(), node.PV))
        if node.p.name == 'AggregateJoin' or node.p.name == 'Extend':
            # forward projection variables, as we need them for parsing an AggregateJoin
            node.p['PV'] = query_vars
            return parse_query_node(node.p, dataset, current_graphs, server_url, cardinalities)
        child = parse_query_node(node.p, dataset, current_graphs, server_url, cardinalities)
        return ProjectionIterator(child, dataset, current_graphs[0], query_vars)
    elif node.name == 'BGP':
        triples = list(localize_triple(node.triples, current_graphs))
        # format triple patterns for the backend API
        patterns = []
        for triple in triples:
            graph_uri = triple['graph'] if 'graph' in triple else current_graphs[0]
            graph = dataset.get_graph(graph_uri)
            patterns.append({
                'subject': triple['subject'] if triple['subject'].startswith('?') else graph.get_identifiant(triple['subject']),
                'predicate': triple['predicate'] if triple['predicate'].startswith('?') else graph.get_identifiant(triple['predicate']),
                'object': triple['object'] if triple['object'].startswith('?') else graph.get_identifiant(triple['object']),
                'graph': graph_uri
            })
        iterator, query_vars, c = build_left_plan(patterns, dataset, current_graphs)
        # track cardinalities of every triple pattern
        cardinalities += c
        return iterator
    elif node.name == 'Union':
        left = parse_query_node(node.p1, dataset, current_graphs, server_url, cardinalities)
        right = parse_query_node(node.p2, dataset, current_graphs, server_url, cardinalities)
        return BagUnionIterator(left, right)
    elif node.name == 'Filter':
        expression = parse_filter_expr(node.expr)
        iterator = parse_query_node(node.p, dataset, current_graphs, server_url, cardinalities)
        return FilterIterator(iterator, expression)
    elif node.name == 'Join':
        # only allow joining BGPs from different GRAPH clauses
        triples = fetch_graph_triples(node.p1, current_graphs, server_url)
        triples += fetch_graph_triples(node.p2, current_graphs, server_url)
        iterator, query_vars, c = build_left_plan(triples, dataset, current_graphs)
        # track cardinalities of every triple pattern
        cardinalities += c
        return iterator
    elif node.name == 'Extend':
        # remove all Extend operators, as they are not needed
        current = node
        renaming = dict()
        while current.name == 'Extend':
            renaming[current.expr.n3()] = current.var.n3()
            current = current.p
        current['PV'] = node['PV']
        return parse_query_node(current, dataset, current_graphs, server_url, cardinalities, renaming_map=renaming)
    elif node.name == 'AggregateJoin':
        groupby_variables = list()
        # build GROUP BY variables
        last_groupby_var = None
        if node.p.expr is None:
            # case 1: no explicit GROUP BY, so we group by all variables in the query
            last_groupby_var = list(node.p._vars)[0]
        else:
            # case 2: there is an explicit GROUP BY
            for variable in node.p.expr:
                groupby_variables.append(variable.n3())
                last_groupby_var = variable
        # build aggregators for evaluating SPARQL aggregations (if any)
        aggregators = list()
        for agg in node.A:
            if agg.vars == '*':
                agg.vars = last_groupby_var
            if agg.name != 'Aggregate_Sample':
                aggregators.append(build_aggregator(dataset, agg, renaming_map))
        # build the source iterator from the child node
        source = parse_query_node(node.p.p, dataset, current_graphs, server_url, cardinalities)
        # add the GROUP BY operator (with aggregators) to the pipeline
        source = GroupByAggregator(source, groupby_variables, aggregators=aggregators, max_size=dataset.max_group_by_size)
        # add the projection to the pipeline, depending on the context
        return AggregatesProjectionIterator(source, dataset, current_graphs[0], node.PV)
    else:
        raise UnsupportedSPARQL("Unsupported SPARQL feature: {}".format(node.name))
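# --- Editor's illustration (plain Python, not the SaGe GroupByAggregator) ---
# A minimal sketch of grouping solution mappings by the GROUP BY variable
# before applying aggregators, as the AggregateJoin branch above sets up.
# The bindings below are made up for illustration.
from collections import defaultdict

toy_bindings = [
    {'?city': 'Nantes', '?pop': 300000},
    {'?city': 'Nantes', '?pop': 310000},
    {'?city': 'Lyon', '?pop': 500000},
]
toy_groups = defaultdict(list)
for binding in toy_bindings:
    toy_groups[binding['?city']].append(binding['?pop'])
# e.g., a COUNT aggregation applied per group
toy_counts = {city: len(values) for city, values in toy_groups.items()}
assert toy_counts == {'Nantes': 2, 'Lyon': 1}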
def parse_query_alt(node: dict, dataset: Dataset, current_graphs: List[str], cardinalities: dict, as_of: Optional[datetime] = None) -> PreemptableIterator:
    """Recursively parse a node of the query logical plan to build a preemptable physical query execution plan.

    Args:
      * node: Node of the logical plan to parse (in rdflib format).
      * dataset: RDF dataset used to execute the query.
      * current_graphs: List of IRIs of the current RDF graphs queried.
      * cardinalities: A dict used to track triple pattern cardinalities.
      * as_of: A timestamp used to perform all reads against a consistent version of the dataset. If `None`, use the latest version of the dataset, which does not guarantee snapshot isolation.

    Returns:
      An iterator used to evaluate the input node.

    Throws:
      `UnsupportedSPARQL` if the SPARQL query contains features not supported by the SaGe query engine.
    """
    if node.name == 'SelectQuery':
        # in case of a FROM clause, set the new default graphs used
        graphs = current_graphs
        if node.datasetClause is not None:
            graphs = [format_term(graph_iri.default) for graph_iri in node.datasetClause]
        return parse_query_alt(node.p, dataset, graphs, cardinalities, as_of=as_of)
    elif node.name == 'ConstructQuery':
        graphs = current_graphs
        if node.datasetClause is not None:
            graphs = [format_term(graph_iri.default) for graph_iri in node.datasetClause]
        child = parse_query_alt(node.p, dataset, graphs, cardinalities, as_of=as_of)
        return ConstructIterator(child, convert_construct_template(node.template))
    elif node.name == 'Reduced':
        child = parse_query_alt(node.p, dataset, current_graphs, cardinalities, as_of=as_of)
        return ReducedIterator(child)
    elif node.name == 'Project':
        query_vars = list(map(lambda t: '?' + str(t), node.PV))
        child = parse_query_alt(node.p, dataset, current_graphs, cardinalities, as_of=as_of)
        return ProjectionIterator(child, query_vars)
    elif node.name == 'BGP':
        triples = list(localize_triples(node.triples, current_graphs))
        iterator, query_vars, c = build_left_join_tree(triples, dataset, current_graphs, as_of=as_of)
        # track cardinalities of every triple pattern
        cardinalities += c
        return iterator
    elif node.name == 'Union':
        left = parse_query_alt(node.p1, dataset, current_graphs, cardinalities, as_of=as_of)
        right = parse_query_alt(node.p2, dataset, current_graphs, cardinalities, as_of=as_of)
        return BagUnionIterator(left, right)
    elif node.name == 'Filter':
        expression = parse_filter_expr(node.expr)
        iterator = parse_query_alt(node.p, dataset, current_graphs, cardinalities, as_of=as_of)
        return FilterIterator(iterator, expression)
    elif node.name == 'Extend':
        bgp_iterator = parse_query_alt(node.p, dataset, current_graphs, cardinalities, as_of=as_of)
        expression = parse_bind_expr(node.expr)
        if isinstance(bgp_iterator, EmptyIterator):
            return BindIterator(None, expression, '?' + node.var)
        else:
            return BindIterator(bgp_iterator, expression, '?' + node.var)
    elif node.name == 'Join':
        left = parse_query_alt(node.p1, dataset, current_graphs, cardinalities, as_of=as_of)
        if node.p2.name == 'BGP':
            triples = list(localize_triples(node.p2.triples, current_graphs))
            variables = set(map(lambda t: t.n3(), node.p1._vars))
            iterator, query_vars, c = continue_left_join_tree(left, variables, triples, dataset, current_graphs)
            cardinalities += c
            return iterator
        else:
            raise UnsupportedSPARQL(f"Unsupported SPARQL feature in Join: {node.p2.name}")
    else:
        raise UnsupportedSPARQL(f"Unsupported SPARQL feature: {node.name}")
def parse_query_node(node: dict, dataset: Dataset, current_graphs: List[str], cardinalities: dict, as_of: Optional[datetime] = None) -> PreemptableIterator:
    """Recursively parse a node of the query logical plan to build a preemptable physical query execution plan.

    Args:
      * node: Node of the logical plan to parse (in rdflib format).
      * dataset: RDF dataset used to execute the query.
      * current_graphs: List of IRIs of the current RDF graphs queried.
      * cardinalities: A dict used to track triple pattern cardinalities.
      * as_of: A timestamp used to perform all reads against a consistent version of the dataset. If `None`, use the latest version of the dataset, which does not guarantee snapshot isolation.

    Returns:
      An iterator used to evaluate the input node.

    Throws:
      `UnsupportedSPARQL` if the SPARQL query contains features not supported by the SaGe query engine.
    """
    if node.name == 'SelectQuery':
        # in case of a FROM clause, set the new default graphs used
        graphs = current_graphs
        if node.datasetClause is not None:
            graphs = [format_term(graph_iri.default) for graph_iri in node.datasetClause]
        return parse_query_node(node.p, dataset, graphs, cardinalities, as_of=as_of)
    elif node.name == 'Project':
        query_vars = list(map(lambda t: '?' + str(t), node.PV))
        child = parse_query_node(node.p, dataset, current_graphs, cardinalities, as_of=as_of)
        return ProjectionIterator(child, query_vars)
    elif node.name == 'BGP':
        triples = list(localize_triples(node.triples, current_graphs))
        iterator, query_vars, c = build_left_join_tree(triples, dataset, current_graphs, as_of=as_of)
        # track cardinalities of every triple pattern
        cardinalities += c
        return iterator
    elif node.name == 'Union':
        left = parse_query_node(node.p1, dataset, current_graphs, cardinalities, as_of=as_of)
        right = parse_query_node(node.p2, dataset, current_graphs, cardinalities, as_of=as_of)
        return BagUnionIterator(left, right)
    elif node.name == 'Filter':
        expression = parse_filter_expr(node.expr)
        iterator = parse_query_node(node.p, dataset, current_graphs, cardinalities, as_of=as_of)
        return FilterIterator(iterator, expression)
    elif node.name == 'Join':
        # only allow joining BGPs from different GRAPH clauses
        triples = get_triples_from_graph(node.p1, current_graphs) + get_triples_from_graph(node.p2, current_graphs)
        iterator, query_vars, c = build_left_join_tree(triples, dataset, current_graphs)
        # track cardinalities of every triple pattern
        cardinalities += c
        return iterator
    else:
        raise UnsupportedSPARQL(f"Unsupported SPARQL feature: {node.name}")
def build_left_plan(bgp, dataset, default_graph):
    """Build a left-linear tree of joins from a BGP"""
    # gather metadata about triple patterns
    triples = []
    cardinalities = []
    # analyze each triple pattern in the BGP
    for triple in bgp:
        # select the graph used to evaluate the pattern
        graph_uri = triple['graph'] if 'graph' in triple and len(triple['graph']) > 0 else default_graph
        triple['graph'] = graph_uri
        # get iterator and statistics about the pattern
        if dataset.has_graph(graph_uri):
            it, c = dataset.get_graph(graph_uri).search(triple['subject'], triple['predicate'], triple['object'])
        else:
            it, c = EmptyIterator(), 0
        triples += [{'triple': triple, 'cardinality': c, 'iterator': it}]
        cardinalities += [{'triple': triple, 'cardinality': c}]
    # sort triples by ascending cardinality
    triples = sorted(triples, key=lambda v: v['cardinality'])
    # start the pipeline with a Scan over the most selective pattern
    pattern = triples.pop(0)
    query_vars = get_vars(pattern['triple'])
    # add an equality filter if the pattern has several variables that bind to the same value
    # example: ?s rdf:type ?s => Filter(Scan(?s rdf:type ?s_2), ?s == ?s_2)
    eq_expr, new_pattern = equality_variables(pattern['triple']['subject'], pattern['triple']['predicate'], pattern['triple']['object'])
    if eq_expr is not None:
        # copy the pattern with rewritten values
        triple = pattern['triple'].copy()
        triple["subject"] = new_pattern[0]
        triple["predicate"] = new_pattern[1]
        triple["object"] = new_pattern[2]
        # build a pipeline with an Index Scan followed by an equality filter
        pipeline = ScanIterator(pattern['iterator'], triple, pattern['cardinality'])
        pipeline = FilterIterator(pipeline, eq_expr)
        # update query variables
        query_vars = query_vars | get_vars(triple)
    else:
        pipeline = ScanIterator(pattern['iterator'], pattern['triple'], pattern['cardinality'])
    # build the left-linear tree of joins
    while len(triples) > 0:
        pattern, pos, query_vars = find_connected_pattern(query_vars, triples)
        # no connected pattern = disconnected BGP => pick the first remaining pattern in the BGP
        if pattern is None:
            pattern = triples[0]
            query_vars = query_vars | get_vars(pattern['triple'])
            pos = 0
        graph_uri = pattern['triple']['graph']
        pipeline = IndexJoinIterator(pipeline, pattern['triple'], dataset.get_graph(graph_uri))
        triples.pop(pos)
    return pipeline, query_vars, cardinalities
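# --- Editor's illustration (not the actual equality_variables helper) ---
# A toy version of the rewrite applied above when the same variable appears
# twice in a triple pattern: ?s rdf:type ?s becomes a scan over
# (?s rdf:type ?s_2) plus an equality filter ?s = ?s_2. The real helper also
# handles other variable positions; this sketch only covers subject == object.
def toy_equality_variables(subject, predicate, obj):
    if subject.startswith('?') and subject == obj:
        fresh = subject + '_2'
        return "{} = {}".format(subject, fresh), (subject, predicate, fresh)
    return None, (subject, predicate, obj)

expr, rewritten = toy_equality_variables('?s', 'rdf:type', '?s')
assert expr == '?s = ?s_2'
assert rewritten == ('?s', 'rdf:type', '?s_2')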