Example #1
async def test_filter_iterator_interrupt():
    expression = "?p = <http://schema.org/eligibleRegion>"
    iterator, card = hdtDoc.search(triple['subject'], triple['predicate'],
                                   triple['object'])
    scan = ProjectionIterator(ScanIterator(iterator, triple, card))
    iterator = FilterIterator(scan, expression)
    (results, saved, done, _) = await engine.execute(iterator, 10e-7, 2)
    assert len(results) <= 4
    for b in results:
        assert b['?p'] == 'http://schema.org/eligibleRegion'
        assert b['?o'] in [
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country0',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country1',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country4',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country9'
        ]
    tmp = len(results)
    reloaded = load(saved.SerializeToString(),
                    DummyDataset(hdtDoc, 'watdiv100'))
    (results, saved, done, _) = await engine.execute(reloaded, 10e7)
    assert len(results) + tmp == 4
    for b in results:
        assert b['?p'] == 'http://schema.org/eligibleRegion'
        assert b['?o'] in [
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country0',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country1',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country4',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country9'
        ]
    assert done
Example #2
async def test_filter_iterator_interrupt():
    context = { 'quantum': 10e-7, 'max_results': 10e7 }
    expression = "?p = <http://schema.org/eligibleRegion>"
    scan = ProjectionIterator(ScanIterator(hdtDoc, triple, context), context)
    iterator = FilterIterator(scan, expression, context)
    (results, saved, done, _) = await engine.execute(iterator, context)
    assert len(results) <= 4
    for b in results:
        assert b['?p'] == 'http://schema.org/eligibleRegion'
        assert b['?o'] in [
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country0',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country1',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country4',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country9'
        ]
    tmp = len(results)
    context['quantum'] = 10e7
    reloaded = load(saved.SerializeToString(), DummyDataset(hdtDoc, 'watdiv100'), context)
    (results, saved, done, _) = await engine.execute(reloaded, context)
    assert len(results) + tmp == 4
    for b in results:
        assert b['?p'] == 'http://schema.org/eligibleRegion'
        assert b['?o'] in [
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country0',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country1',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country4',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country9'
        ]
    assert done
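The two interrupt tests above exercise the same suspend/resume cycle: execute under a small time quantum, serialize the saved plan, reload it, and continue until the engine reports completion. A minimal sketch of driving that cycle to the end, assuming the same `engine`, `load`, `hdtDoc` and `DummyDataset` fixtures used by these tests (the helper name `drain` is illustrative):

async def drain(iterator, context):
    """Run a preemptable iterator to completion, resuming from saved plans."""
    all_results = []
    done = False
    while not done:
        results, saved, done, _ = await engine.execute(iterator, context)
        all_results.extend(results)
        if not done:
            # reload the serialized plan and resume where execution stopped
            iterator = load(saved.SerializeToString(),
                            DummyDataset(hdtDoc, 'watdiv100'), context)
    return all_results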
Example #3
async def test_operation_filter_iterator():
    context = { 'quantum': 10e7, 'max_results': 10e7 }
    expression = "10 = 5 * 2"
    scan = ProjectionIterator(ScanIterator(hdtDoc, triple, context), context)
    iterator = FilterIterator(scan, expression, context)
    (results, saved, done, _) = await engine.execute(iterator, context)
    assert len(results) == 9
Example #4
def build_query_plan(query, dataset, default_graph, saved_plan=None):
    """Build a pipeline of iterators used to evaluate a query"""
    cardinalities = []
    if saved_plan is not None:
        return load(saved_plan, dataset), []

    root = None
    if query['type'] == 'union':
        root, cardinalities = build_union_plan(query['union'], dataset,
                                               default_graph)
    elif query['type'] == 'bgp':
        root, cardinalities = build_join_plan(query['bgp'], dataset,
                                              default_graph)
    else:
        raise Exception('Unknown query type found during query optimization')

    # apply (possible) filter clause(s)
    if 'filters' in query and len(query['filters']) > 0:
        # exclude empty strings
        filters = list(filter(lambda x: len(x) > 0, query['filters']))
        if len(filters) > 0:
            # reduce all filters in a conjunctive expression
            expression = reduce(lambda x, y: "({}) && ({})".format(x, y),
                                filters)
            root = FilterIterator(root, expression)
    return root, cardinalities
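For reference, the reduction at the end of build_query_plan folds all FILTER expressions into a single conjunction; a standalone sketch of that step with illustrative expressions:

from functools import reduce

filters = ["?x > 5", "?y < 10", "?x != ?y"]
expression = reduce(lambda x, y: "({}) && ({})".format(x, y), filters)
# expression == "((?x > 5) && (?y < 10)) && (?x != ?y)"
# the resulting string is then passed to FilterIterator(root, expression)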
Example #5
async def test_function_filter_iterator():
    expression = '?p = <http://purl.org/goodrelations/price> && isLiteral(?o) && !isNumeric(?o)'
    iterator, card = hdtDoc.search(triple['subject'], triple['predicate'],
                                   triple['object'])
    scan = ProjectionIterator(ScanIterator(iterator, triple, card))
    iterator = FilterIterator(scan, expression)
    (results, saved, done, _) = await engine.execute(iterator, math.inf)
    assert len(results) == 1
Example #6
async def test_operation_filter_iterator():
    expression = "10 = 5 * 2"
    iterator, card = hdtDoc.search(triple['subject'], triple['predicate'],
                                   triple['object'])
    scan = ProjectionIterator(ScanIterator(iterator, triple, card))
    iterator = FilterIterator(scan, expression)
    (results, saved, done, _) = await engine.execute(iterator, math.inf)
    assert len(results) == 9
Example #7
def load_filter(saved_plan, dataset):
    """Load a FilterIterator from a protobuf serialization"""
    sourceField = saved_plan.WhichOneof('source')
    source = load(getattr(saved_plan, sourceField), dataset)
    mu = None
    if len(saved_plan.mu) > 0:
        mu = saved_plan.mu
    return FilterIterator(source, saved_plan.expression, mu=mu)
Example #8
async def test_and_or_filter_iterator():
    context = { 'quantum': 10e7, 'max_results': 10e7 }
    expression = "?p = <http://schema.org/eligibleRegion> && (?o = <http://db.uwaterloo.ca/~galuc/wsdbm/Country0> || ?o = <http://db.uwaterloo.ca/~galuc/wsdbm/Country9>)"
    scan = ProjectionIterator(ScanIterator(hdtDoc, triple, context), context)
    iterator = FilterIterator(scan, expression, context)
    (results, saved, done, _) = await engine.execute(iterator, context)
    assert len(results) == 2
    for b in results:
        assert b['?p'] == 'http://schema.org/eligibleRegion'
        assert b['?o'] in [
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country0',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country9'
        ]
Example #9
async def test_and_or_filter_iterator():
    expression = "?p = <http://schema.org/eligibleRegion> && (?o = <http://db.uwaterloo.ca/~galuc/wsdbm/Country0> || ?o = <http://db.uwaterloo.ca/~galuc/wsdbm/Country9>)"
    iterator, card = hdtDoc.search(triple['subject'], triple['predicate'],
                                   triple['object'])
    scan = ProjectionIterator(ScanIterator(iterator, triple, card))
    iterator = FilterIterator(scan, expression)
    (results, saved, done, _) = await engine.execute(iterator, math.inf)
    assert len(results) == 2
    for b in results:
        assert b['?p'] == 'http://schema.org/eligibleRegion'
        assert b['?o'] in [
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country0',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country9'
        ]
Example #10
def load_filter(saved_plan: SavedFilterIterator, dataset: Dataset, context: dict) -> PreemptableIterator:
    """Load a FilterIterator from a protobuf serialization.

    Args:
      * saved_plan: Saved query execution plan.
      * dataset: RDF dataset used to execute the plan.
      * context: Information about the query execution.

    Returns:
      The pipeline of iterators used to continue query execution.
    """
    sourceField = saved_plan.WhichOneof('source')
    source = load(getattr(saved_plan, sourceField), dataset, context)
    return FilterIterator(source, saved_plan.expression, context)
Example #11
def load_filter(saved_plan: SavedFilterIterator, dataset: Dataset) -> PreemptableIterator:
    """Load a FilterIterator from a protobuf serialization.

    Args:
      * saved_plan: Saved query execution plan.
      * dataset: RDF dataset used to execute the plan.

    Returns:
      The pipeline of iterators used to continue query execution.
    """
    sourceField = saved_plan.WhichOneof('source')
    source = load(getattr(saved_plan, sourceField), dataset)
    mu = None
    if len(saved_plan.mu) > 0:
        mu = saved_plan.mu
    return FilterIterator(source, saved_plan.expression, mu=mu)
Example #12
def build_left_join_tree(
    bgp: List[Dict[str, str]],
    dataset: Dataset,
    default_graph: str,
    as_of: Optional[datetime] = None
) -> Tuple[PreemptableIterator, List[str], Dict[str, str]]:
    """Build a Left-linear join tree from a Basic Graph pattern.

    Args:
      * bgp: Basic Graph pattern used to build the join tree.
      * dataset: RDF dataset on which the BGP is evaluated.
      * default_graph: URI of the default graph used for BGP evaluation.
      * as_of: A timestamp used to perform all reads against a consistent version of the dataset. If `None`, use the latest version of the dataset, which does not guarantee snapshot isolation.

    Returns: A tuple (`iterator`, `query_vars`, `cardinalities`) where:
      * `iterator` is the root of the Left-linear join tree.
      * `query_vars` is the list of all SPARQL variables found in the BGP.
      * `cardinalities` is the list of estimated cardinalities of all triple patterns in the BGP.
    """
    # gather metadata about triple patterns
    triples = []
    cardinalities = []

    # uncommon, but happens with queries like INSERT ... WHERE { BIND ... }
    if len(bgp) == 0:
        return EmptyIterator(), [], []

    # analyze each triple pattern in the BGP
    for triple in bgp:
        # select the graph used to evaluate the pattern
        graph_uri = triple['graph'] if 'graph' in triple and len(
            triple['graph']) > 0 else default_graph
        triple['graph'] = graph_uri
        # get iterator and statistics about the pattern
        if dataset.has_graph(graph_uri):
            it, c = dataset.get_graph(graph_uri).search(triple['subject'],
                                                        triple['predicate'],
                                                        triple['object'],
                                                        as_of=as_of)
        else:
            it, c = EmptyIterator(), 0
        triples += [{'triple': triple, 'cardinality': c, 'iterator': it}]
        cardinalities += [{'triple': triple, 'cardinality': c}]

    # sort triples by ascending cardinality
    triples = sorted(triples, key=lambda v: v['cardinality'])

    # start the pipeline with the Scan with the most selective pattern
    pattern = triples.pop(0)
    query_vars = get_vars(pattern['triple'])

    # add an equality filter if the pattern uses the same variable several times
    # example: ?s rdf:type ?s => Filter(Scan(?s rdf:type ?s_2), ?s == ?s_2)
    eq_expr, new_pattern = equality_variables(pattern['triple']['subject'],
                                              pattern['triple']['predicate'],
                                              pattern['triple']['object'])
    if eq_expr is not None:
        # copy pattern with rewritten values
        triple = pattern['triple'].copy()
        triple["subject"] = new_pattern[0]
        triple["predicate"] = new_pattern[1]
        triple["object"] = new_pattern[2]
        # build a pipeline with an Index Scan + Equality filter
        pipeline = ScanIterator(pattern['iterator'], triple,
                                pattern['cardinality'])
        pipeline = FilterIterator(pipeline, eq_expr)
        # update query variables
        query_vars = query_vars | get_vars(triple)
    else:
        pipeline = ScanIterator(pattern['iterator'], pattern['triple'],
                                pattern['cardinality'])

    # build the left linear tree of joins
    while len(triples) > 0:
        pattern, pos, query_vars = find_connected_pattern(query_vars, triples)
        # no connected pattern = disconnected BGP => pick the first remaining pattern in the BGP
        if pattern is None:
            pattern = triples[0]
            query_vars = query_vars | get_vars(pattern['triple'])
            pos = 0
        graph_uri = pattern['triple']['graph']
        pipeline = IndexJoinIterator(pipeline,
                                     pattern['triple'],
                                     dataset.get_graph(graph_uri),
                                     as_of=as_of)
        triples.pop(pos)
    return pipeline, query_vars, cardinalities
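To make the shape of the resulting plan concrete, here is a sketch for a connected BGP of three patterns t1, t2, t3 already sorted by ascending cardinality; it_t1/card_t1 stand for the result of graph.search(...) and g for dataset.get_graph(...), all placeholder names (as_of omitted):

# left-deep pipeline produced by build_left_join_tree for {t1, t2, t3}:
pipeline = ScanIterator(it_t1, t1, card_t1)    # scan the most selective pattern
pipeline = IndexJoinIterator(pipeline, t2, g)  # join with the next connected pattern
pipeline = IndexJoinIterator(pipeline, t3, g)  # nest joins left-deep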
Example #13
def parse_query_node(node,
                     dataset,
                     current_graphs,
                     server_url,
                     cardinalities,
                     renaming_map=None):
    """
        Recursively parse node in the query logical plan to build a preemptable physical query execution plan.

        Args:
            * node - Node of the logical plan to parse (in rdflib format)
            * dataset - RDF dataset used to execute the query
            * current_graphs - List of IRI of the current RDF graph queried
            * server_url - URL of the SaGe server
            * cardinalities - Map<triple,integer> used to track triple patterns cardinalities
    """
    if node.name == 'SelectQuery':
        # in case of a FROM clause, set the new default graphs used
        graphs = current_graphs
        if node.datasetClause is not None:
            graphs = [
                format_graph_uri(format_term(graph_iri.default), server_url)
                for graph_iri in node.datasetClause
            ]
        return parse_query_node(node.p, dataset, graphs, server_url,
                                cardinalities)
    elif node.name == 'Project':
        query_vars = list(map(lambda t: t.n3(), node.PV))
        if node.p.name == 'AggregateJoin' or node.p.name == 'Extend':
            # forward projection variables, as we need them for parsing an AggregateJoin
            node.p['PV'] = query_vars
            return parse_query_node(node.p, dataset, current_graphs,
                                    server_url, cardinalities)
        child = parse_query_node(node.p, dataset, current_graphs, server_url,
                                 cardinalities)
        return ProjectionIterator(child, dataset, current_graphs[0],
                                  query_vars)
    elif node.name == 'BGP':
        # bgp_vars = node._vars
        triples = list(localize_triple(node.triples, current_graphs))
        # format triple patterns for the backend API
        patterns = []
        for triple in triples:
            graph_uri = triple[
                'graph'] if 'graph' in triple else current_graphs[0]
            graph = dataset.get_graph(graph_uri)
            patterns.append({
                'subject':
                triple['subject'] if triple['subject'].startswith('?') else
                graph.get_identifiant(triple['subject']),
                'predicate':
                triple['predicate'] if triple['predicate'].startswith('?') else
                graph.get_identifiant(triple['predicate']),
                'object':
                triple['object'] if triple['object'].startswith('?') else
                graph.get_identifiant(triple['object']),
                'graph':
                graph_uri
            })
        iterator, query_vars, c = build_left_plan(patterns, dataset,
                                                  current_graphs)
        # track cardinalities of every triple pattern
        cardinalities += c
        return iterator
    elif node.name == 'Union':
        left = parse_query_node(node.p1, dataset, current_graphs, server_url,
                                cardinalities)
        right = parse_query_node(node.p2, dataset, current_graphs, server_url,
                                 cardinalities)
        return BagUnionIterator(left, right)
    elif node.name == 'Filter':
        expression = parse_filter_expr(node.expr)
        iterator = parse_query_node(node.p, dataset, current_graphs,
                                    server_url, cardinalities)
        return FilterIterator(iterator, expression)
    elif node.name == 'Join':
        # only allow for joining BGPs from different GRAPH clauses
        triples = fetch_graph_triples(node.p1, current_graphs, server_url)
        triples += fetch_graph_triples(node.p2, current_graphs, server_url)
        iterator, query_vars, c = build_left_plan(triples, dataset,
                                                  current_graphs)
        # track cardinalities of every triple pattern
        cardinalities += c
        return iterator
    elif node.name == 'Extend':
        # remove all extend operators, as they are not needed
        current = node
        renaming = dict()
        while current.name == 'Extend':
            renaming[current.expr.n3()] = current.var.n3()
            current = current.p
        current['PV'] = node['PV']
        return parse_query_node(current,
                                dataset,
                                current_graphs,
                                server_url,
                                cardinalities,
                                renaming_map=renaming)
    elif node.name == 'AggregateJoin':
        groupby_variables = list()
        # build GROUP BY variables
        last_groupby_var = None
        if node.p.expr is None:  # case 1: no explicit group BY, so we group by all variables in the query
            last_groupby_var = list(node.p._vars)[0]
            # for variable in node.p._vars:
            #     groupby_variables.append(variable.n3())
            #     last_groupby_var = variable
        else:  # case 2: there is an explicit group by
            for variable in node.p.expr:
                groupby_variables.append(variable.n3())
                last_groupby_var = variable
        # build aggregators for evaluating SPARQL aggregations (if any)
        aggregators = list()
        for agg in node.A:
            if agg.vars == '*':
                agg.vars = last_groupby_var
            if agg.name != 'Aggregate_Sample':
                aggregators.append(build_aggregator(dataset, agg,
                                                    renaming_map))
        # build source iterator from child node
        source = parse_query_node(node.p.p, dataset, current_graphs,
                                  server_url, cardinalities)
        # add the GROUP BY operator (with aggregators) to the pipeline
        source = GroupByAggregator(source,
                                   groupby_variables,
                                   aggregators=aggregators,
                                   max_size=dataset.max_group_by_size)
        # add the projection to the pipeline, depending on the context
        return AggregatesProjectionIterator(source, dataset, current_graphs[0],
                                            node.PV)
    else:
        raise UnsupportedSPARQL("Unsupported SPARQL feature: {}".format(
            node.name))
Example #14
def parse_query_alt(node: dict,
                    dataset: Dataset,
                    current_graphs: List[str],
                    cardinalities: dict,
                    as_of: Optional[datetime] = None) -> PreemptableIterator:
    """Recursively parse node in the query logical plan to build a preemptable physical query execution plan.

    Args:
      * node: Node of the logical plan to parse (in rdflib format).
      * dataset: RDF dataset used to execute the query.
      * current_graphs: List of IRIs of the current RDF graphs queried.
      * cardinalities: A dict used to track triple patterns cardinalities.
      * as_of: A timestamp used to perform all reads against a consistent version of the dataset. If `None`, use the latest version of the dataset, which does not guarantee snapshot isolation.

    Returns: An iterator used to evaluate the input node.

    Throws: `UnsupportedSPARQL` if the SPARQL query contains features not supported by the SaGe query engine.
    """
    if node.name == 'SelectQuery':
        # in case of a FROM clause, set the new default graphs used
        graphs = current_graphs
        if node.datasetClause is not None:
            graphs = [
                format_term(graph_iri.default)
                for graph_iri in node.datasetClause
            ]
        return parse_query_alt(node.p,
                               dataset,
                               graphs,
                               cardinalities,
                               as_of=as_of)
    elif node.name == 'ConstructQuery':
        graphs = current_graphs
        if node.datasetClause is not None:
            graphs = [
                format_term(graph_iri.default)
                for graph_iri in node.datasetClause
            ]
        child = parse_query_alt(node.p,
                                dataset,
                                graphs,
                                cardinalities,
                                as_of=as_of)
        return ConstructIterator(child,
                                 convert_construct_template(node.template))
    elif node.name == 'Reduced':
        child = parse_query_alt(node.p,
                                dataset,
                                current_graphs,
                                cardinalities,
                                as_of=as_of)
        return ReducedIterator(child)
    elif node.name == 'Project':
        query_vars = list(map(lambda t: '?' + str(t), node.PV))
        child = parse_query_alt(node.p,
                                dataset,
                                current_graphs,
                                cardinalities,
                                as_of=as_of)
        return ProjectionIterator(child, query_vars)
    elif node.name == 'BGP':
        # bgp_vars = node._vars
        triples = list(localize_triples(node.triples, current_graphs))
        iterator, query_vars, c = build_left_join_tree(triples,
                                                       dataset,
                                                       current_graphs,
                                                       as_of=as_of)
        # track cardinalities of every triple pattern
        cardinalities += c
        return iterator
    elif node.name == 'Union':
        left = parse_query_alt(node.p1,
                               dataset,
                               current_graphs,
                               cardinalities,
                               as_of=as_of)
        right = parse_query_alt(node.p2,
                                dataset,
                                current_graphs,
                                cardinalities,
                                as_of=as_of)
        return BagUnionIterator(left, right)
    elif node.name == 'Filter':
        expression = parse_filter_expr(node.expr)
        iterator = parse_query_alt(node.p,
                                   dataset,
                                   current_graphs,
                                   cardinalities,
                                   as_of=as_of)
        return FilterIterator(iterator, expression)
    elif node.name == 'Extend':
        bgp_iterator = parse_query_alt(node.p,
                                       dataset,
                                       current_graphs,
                                       cardinalities,
                                       as_of=as_of)
        expression = parse_bind_expr(node.expr)
        #print("expression:"+str(expression))
        if isinstance(bgp_iterator, EmptyIterator):
            return BindIterator(None, expression, '?' + node.var)
        else:
            return BindIterator(bgp_iterator, expression, '?' + node.var)
    elif node.name == 'Join':
        left = parse_query_alt(node.p1,
                               dataset,
                               current_graphs,
                               cardinalities,
                               as_of=as_of)
        if node.p2.name == 'BGP':
            triples = list(localize_triples(node.p2.triples, current_graphs))
            variables = set(map(lambda t: t.n3(), node.p1._vars))
            #print("Join P1 _vars"+str(variables))
            iterator, query_vars, c = continue_left_join_tree(
                left, variables, triples, dataset, current_graphs)
            cardinalities += c
            return iterator
        else:
            raise UnsupportedSPARQL(
                f"Join Unsupported SPARQL feature: {node.p2.name}")
    else:
        raise UnsupportedSPARQL(f"Unsupported SPARQL feature: {node.name}")
Example #15
def parse_query_node(node: dict,
                     dataset: Dataset,
                     current_graphs: List[str],
                     cardinalities: dict,
                     as_of: Optional[datetime] = None) -> PreemptableIterator:
    """Recursively parse node in the query logical plan to build a preemptable physical query execution plan.

    Args:
      * node: Node of the logical plan to parse (in rdflib format).
      * dataset: RDF dataset used to execute the query.
      * current_graphs: List of IRIs of the current RDF graphs queried.
      * cardinalities: A dict used to track triple patterns cardinalities.
      * as_of: A timestamp used to perform all reads against a consistent version of the dataset. If `None`, use the latest version of the dataset, which does not guarantee snapshot isolation.

    Returns: An iterator used to evaluate the input node.

    Throws: `UnsupportedSPARQL` if the SPARQL query contains features not supported by the SaGe query engine.
    """
    if node.name == 'SelectQuery':
        # in case of a FROM clause, set the new default graphs used
        graphs = current_graphs
        if node.datasetClause is not None:
            graphs = [
                format_term(graph_iri.default)
                for graph_iri in node.datasetClause
            ]
        return parse_query_node(node.p,
                                dataset,
                                graphs,
                                cardinalities,
                                as_of=as_of)
    elif node.name == 'Project':
        query_vars = list(map(lambda t: '?' + str(t), node.PV))
        child = parse_query_node(node.p,
                                 dataset,
                                 current_graphs,
                                 cardinalities,
                                 as_of=as_of)
        return ProjectionIterator(child, query_vars)
    elif node.name == 'BGP':
        # bgp_vars = node._vars
        triples = list(localize_triples(node.triples, current_graphs))
        iterator, query_vars, c = build_left_join_tree(triples,
                                                       dataset,
                                                       current_graphs,
                                                       as_of=as_of)
        # track cardinalities of every triple pattern
        cardinalities += c
        return iterator
    elif node.name == 'Union':
        left = parse_query_node(node.p1,
                                dataset,
                                current_graphs,
                                cardinalities,
                                as_of=as_of)
        right = parse_query_node(node.p2,
                                 dataset,
                                 current_graphs,
                                 cardinalities,
                                 as_of=as_of)
        return BagUnionIterator(left, right)
    elif node.name == 'Filter':
        expression = parse_filter_expr(node.expr)
        iterator = parse_query_node(node.p,
                                    dataset,
                                    current_graphs,
                                    cardinalities,
                                    as_of=as_of)
        return FilterIterator(iterator, expression)
    elif node.name == 'Join':
        # only allow for joining BGPs from different GRAPH clauses
        triples = get_triples_from_graph(
            node.p1, current_graphs) + get_triples_from_graph(
                node.p2, current_graphs)
        iterator, query_vars, c = build_left_join_tree(triples, dataset,
                                                       current_graphs)
        # track cardinalities of every triple pattern
        cardinalities += c
        return iterator
    else:
        raise UnsupportedSPARQL(f"Unsupported SPARQL feature: {node.name}")
Example #16
def build_left_plan(bgp, dataset, default_graph):
    """Build a Left-linear tree of joins from a BGP"""
    # gather metadata about triple patterns
    triples = []
    cardinalities = []

    # analyze each triple pattern in the BGP
    for triple in bgp:
        # select the graph used to evaluate the pattern
        graph_uri = triple['graph'] if 'graph' in triple and len(
            triple['graph']) > 0 else default_graph
        triple['graph'] = graph_uri
        # get iterator and statistics about the pattern
        if dataset.has_graph(graph_uri):
            it, c = dataset.get_graph(graph_uri).search(
                triple['subject'], triple['predicate'], triple['object'])
        else:
            it, c = EmptyIterator(), 0
        triples += [{'triple': triple, 'cardinality': c, 'iterator': it}]
        cardinalities += [{'triple': triple, 'cardinality': c}]

    # sort triples by ascending cardinality
    triples = sorted(triples, key=lambda v: v['cardinality'])

    # start the pipeline with the Scan with the most selective pattern
    pattern = triples.pop(0)
    query_vars = get_vars(pattern['triple'])

    # add an equality filter if the pattern uses the same variable several times
    # example: ?s rdf:type ?s => Filter(Scan(?s rdf:type ?s_2), ?s == ?s_2)
    eq_expr, new_pattern = equality_variables(pattern['triple']['subject'],
                                              pattern['triple']['predicate'],
                                              pattern['triple']['object'])
    if eq_expr is not None:
        # copy pattern with rewritten values
        triple = pattern['triple'].copy()
        triple["subject"] = new_pattern[0]
        triple["predicate"] = new_pattern[1]
        triple["object"] = new_pattern[2]
        # build a pipeline with an Index Scan + Equality filter
        pipeline = ScanIterator(pattern['iterator'], triple,
                                pattern['cardinality'])
        pipeline = FilterIterator(pipeline, eq_expr)
        # update query variables
        query_vars = query_vars | get_vars(triple)
    else:
        pipeline = ScanIterator(pattern['iterator'], pattern['triple'],
                                pattern['cardinality'])

    # build the left linear tree of joins
    while len(triples) > 0:
        pattern, pos, query_vars = find_connected_pattern(query_vars, triples)
        # no connected pattern = disconnected BGP => pick the first remaining pattern in the BGP
        if pattern is None:
            pattern = triples[0]
            query_vars = query_vars | get_vars(pattern['triple'])
            pos = 0
        graph_uri = pattern['triple']['graph']
        pipeline = IndexJoinIterator(pipeline, pattern['triple'],
                                     dataset.get_graph(graph_uri))
        triples.pop(pos)
    return pipeline, query_vars, cardinalities
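A usage sketch of build_left_plan, assuming a `dataset` object exposing has_graph/get_graph as used above; the graph URI and triple patterns are illustrative:

bgp = [
    {'subject': '?s', 'predicate': 'http://schema.org/eligibleRegion', 'object': '?o'},
    {'subject': '?s', 'predicate': 'http://purl.org/goodrelations/price', 'object': '?price'}
]
pipeline, query_vars, cardinalities = build_left_plan(
    bgp, dataset, 'http://example.org#default-graph')
# query_vars should contain ?s, ?o and ?price;
# cardinalities holds one entry per triple pattern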