def test_graph_pattern_canonicalization():
    """Check that canonicalizing a pattern keeps its number of triples."""
    # Regression check for an upstream library bug:
    # rdflib.compare.to_canonical_graph(g) sometimes collapses distinct bnodes,
    # see https://github.com/RDFLib/rdflib/issues/494
    # The triples below trigger that problem. According to the original note,
    # canonicalize currently hands back the pattern itself in that case, so
    # only the length is compared here (stays valid if rdflib gets fixed).
    collapsing_triples = (
        (SOURCE_VAR, Variable('vcb0'), TARGET_VAR),
        (SOURCE_VAR, Variable('vrBYUk8'), TARGET_VAR),
        (TARGET_VAR, Variable('vrBYUk8'), SOURCE_VAR),
        (TARGET_VAR, Variable('vrvGapn'), SOURCE_VAR),
    )
    pattern = GraphPattern(collapsing_triples)
    canonical = canonicalize(pattern)
    assert len(pattern) == len(canonical)

    # Regression check for an earlier bug inside canonicalization itself
    # (original note: "when it didn't rewrite fixed").
    chained_triples = (
        (TARGET_VAR, Variable('v0'), SOURCE_VAR),
        (TARGET_VAR, Variable('v0'), Variable('v1')),
        (TARGET_VAR, Variable('v2'), Variable('v1')),
        (TARGET_VAR, Variable('v2'), Variable('v3')),
        (TARGET_VAR, Variable('v4'), Variable('v5')),
    )
    pattern = GraphPattern(chained_triples)
    canonical = canonicalize(pattern)
    assert len(pattern) == len(canonical)
def test_graph_pattern_canonicalization():
    """Canonicalization must preserve the length of a graph pattern."""

    def _assert_length_preserved(triples):
        # Build the pattern, canonicalize it and compare the triple counts.
        original = GraphPattern(triples)
        canonical = canonicalize(original)
        assert len(original) == len(canonical)

    # Case 1: guards against a bug in the underlying library, where
    # rdflib.compare.to_canonical_graph(g) sometimes collapses distinct bnodes
    # (https://github.com/RDFLib/rdflib/issues/494). These triples provoke it.
    # Per the original note, gp itself is currently returned instead of a
    # canonical representation, hence only the length is tested in case the
    # problem gets fixed in rdflib.
    _assert_length_preserved((
        (SOURCE_VAR, Variable('vcb0'), TARGET_VAR),
        (SOURCE_VAR, Variable('vrBYUk8'), TARGET_VAR),
        (TARGET_VAR, Variable('vrBYUk8'), SOURCE_VAR),
        (TARGET_VAR, Variable('vrvGapn'), SOURCE_VAR),
    ))

    # Case 2: guards against a former bug in canonicalization
    # (original note: "when it didn't rewrite fixed").
    _assert_length_preserved((
        (TARGET_VAR, Variable('v0'), SOURCE_VAR),
        (TARGET_VAR, Variable('v0'), Variable('v1')),
        (TARGET_VAR, Variable('v2'), Variable('v1')),
        (TARGET_VAR, Variable('v2'), Variable('v3')),
        (TARGET_VAR, Variable('v4'), Variable('v5')),
    ))
示例#3
0
def main():
    """Print every generated pattern of length 3 and sanity-check edge flips.

    All items produced by pattern_generator are printed. The patterns of all
    items but the last one are then collected, and for each of them every
    edge is flipped: if the flipped pattern still has `length` triples, its
    canonical form has to be one of the collected patterns.
    """
    length = 3
    generated = list(pattern_generator(length))
    for position, (pattern_id, pattern) in enumerate(generated):
        print('%d: Pattern id %d: %s' % (position, pattern_id, pattern))
    # the last generated item is deliberately left out (slice [:-1])
    patterns = {gp for _, gp in generated[:-1]}

    # testing flipped edges
    for gp in patterns:
        for edge_index in range(length):
            flipped = gp.flip_edge(edge_index)
            if len(flipped) != length:
                # can happen that flipped edge was there already
                continue
            assert canonicalize(flipped) in patterns
示例#4
0
def main():
    """List the patterns of length 3 and verify them under edge flipping.

    Prints each (id, pattern) pair coming out of pattern_generator, gathers
    the patterns of all pairs except the final one into a set and asserts
    that the canonical form of every length-preserving single edge flip is
    contained in that set again.
    """
    length = 3
    results = list(pattern_generator(length))
    for n, result in enumerate(results):
        pid, pattern = result
        print('%d: Pattern id %d: %s' % (n, pid, pattern))
    patterns = set()
    for _pid, gp in results[:-1]:
        patterns.add(gp)

    # testing flipped edges: lazily walk over every (pattern, edge) flip
    flipped_patterns = (
        gp.flip_edge(edge) for gp in patterns for edge in range(length)
    )
    for mod_gp in flipped_patterns:
        # can happen that flipped edge was there already (pattern got shorter)
        if len(mod_gp) == length:
            assert canonicalize(mod_gp) in patterns
示例#5
0
def pattern_generator(length, loops=True, exclude_isomorphic=True):
    """Enumerate graph patterns consisting of `length` triples.

    Candidate triples are all (s, p, o) combinations over the nodes
    n0..n<length-2>, SOURCE_VAR and TARGET_VAR and the edges
    e0..e<length-1>. Each `length`-sized combination of them is wrapped in a
    GraphPattern and filtered; the survivors are yielded.

    :param length: number of triples per pattern.
    :param loops: if false, patterns containing a triple with s == o are
        dropped.
    :param exclude_isomorphic: if true, each pattern is canonicalized, only
        the first pattern per canonical form is kept and the canonical form
        is what gets yielded.
    :return: generator of (pid, pattern) tuples, where pid is the index of
        the underlying combination, terminated by (n_patterns, None).
    """
    first_seen = {}  # canonical pattern -> (pid, original pattern)
    var_nodes = [Variable('n%d' % k) for k in range(length - 1)]
    all_nodes = var_nodes + [SOURCE_VAR, TARGET_VAR]
    all_edges = [Variable('e%d' % k) for k in range(length)]

    triples = [
        (s, p, o)
        for s in all_nodes
        for p in all_edges
        for o in all_nodes
    ]

    n_patterns = binom(len(triples), length)
    logger.info(
        'generating %d possible patterns of length %d', n_patterns, length)

    n_kept = 0
    pid = 0
    for pid, combo in enumerate(combinations(triples, length)):
        gp = GraphPattern(combo)

        # source and target both have to occur in the pattern
        if not gp.complete():
            logger.debug('excluded %d: source or target missing: %s', pid, gp)
            continue
        used_nodes = sorted(gp.nodes - {SOURCE_VAR, TARGET_VAR})
        used_edges = sorted(gp.edges)

        # variables must be used without gaps, e.g. no n2 unless n1 is there
        if used_nodes != var_nodes[:len(used_nodes)]:
            logger.debug('excluded %d: skipped node: %s', pid, gp)
            continue
        if used_edges != all_edges[:len(used_edges)]:
            logger.debug('excluded %d: skipped edge: %s', pid, gp)
            continue

        # optionally reject triples whose subject equals their object
        if not loops and any(s == o for s, _, o in gp):
            logger.debug('excluded %d: loop: %s', pid, gp)
            continue

        # the pattern has to be connected
        if not gp.is_connected():
            logger.debug('excluded %d: not connected:\n%s', pid, gp)
            continue

        # drop patterns isomorphic to one that was generated before
        if exclude_isomorphic:
            cgp = canonicalize(gp)
            earlier = first_seen.get(cgp)
            if earlier is not None:
                logger.debug('excluded %d: isomorphic to %d:\n%sand\n%s', pid,
                             earlier[0], gp, earlier[1])
                continue
            first_seen[cgp] = (pid, gp)
            gp = cgp
        n_kept += 1
        logger.debug('generated pattern %d: %s', pid, gp)
        yield pid, gp
    # every combination must have been visited
    assert pid + 1 == n_patterns
    logger.info('found %d differing patterns out of %d possible of length %d',
                n_kept, n_patterns, length)
    yield (n_patterns, None)
示例#6
0
def main(length=4):
    """Enumerate patterns of `length` triples, dump them and self-check them.

    The patterns produced by `patterns(...)` are printed, collected in a set
    and written via `_jsonify` to
    data/enumerated_patterns_len<length>.jsonl.gz. When scoop is running the
    canonicalization is done in parallel chunks and de-duplicated here,
    otherwise it is left to the generator itself. Afterwards every
    length-preserving single edge flip of every pattern is canonicalized and
    asserted to be among the collected patterns.

    The output file is opened in a `with` block so that the gzip stream is
    always flushed and closed, also if an assertion fails midway.

    :param length: number of triples per pattern.
    """
    # len | pcon | nej | all          | candidates (all)  | candidates (all)  |
    #     |      |     | (canonical)  | (old method)      | (numerical)       |
    # ----+------+-----+--------------+-------------------+-------------------+
    #   1 |    8 |  12 |           12 |                27 |                12 |
    #   2 |  146 | 469 |          693 |              7750 |              1314 |
    #   3 |      |     |        47478 |           6666891 |            151534 |
    #   4 |      |     |              |       11671285626 |          20884300 |
    #   5 |      |     |              |    34549552710596 |        3461471628 |

    # len | typical     | candidates     | candidates  |
    #     | (canonical) | (old method)   | (numerical) |
    # ----+-------------+----------------+-------------+
    #   1 |           2 |              4 |           2 |
    #   2 |          28 |            153 |          54 |
    #   3 |         486 |          17296 |        1614 |
    #   4 |       10374 |        3921225 |       59654 |
    #   5 |             |     1488847536 |     2707960 |

    # typical above means none of (loops, nej, pcon, source_target_edges)

    canonical = True

    _patterns = set()
    n = -1
    i = 0

    pg = patterns(
        length,
        loops=False,
        node_edge_joint=False,
        p_only_connected=False,
        source_target_edges=False,
        exclude_isomorphic=canonical and not scoop.IS_RUNNING,
        count_candidates_only=False,
    )
    out_path = path.join(
        'data', 'enumerated_patterns_len%d.jsonl.gz' % length)

    # the context manager guarantees the gzip trailer is written on exit
    with gzip.open(out_path, 'w') as f:
        if canonical and scoop.IS_RUNNING:
            # Graph pattern isomorphism checking is what takes by far the
            # longest, so run canonicalization in parallel.
            # Chunks are used for efficiency and to hinder parallel_map from
            # trying to eat up all candidates first.
            for chunk in chunker(pg, 100000):
                cgps = parallel_map(
                    lambda res: (
                        res[0], canonicalize(res[1]) if res[1] else None),
                    chunk
                )
                for i, pattern in cgps:
                    if pattern not in _patterns:
                        n += 1
                        print('%d: Pattern id %d: %s' % (n, i, pattern))
                        assert pattern is None or len(pattern) == length, \
                            'pattern too short: %s' % (pattern,)
                        _patterns.add(pattern)
                        f.write(_jsonify(pattern))
        else:
            # run potential canonicalization inline
            for n, (i, pattern) in enumerate(pg):
                print('%d: Pattern id %d: %s' % (n, i, pattern))
                _patterns.add(pattern)
                f.write(_jsonify(pattern))
    # last res of pg is (i, None)
    _patterns.remove(None)
    print('Number of pattern candidates: %d' % i)
    print('Number of patterns: %d' % n)

    # testing flipped edges (only works if we're working with canonicals)
    if canonical:
        mod_gps = []
        for gp in _patterns:
            for i in range(length):
                mod_gp = gp.flip_edge(i)
                # can happen that flipped edge was there already
                if len(mod_gp) == length:
                    mod_gps.append(mod_gp)
        cmod_pgs = parallel_map(
            canonicalize,
            mod_gps
        )
        for i, cmod_pg in enumerate(cmod_pgs):
            assert cmod_pg in _patterns, \
                'not in patterns: mod_gp: %scanon: %s_patterns: %r...' % (
                    mod_gps[i], cmod_pg, list(_patterns)[:20]
                )
示例#7
0
def pattern_generator(
        length,
        loops=True,
        node_edge_joint=True,
        p_only_connected=True,
        source_target_edges=True,
        exclude_isomorphic=True,
        count_candidates_only=False,
):
    """Enumerate graph patterns of `length` triples by brute force.

    All `length`-sized combinations of candidate triples are turned into
    GraphPatterns and filtered by the checks below.

    :param length: number of triples per pattern.
    :param loops: if false, patterns with a triple where s == o are dropped.
    :param node_edge_joint: if true, nodes and edges draw from one shared
        variable pool v0..; otherwise nodes use n0.. and edges use e0.., and
        patterns sharing a variable between nodes and edges are dropped.
    :param p_only_connected: passed to gp.is_connected as `via_edges`.
    :param source_target_edges: if true (requires node_edge_joint),
        SOURCE_VAR and TARGET_VAR are also candidate edges.
    :param exclude_isomorphic: if true, only the first pattern per canonical
        form is kept and the canonical form is yielded.
    :param count_candidates_only: if true, only (n_patterns, None) is
        yielded.
    :return: generator of (pid, pattern) tuples ending with
        (n_patterns, None).
    """
    assert node_edge_joint or not source_target_edges, \
        'source_target_edges cannot be used without node_edge_joint'
    first_seen = {}  # canonical pattern -> (pid, original pattern)

    if node_edge_joint:
        # To be connected there are max 3 + 2 + 2 + 2 + ... vars for triples.
        # The first can be 3 different ones (including ?source and ?target),
        # then in each of the following triples at least one var has to be an
        # old one.
        possible_vars = [Variable('v%d' % k) for k in range((2 * length) - 1)]
        possible_nodes = possible_vars + [SOURCE_VAR, TARGET_VAR]
        possible_edges = (
            possible_nodes if source_target_edges else possible_vars)
    else:
        possible_var_nodes = [Variable('n%d' % k) for k in range(length - 1)]
        possible_nodes = possible_var_nodes + [SOURCE_VAR, TARGET_VAR]
        possible_edges = [Variable('e%d' % k) for k in range(length)]

    possible_triples = [(s, p, o) for s in possible_nodes
                        for p in possible_edges for o in possible_nodes]

    n_patterns = binom(len(possible_triples), length)
    logger.info('generating %d possible patterns of length %d', n_patterns,
                length)
    if count_candidates_only:
        yield (n_patterns, None)
        return

    n_kept = 0
    pid = 0
    for pid, combo in enumerate(combinations(possible_triples, length)):
        gp = GraphPattern(combo)

        # source and target both have to occur in the pattern
        if not gp.complete():
            logger.debug('excluded %d: source or target missing: %s', pid, gp)
            continue
        endpoints = {SOURCE_VAR, TARGET_VAR}
        nodes = sorted(gp.nodes - endpoints)
        edges = sorted(gp.edges - endpoints)
        vars_ = sorted(gp.vars_in_graph - endpoints)

        # variables (nodes or edges) have to be used without gaps
        # noinspection PyUnboundLocalVariable
        if node_edge_joint:
            has_gap = vars_ != possible_vars[:len(vars_)]
        else:
            has_gap = (
                nodes != possible_var_nodes[:len(nodes)] or
                edges != possible_edges[:len(edges)]
            )
        if has_gap:
            logger.debug('excluded %d: skipped var: %s', pid, gp)
            continue

        # nodes and edges must not share variables in the disjoint mode
        if not node_edge_joint and (gp.nodes & gp.edges):
            logger.debug('excluded %d: node-edge-joined: %s', pid, gp)
            continue

        # optionally reject triples whose subject equals their object
        if not loops and any(s == o for s, _, o in gp):
            logger.debug('excluded %d: loop: %s', pid, gp)
            continue

        # the pattern has to be connected
        if not gp.is_connected(via_edges=p_only_connected):
            logger.debug('excluded %d: not connected:\n%s', pid, gp)
            continue

        # drop patterns isomorphic to one that was generated before
        if exclude_isomorphic:
            cgp = canonicalize(gp)
            earlier = first_seen.get(cgp)
            if earlier is not None:
                logger.debug(
                    'excluded %d: isomorphic to %d:\n%sand\n%s',
                    pid, earlier[0], gp, earlier[1]
                )
                continue
            first_seen[cgp] = (pid, gp)
            gp = cgp
        n_kept += 1
        logger.debug('generated pattern %d: %s', pid, gp)
        yield pid, gp
    # every combination must have been visited
    assert pid + 1 == n_patterns
    logger.info('found %d differing patterns out of %d possible of length %d',
                n_kept, n_patterns, length)
    yield (n_patterns, None)
示例#8
0
def patterns(
        length,
        loops=True,
        node_edge_joint=True,
        p_only_connected=True,
        source_target_edges=True,
        exclude_isomorphic=True,
        count_candidates_only=False,
):
    """Turn numerical patterns into actual graph patterns.

    For every numerical pattern from `numerical_patterns` each ordered pair
    of eligible numbers is mapped to (SOURCE_VAR, TARGET_VAR); the remaining
    numbers are renamed to v0, v1, ... without holes.

    :param length: number of triples per pattern.
    :param loops: forwarded to numerical_patterns.
    :param node_edge_joint: forwarded to numerical_patterns.
    :param p_only_connected: if false, numerical patterns whose nx graph
        (via to_nx_graph) is not connected are skipped.
    :param source_target_edges: if false, numbers occurring in the middle
        (predicate) position of a triple are not used as source or target.
    :param exclude_isomorphic: if true, only the first pattern per canonical
        form is kept and the canonical form is yielded.
    :param count_candidates_only: if true, no patterns are built; only the
        running pattern id is advanced and occasionally logged.
    :return: generator of (pid, pattern) tuples ending with (pid + 1, None).
    """
    assert not (count_candidates_only and exclude_isomorphic), \
        'count_candidates_only cannot be used with isomorphism check'
    assert node_edge_joint or not source_target_edges, \
        'source_target_edges cannot be used without node_edge_joint'

    # canonical pattern -> (numerical pattern, s, t, original pattern)
    first_seen = {}

    pid = -1
    numerical = numerical_patterns(
        length,
        loops=loops,
        node_edge_joint=node_edge_joint,
    )
    for c, num_pat in enumerate(numerical):
        assert len(num_pat) == length, 'too short: %s' % (num_pat,)
        flat_num_pat = [v for triple in num_pat for v in triple]
        distinct = set(flat_num_pat)

        # Numerical patterns are always connected, but they might be
        # p_only_connected (e.g., 123 425).
        # Check that the pattern isn't p_only_connected, meaning that it's
        # also connected by nodes (e.g., 123 325).
        # Note that in case of node_edge_joint 123 245 is also considered
        # p_only_connected.
        if not p_only_connected and not nx.is_connected(to_nx_graph(num_pat)):
            logger.debug('excluded %d: not node connected:\n%s', c, num_pat)
            continue

        all_numbers = sorted(distinct)
        if source_target_edges:
            numbers = all_numbers
        else:
            # every third item starting at index 1 is a predicate position
            numbers = sorted(distinct - set(flat_num_pat[1::3]))

        if count_candidates_only:
            n_vars = len(numbers)
            perms = n_vars * (n_vars - 1)
            pid += perms
            # yield pid, None  # way slower, rather show progress from here:
            if c % 100000 == 0:
                logger.info(
                    'pattern id: %d, vars: %d, permutations: %d',
                    pid, n_vars, perms
                )
            continue

        for s, t in permutations(numbers, 2):
            pid += 1
            # source and target are mapped to numbers s and t, the leftover
            # numbers are re-enumerated to close "holes"
            var_map = {s: SOURCE_VAR, t: TARGET_VAR}
            leftover_numbers = [n for n in all_numbers if n != s and n != t]
            for idx, n in enumerate(leftover_numbers):
                var_map[n] = Variable('v%d' % idx)
            gp = GraphPattern(
                tuple(tuple(var_map[n] for n in trip) for trip in num_pat))
            assert len(gp) == length, \
                'gp too short: num %s\n%s' % (num_pat, gp)

            # drop patterns isomorphic to one that was generated before
            if exclude_isomorphic:
                cgp = canonicalize(gp)
                earlier = first_seen.get(cgp)
                if earlier is not None:
                    igp_numpat, igp_s, igp_t, igp_gp = earlier
                    logger.debug(
                        'excluded isomorphic %s with ?s=%d, ?t=%d:\n'
                        'isomorphic to %s with ?s=%d, ?t=%d:\n'
                        '%sand\n%s',
                        num_pat, s, t,
                        igp_numpat, igp_s, igp_t,
                        gp, igp_gp,
                    )
                    continue
                first_seen[cgp] = (num_pat, s, t, gp)
                gp = cgp
            yield pid, gp
    yield pid + 1, None
def main():
    """Enumerate patterns of length 5, print them and self-check edge flips.

    The patterns come from `patterns(...)`. When scoop is running they are
    canonicalized in parallel chunks and de-duplicated here, otherwise the
    generator takes care of canonicalization itself. Afterwards every
    length-preserving single edge flip of every collected pattern is
    canonicalized and asserted to be among the collected patterns.
    """
    # len | pcon | nej | all          | candidates (all)  | candidates (all)  |
    #     |      |     | (canonical)  | (old method)      | (numerical)       |
    # ----+------+-----+--------------+-------------------+-------------------+
    #   1 |    8 |  12 |           12 |                27 |                12 |
    #   2 |  146 | 469 |          693 |              7750 |              1314 |
    #   3 |      |     |        47478 |           6666891 |            151534 |
    #   4 |      |     |              |       11671285626 |          20884300 |
    #   5 |      |     |              |    34549552710596 |        3461471628 |

    # len | typical     | candidates     | candidates  |
    #     | (canonical) | (old method)   | (numerical) |
    # ----+-------------+----------------+-------------+
    #   1 |           2 |              4 |           2 |
    #   2 |          28 |            153 |          54 |
    #   3 |         486 |          17296 |        1614 |
    #   4 |       10374 |        3921225 |       59654 |
    #   5 |             |     1488847536 |     2707960 |

    # typical above means none of (loops, nej, pcon, source_target_edges)

    length = 5
    canonical = True

    found = set()
    n_found = -1
    last_pid = 0

    pg = patterns(
        length,
        loops=False,
        node_edge_joint=False,
        p_only_connected=False,
        source_target_edges=False,
        exclude_isomorphic=canonical and not scoop.IS_RUNNING,
        count_candidates_only=False,
    )

    if canonical and scoop.IS_RUNNING:
        # Graph pattern isomorphism checking is what takes by far the longest,
        # so canonicalization is run in parallel.
        # Chunks are used for efficiency and to hinder parallel_map from
        # trying to eat up all candidates first.
        for chunk in chunker(pg, 100000):
            cgps = parallel_map(
                lambda res: (res[0], canonicalize(res[1]) if res[1] else None),
                chunk,
            )
            for last_pid, pattern in cgps:
                if pattern in found:
                    continue
                n_found += 1
                print('%d: Pattern id %d: %s' % (n_found, last_pid, pattern))
                assert pattern is None or len(pattern) == length, \
                    'pattern too short: %s' % (pattern,)
                found.add(pattern)
    else:
        # run potential canonicalization inline
        for n_found, (last_pid, pattern) in enumerate(pg):
            print('%d: Pattern id %d: %s' % (n_found, last_pid, pattern))
            found.add(pattern)
    # last res of pg is (i, None)
    found.remove(None)
    print('Number of pattern candidates: %d' % last_pid)
    print('Number of patterns: %d' % n_found)

    # testing flipped edges (only works if we're working with canonicals)
    if canonical:
        mod_gps = []
        for gp in found:
            for edge_index in range(length):
                mod_gp = gp.flip_edge(edge_index)
                if len(mod_gp) != length:
                    # can happen that flipped edge was there already
                    continue
                mod_gps.append(mod_gp)
        cmod_pgs = parallel_map(canonicalize, mod_gps)
        for mod_gp, cmod_pg in zip(mod_gps, cmod_pgs):
            assert cmod_pg in found, \
                'not in patterns: mod_gp: %scanon: %s_patterns: %r...' % (
                    mod_gp, cmod_pg, list(found)[:20]
                )
def pattern_generator(
    length,
    loops=True,
    node_edge_joint=True,
    p_only_connected=True,
    source_target_edges=True,
    exclude_isomorphic=True,
    count_candidates_only=False,
):
    """Brute-force generator for graph patterns made of `length` triples.

    Builds every `length`-sized combination of the candidate triples, wraps
    it in a GraphPattern and yields it unless one of the filters rejects it.

    :param length: number of triples per pattern.
    :param loops: when false, patterns having a triple with s == o are
        rejected.
    :param node_edge_joint: when true, nodes and edges share the variables
        v0..; when false, nodes use n0.. and edges e0.., and patterns in
        which nodes and edges overlap are rejected.
    :param p_only_connected: handed to gp.is_connected as `via_edges`.
    :param source_target_edges: when true (only valid together with
        node_edge_joint), SOURCE_VAR and TARGET_VAR may appear as edges.
    :param exclude_isomorphic: when true, patterns are canonicalized and
        duplicates of an already seen canonical form are rejected.
    :param count_candidates_only: when true, just (n_patterns, None) is
        yielded.
    :return: generator of (pid, pattern) tuples, closed by
        (n_patterns, None).
    """
    assert not source_target_edges or node_edge_joint, \
        'source_target_edges cannot be used without node_edge_joint'
    seen_canonical = {}  # canonical pattern -> (pid, original pattern)
    terminals = [SOURCE_VAR, TARGET_VAR]

    if node_edge_joint:
        # To be connected there are max 3 + 2 + 2 + 2 + ... vars for triples.
        # The first can be 3 different ones (including ?source and ?target),
        # then in each of the following triples at least one var has to be an
        # old one.
        possible_vars = [Variable('v%d' % n) for n in range((2 * length) - 1)]
        possible_nodes = possible_vars + terminals
        if source_target_edges:
            possible_edges = possible_nodes
        else:
            possible_edges = possible_vars
    else:
        possible_var_nodes = [Variable('n%d' % n) for n in range(length - 1)]
        possible_nodes = possible_var_nodes + terminals
        possible_edges = [Variable('e%d' % n) for n in range(length)]

    possible_triples = [
        (s, p, o)
        for s in possible_nodes
        for p in possible_edges
        for o in possible_nodes
    ]

    n_patterns = binom(len(possible_triples), length)
    logger.info(
        'generating %d possible patterns of length %d', n_patterns, length)
    if count_candidates_only:
        yield (n_patterns, None)
        return

    accepted = 0
    pid = 0
    for pid, candidate in enumerate(combinations(possible_triples, length)):
        gp = GraphPattern(candidate)

        # source and target both have to be part of the pattern
        if not gp.complete():
            logger.debug(
                'excluded %d: source or target missing: %s', pid, gp)
            continue
        nodes = sorted(gp.nodes - {SOURCE_VAR, TARGET_VAR})
        edges = sorted(gp.edges - {SOURCE_VAR, TARGET_VAR})
        vars_ = sorted(gp.vars_in_graph - {SOURCE_VAR, TARGET_VAR})

        # no variable (node or edge) may be skipped in the numbering
        # noinspection PyUnboundLocalVariable
        if node_edge_joint:
            skipped = vars_ != possible_vars[:len(vars_)]
        else:
            skipped = (
                nodes != possible_var_nodes[:len(nodes)]
                or edges != possible_edges[:len(edges)]
            )
        if skipped:
            logger.debug('excluded %d: skipped var: %s', pid, gp)
            continue

        # in disjoint mode nodes and edges must not overlap
        if not node_edge_joint and (gp.nodes & gp.edges):
            logger.debug('excluded %d: node-edge-joined: %s', pid, gp)
            continue

        # self loops are only allowed on request
        if not loops and any(subj == obj for subj, _pred, obj in gp):
            logger.debug('excluded %d: loop: %s', pid, gp)
            continue

        # disconnected patterns are rejected
        if not gp.is_connected(via_edges=p_only_connected):
            logger.debug('excluded %d: not connected:\n%s', pid, gp)
            continue

        # reject patterns isomorphic to an already generated one
        if exclude_isomorphic:
            cgp = canonicalize(gp)
            if cgp in seen_canonical:
                prev_pid, prev_gp = seen_canonical[cgp]
                logger.debug(
                    'excluded %d: isomorphic to %d:\n%sand\n%s',
                    pid, prev_pid, gp, prev_gp
                )
                continue
            seen_canonical[cgp] = (pid, gp)
            gp = cgp
        accepted += 1
        logger.debug('generated pattern %d: %s', pid, gp)
        yield pid, gp
    # all combinations have to be consumed at this point
    assert pid + 1 == n_patterns
    logger.info(
        'found %d differing patterns out of %d possible of length %d',
        accepted, n_patterns, length
    )
    yield (n_patterns, None)
def patterns(
    length,
    loops=True,
    node_edge_joint=True,
    p_only_connected=True,
    source_target_edges=True,
    exclude_isomorphic=True,
    count_candidates_only=False,
):
    """Generate actual graph patterns from numerical patterns.

    Each numerical pattern delivered by `numerical_patterns` is instantiated
    once per ordered (s, t) pair of eligible numbers: s becomes SOURCE_VAR,
    t becomes TARGET_VAR and all other numbers become v0, v1, ... in sorted
    order.

    :param length: number of triples per pattern.
    :param loops: passed on to numerical_patterns.
    :param node_edge_joint: passed on to numerical_patterns.
    :param p_only_connected: when false, numerical patterns whose nx graph
        (see to_nx_graph) is not connected are left out.
    :param source_target_edges: when false, numbers found at a predicate
        position are not eligible as source or target.
    :param exclude_isomorphic: when true, patterns are canonicalized and
        duplicates of an already seen canonical form are left out.
    :param count_candidates_only: when true, candidates are only counted
        (with occasional progress logging) and never built.
    :return: generator of (pid, pattern) tuples, closed by (pid + 1, None).
    """
    assert not count_candidates_only or not exclude_isomorphic, \
        'count_candidates_only cannot be used with isomorphism check'
    assert not source_target_edges or node_edge_joint, \
        'source_target_edges cannot be used without node_edge_joint'

    # canonical pattern -> (numerical pattern, s, t, original pattern)
    known = {}

    pid = -1
    source_of_num_pats = numerical_patterns(
        length, loops=loops, node_edge_joint=node_edge_joint)
    for c, num_pat in enumerate(source_of_num_pats):
        assert len(num_pat) == length, 'too short: %s' % (num_pat, )
        flat_num_pat = []
        for trip in num_pat:
            flat_num_pat.extend(trip)
        number_set = set(flat_num_pat)

        if not p_only_connected:
            # Numerical patterns are always connected, but they might be
            # p_only_connected (e.g., 123 425).
            # Check that the pattern isn't p_only_connected, meaning that it's
            # also connected by nodes (e.g., 123 325).
            # Note that in case of node_edge_joint 123 245 is also considered
            # p_only_connected.
            node_connected = nx.is_connected(to_nx_graph(num_pat))
            if not node_connected:
                logger.debug(
                    'excluded %d: not node connected:\n%s', c, num_pat)
                continue

        all_numbers = sorted(number_set)
        # without source_target_edges the predicate positions (every third
        # item, starting at index 1) are not eligible for source / target
        numbers = (
            all_numbers if source_target_edges
            else sorted(number_set - set(flat_num_pat[1::3]))
        )

        if count_candidates_only:
            count = len(numbers)
            perms = count * (count - 1)
            pid += perms
            # yield pid, None  # way slower, rather show progress from here:
            if c % 100000 == 0:
                logger.info(
                    'pattern id: %d, vars: %d, permutations: %d',
                    pid, count, perms)
            continue

        for s, t in permutations(numbers, 2):
            pid += 1
            # s and t stand for source and target; whatever is left over gets
            # re-enumerated so that the variable names have no "holes"
            leftover_numbers = [n for n in all_numbers if n != s and n != t]
            var_map = dict(
                (n, Variable('v%d' % i))
                for i, n in enumerate(leftover_numbers)
            )
            var_map[s] = SOURCE_VAR
            var_map[t] = TARGET_VAR
            triples = tuple(
                tuple(var_map[n] for n in trip) for trip in num_pat)
            gp = GraphPattern(triples)
            assert len(gp) == length, \
                'gp too short: num %s\n%s' % (num_pat, gp)

            # leave out patterns isomorphic to an already generated one
            if exclude_isomorphic:
                cgp = canonicalize(gp)
                if cgp in known:
                    igp_numpat, igp_s, igp_t, igp_gp = known[cgp]
                    logger.debug(
                        'excluded isomorphic %s with ?s=%d, ?t=%d:\n'
                        'isomorphic to %s with ?s=%d, ?t=%d:\n'
                        '%sand\n%s',
                        num_pat, s, t,
                        igp_numpat, igp_s, igp_t,
                        gp, igp_gp,
                    )
                    continue
                known[cgp] = (num_pat, s, t, gp)
                gp = cgp
            yield pid, gp
    yield pid + 1, None
示例#12
0
def pattern_generator(length, loops=True, exclude_isomorphic=True):
    """Brute-force generator for graph patterns made of `length` triples.

    The candidate triples are the full product of subjects, predicates and
    objects, with nodes taken from n0..n<length-2>, SOURCE_VAR and
    TARGET_VAR and predicates from e0..e<length-1>. Every combination of
    `length` such triples becomes a GraphPattern that is yielded unless a
    filter rejects it.

    :param length: number of triples per pattern.
    :param loops: when false, patterns having a triple with s == o are
        rejected.
    :param exclude_isomorphic: when true, patterns are canonicalized,
        duplicates of an already seen canonical form are rejected and the
        canonical form is yielded instead of the raw pattern.
    :return: generator of (pid, pattern) tuples (pid being the combination
        index), closed by (n_patterns, None).
    """
    seen_canonical = {}  # canonical pattern -> (pid, original pattern)
    possible_var_nodes = [Variable('n%d' % n) for n in range(length - 1)]
    possible_nodes = possible_var_nodes + [SOURCE_VAR, TARGET_VAR]
    possible_edges = [Variable('e%d' % n) for n in range(length)]

    possible_triples = [(s, p, o) for s in possible_nodes
                        for p in possible_edges for o in possible_nodes]

    n_patterns = binom(len(possible_triples), length)
    logger.info('generating %d possible patterns of length %d', n_patterns,
                length)

    accepted = 0
    pid = 0
    for pid, candidate in enumerate(combinations(possible_triples, length)):
        gp = GraphPattern(candidate)

        # source and target both have to be part of the pattern
        if not gp.complete():
            logger.debug('excluded %d: source or target missing: %s', pid, gp)
            continue
        nodes = sorted(gp.nodes - {SOURCE_VAR, TARGET_VAR})
        edges = sorted(gp.edges)

        # no node may be skipped in the numbering, e.g. n2 without n1 ...
        if nodes != possible_var_nodes[:len(nodes)]:
            logger.debug('excluded %d: skipped node: %s', pid, gp)
            continue
        # ... and the same holds for the edges
        if edges != possible_edges[:len(edges)]:
            logger.debug('excluded %d: skipped edge: %s', pid, gp)
            continue

        # self loops are only allowed on request
        if not loops and any(subj == obj for subj, _pred, obj in gp):
            logger.debug('excluded %d: loop: %s', pid, gp)
            continue

        # disconnected patterns are rejected
        if not gp.is_connected():
            logger.debug('excluded %d: not connected:\n%s', pid, gp)
            continue

        # reject patterns isomorphic to an already generated one
        if exclude_isomorphic:
            cgp = canonicalize(gp)
            if cgp in seen_canonical:
                prev_pid, prev_gp = seen_canonical[cgp]
                logger.debug('excluded %d: isomorphic to %d:\n%sand\n%s', pid,
                             prev_pid, gp, prev_gp)
                continue
            seen_canonical[cgp] = (pid, gp)
            gp = cgp
        accepted += 1
        logger.debug('generated pattern %d: %s', pid, gp)
        yield pid, gp
    # all combinations have to be consumed at this point
    assert pid + 1 == n_patterns
    logger.info('found %d differing patterns out of %d possible of length %d',
                accepted, n_patterns, length)
    yield (n_patterns, None)