def path_loader(length,
                gtps_filename=GTPS_FILENAME,
                sparql_endpoint=SPARQL_ENDPOINT,
                eval_data_graph=EVAL_DATA_GRAPH,
                load=True,
                clear=True,
                **kwds):
    gp = random_path(length)
    logger.info('Generated random graph pattern with path length %d:\n%s' %
                (length, gp))

    # get list of semantic association pairs
    semantic_associations = get_semantic_associations(
        fn=gtps_filename,
        limit=None,
    )
    gtps = semantic_associations

    triples = generate_triples(gp, gtps)
    if load:
        load_triples_into_endpoint(triples,
                                   sparql_endpoint=sparql_endpoint,
                                   graph=eval_data_graph,
                                   clear=clear)

    return gp
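

# A minimal sketch of what random_path(length) presumably returns (assumption;
# the project's generator may differ, e.g. in how it orients edges): a
# GraphPattern forming a path of `length` hops from SOURCE_VAR to TARGET_VAR
# through fresh node and predicate variables.
def random_path_sketch(length):
    import random
    from rdflib import Variable
    from graph_pattern import GraphPattern, SOURCE_VAR, TARGET_VAR
    nodes = ([SOURCE_VAR] +
             [Variable('n%d' % i) for i in range(1, length)] +
             [TARGET_VAR])
    triples = []
    for i in range(length):
        s, o = nodes[i], nodes[i + 1]
        if random.random() < 0.5:  # random edge direction (assumption)
            s, o = o, s
        triples.append((s, Variable('p%d' % i), o))
    return GraphPattern(triples)
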
def main():
    semantic_associations = get_semantic_associations(
        config.GT_ASSOCIATIONS_FILENAME)
    assocs_train, assocs_test = split_training_test_set(semantic_associations,
                                                        variant='random')

    # setup node expander
    sparql = SPARQLWrapper(config.SPARQL_ENDPOINT)

    predict_set = assocs_test

    for method, query in sorted(prediction_queries.items()):
        target_idxs = []
        for source, target in predict_set:
            prediction = predict_target_with_query(sparql, query, source)
            target_idxs.append(find_in_prediction(prediction, target))
        print("'%s': %s," % (method, target_idxs))
def main():
    from rdflib import Variable
    gp = GraphPattern((
        (SOURCE_VAR, Variable('v1'), Variable('v2')),
        (TARGET_VAR, Variable('v3'), Variable('v2')),
    ))
    # get list of semantic association pairs and split in train and test sets
    semantic_associations = get_semantic_associations(
        fn='data/dbpedia_random_1000k_uri_pairs.csv.gz',
        limit=None,
    )
    # assocs_train, assocs_test = split_training_test_set(
    #     semantic_associations
    # )
    # stps = tuple(sorted(assocs_train))
    stps = semantic_associations

    triples = generate_triples(gp, stps)
    load_triples_into_endpoint(triples)
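

# A minimal sketch of what generate_triples(gp, stps) presumably yields
# (assumption, not the project's implementation): one instantiation of the
# pattern per (source, target) pair, binding SOURCE_VAR / TARGET_VAR to the
# pair and replacing every other variable with a fresh URI so that the pairs
# do not share helper nodes.
def generate_triples_sketch(gp, stps):
    from rdflib import URIRef, Variable
    from graph_pattern import SOURCE_VAR, TARGET_VAR
    for i, (source, target) in enumerate(stps):
        bindings = {SOURCE_VAR: source, TARGET_VAR: target}

        def fill(node):
            if not isinstance(node, Variable):
                return node
            if node not in bindings:
                # hypothetical URI scheme: one fresh node per pair and variable
                bindings[node] = URIRef(
                    'http://example.org/eval/%d/%s' % (i, node))
            return bindings[node]

        for s, p, o in gp:  # treating the pattern as an iterable of triples
            yield (fill(s), fill(p), fill(o))
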
def main():
    semantic_associations = get_semantic_associations(
        config.GT_ASSOCIATIONS_FILENAME)
    assocs_train, assocs_test = split_training_test_set(
        semantic_associations, variant='random'
    )

    # setup node expander
    sparql = SPARQLWrapper(config.SPARQL_ENDPOINT)

    predict_list = assocs_test

    # degree, pagerank and hits
    for method, query in sorted(prediction_queries.items()):
        target_idxs = []
        for source, target in predict_list:
            logger.info(
                'method: %s, predicting targets for %s, ground truth: %s',
                method, source.n3(), target.n3())
            prediction = predict_target_with_query(sparql, query, source)
            idx = find_in_prediction(prediction, target)
            logger.info(
                format_prediction_results(method, prediction, target, idx))
            target_idxs.append(idx)
        print("'%s': %s," % (method, target_idxs))

    # milne-witten relatedness
    for method, pred in (('mw_wl', 'dbo:wikiPageWikiLink'),):
        target_idxs = []
        for source, target in predict_list:
            logger.info(
                'method: %s, predicting targets for %s, ground truth: %s',
                method, source.n3(), target.n3())
            prediction = predict_target_with_milne_witten(sparql, pred, source)
            idx = find_in_prediction(prediction, target)
            logger.info(
                format_prediction_results(method, prediction, target, idx))
            target_idxs.append(idx)
        print("'%s': %s," % (method, target_idxs))
def main():
    from rdflib import Variable
    # the following triples would time out if vars_joint was 0:
    # ?s a owl:Thing . ?t a owl:Thing .
    gp = GraphPattern((
        (SOURCE_VAR, Variable('v1'), Variable('v2')),
        (TARGET_VAR, Variable('v3'), Variable('v2')),
    ))
    # get list of semantic association pairs and split in train and test sets
    semantic_associations = get_semantic_associations(
        fn='data/dbpedia_random_1000_uri_pairs.csv.gz',
        limit=100,
    )
    # assocs_train, assocs_test = split_training_test_set(
    #     semantic_associations
    # )
    # stps = tuple(sorted(assocs_train))
    stps = semantic_associations
    print(len(stps))

    triples = generate_triples(gp, stps)
    for t in triples:
        print(t)
def main():
    from rdflib import Variable
    # gp = GraphPattern((
    #     (SOURCE_VAR, Variable('v1'), Variable('v2')),
    #     (TARGET_VAR, Variable('v3'), Variable('v2')),
    # ))
    gp = GraphPattern((
        (Variable('v1'), Variable('v2'), SOURCE_VAR),
        (Variable('v1'), Variable('v3'), Variable('v4')),
        (Variable('v4'), Variable('v5'), TARGET_VAR),
    ))
    # get list of semantic association pairs and split in train and test sets
    semantic_associations = get_semantic_associations(
        fn='data/dbpedia_random_1000_uri_pairs.csv.gz',
        limit=None,
    )
    # assocs_train, assocs_test = split_training_test_set(
    #     semantic_associations
    # )
    # stps = tuple(sorted(assocs_train))
    stps = semantic_associations

    triples = generate_triples(gp, stps)
    load_triples_into_endpoint(triples)
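

# A minimal sketch of what load_triples_into_endpoint might do (assumption;
# the project's loader and its default endpoint / graph may differ, and some
# stores expect updates on a separate update URL): POST SPARQL UPDATE requests
# that optionally clear the target graph and insert the triples in batches.
def load_triples_into_endpoint_sketch(
        triples, sparql_endpoint, graph, clear=True, batch_size=1000):
    from SPARQLWrapper import SPARQLWrapper, POST
    sparql = SPARQLWrapper(sparql_endpoint)
    sparql.setMethod(POST)
    if clear:
        sparql.setQuery('CLEAR SILENT GRAPH <%s>' % graph)
        sparql.query()
    triples = list(triples)
    for i in range(0, len(triples), batch_size):
        nt = '\n'.join('%s %s %s .' % (s.n3(), p.n3(), o.n3())
                       for s, p, o in triples[i:i + batch_size])
        sparql.setQuery('INSERT DATA { GRAPH <%s> { %s } }' % (graph, nt))
        sparql.query()
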
import logging

import rdflib
from rdflib import URIRef
from rdflib import Variable

from gp_learner import mutate_increase_dist
from gp_learner import mutate_merge_var
from gp_learner import mutate_simplify_pattern
from graph_pattern import GraphPattern
from graph_pattern import SOURCE_VAR
from graph_pattern import TARGET_VAR
from ground_truth_tools import get_semantic_associations
from ground_truth_tools import split_training_test_set
from gtp_scores import GTPScores

logger = logging.getLogger(__name__)

dbp = rdflib.Namespace('http://dbpedia.org/resource/')
wikilink = URIRef('http://dbpedia.org/ontology/wikiPageWikiLink')

ground_truth_pairs = get_semantic_associations()
ground_truth_pairs, _ = split_training_test_set(ground_truth_pairs)
gtp_scores = GTPScores(ground_truth_pairs)


def test_mutate_increase_dist():
    gp = GraphPattern([(SOURCE_VAR, wikilink, TARGET_VAR)])
    res = mutate_increase_dist(gp)
    assert gp != res
    assert gp.diameter() + 1 == res.diameter()
    assert gp.vars_in_graph == {SOURCE_VAR, TARGET_VAR}


def test_mutate_merge_var():
    p = Variable('p')
    q = Variable('q')
def main(**kwds):
    from eval.enumerate import load_pattern
    from eval.random_path_loader import path_loader
    from ground_truth_tools import get_semantic_associations
    from utils import log_all_exceptions

    logging.info('encoding check: äöüß🎅')  # logging utf-8 byte string
    logging.info(u'encoding check: äöüß\U0001F385')  # logging unicode string
    logging.info(u'encoding check: äöüß\U0001F385'.encode('utf-8'))  # convert
    print('encoding check: äöüß🎅')  # printing utf-8 byte string
    print(u'encoding check: äöüß\U0001F385')  # printing unicode string


    if kwds['method'] == 'random_path':
        # inject triples for a random path of given length into endpoint
        eval_gp = path_loader(**kwds)
        result_filename = 'path_length_eval_result.txt'
    elif kwds['method'] == 'enum':
        from eval.data_generator import generate_triples
        from eval.data_loader import load_triples_into_endpoint
        seq = int(os.getenv('SEQ_NUMBER'))  # see script/run_multi_range.sh
        eval_gp = load_pattern(kwds['length'], seq)
        logger.info(
            'Loaded enumerated graph pattern number %d with length %d:\n%s' % (
                seq, kwds['length'], eval_gp))

        # get list of semantic association pairs
        gtps = get_semantic_associations(
            fn=kwds['GT_ASSOCIATIONS_FILENAME'],
            limit=None,
        )

        triples = generate_triples(eval_gp, gtps)
        load_triples_into_endpoint(
            triples,
            sparql_endpoint=kwds['SPARQL_ENDPOINT'],
            graph=kwds['eval_data_graph'],
        )
        result_filename = 'enum_eval_result.txt'
    else:
        raise NotImplementedError(kwds['method'])

    sparql_endpoint = kwds['sparql_endpoint']
    gtps_filename = kwds['gtps_filename']
    length = kwds['length']

    gtps = tuple(sorted(
        get_semantic_associations(gtps_filename)))
    # for s, t in gtps:
    #     print(curify(s))
    #     print(curify(t))
    #     print('')

    sparql = SPARQLWrapper.SPARQLWrapper(sparql_endpoint)

    tic = datetime.utcnow()
    # noinspection PyBroadException
    try:
        pattern_found = log_all_exceptions(logger)(_main)(sparql, gtps)
        return_code = 0 if pattern_found else 1
        tac = datetime.utcnow()
        logger.info(
            "search for pattern took %s and was %s",
            tac - tic,
            'successful' if pattern_found else 'unsuccessful'
        )
    except Exception:
        tac = datetime.utcnow()
        logger.exception(
            "search for pattern took %s and was aborted due to exception",
            tac - tic,
        )
        return_code = 2

    # return code 0 means success; turn it into a more intuitive encoding for the result file
    res = {0: 1, 1: 0, 2: -1}[return_code]

    fn = path.join(config.RESDIR, result_filename)
    with open(make_dirs_for(fn), 'a') as f:
        f.write(
            'len: %d, result: %d, took: %.1f s, end (UTC): %s\n'
            'eval %s\n\n' % (
                length, res, timedelta_to_s(tac - tic), datetime.utcnow(),
                eval_gp
            )
        )
    sys.exit(return_code)
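

# Plausible sketches of two small helpers used above (assumptions; the
# project's own utils may differ): timedelta_to_s turns a timedelta into float
# seconds for the '%.1f s' formatting, and make_dirs_for ensures the result
# directory exists before the file is opened for appending.
def timedelta_to_s(td):
    return td.total_seconds()  # assumption: plain total seconds as float


def make_dirs_for(filename):
    # assumption: create missing parent dirs, then hand the path back
    import os
    dirname = os.path.dirname(filename)
    if dirname and not os.path.isdir(dirname):
        os.makedirs(dirname)
    return filename
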
# encoding: utf-8
import logging

from ground_truth_tools import get_semantic_associations
from ground_truth_tools import split_training_test_set
from ground_truth_tools import k_fold_cross_validation

logger = logging.getLogger(__name__)

associations = get_semantic_associations()


def test_split_train_test_set():
    vr = split_training_test_set(associations)
    train, test = vr
    logger.info("just random: train: %d, test: %d", len(train), len(test))
    vtnd = split_training_test_set(associations, variant='target_node_disjoint')
    train, test = vtnd
    logger.info("target node disjoint: train: %d, test: %d",
                len(train), len(test))
    vnd = split_training_test_set(associations, variant='node_disjoint')
    train, test = vnd
    logger.info("node disjoint: train: %d, test: %d", len(train), len(test))

    assert vr[0] == vtnd[0] == vnd[0], \
        "train set shouldn't be influenced by different splitting variant"
    assert set(vr[1]) > set(vtnd[1]) > set(vnd[1]), \
        "test set expected to shrink for more restrictive splitting variants"


def test_k_fold_cross_validation():