Example #1
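# Predict target candidates for one source entity with the learned graph
# patterns (GPS), calibrating the query timeout first unless a fixed TIMEOUT
# is configured.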
def _predict(source):
    from gp_query import calibrate_query_timeout
    from predict import predict
    timeout = TIMEOUT if TIMEOUT > 0 else calibrate_query_timeout(SPARQL)
    return predict(
        SPARQL, timeout, GPS, source,
        FUSION_METHODS, MAX_RESULTS, MAX_TARGET_CANDIDATES_PER_GP)
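Example #2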
import logging

import rdflib
from rdflib import URIRef
import SPARQLWrapper

from config import SPARQL_ENDPOINT  # assumed: endpoint constant from the project's config module
from gp_learner import evaluate  # assumed: evaluate() from the project's learner module
from gp_query import calibrate_query_timeout
from graph_pattern import GraphPattern
from graph_pattern import SOURCE_VAR
from graph_pattern import TARGET_VAR
from ground_truth_tools import get_semantic_associations
from ground_truth_tools import split_training_test_set
from gtp_scores import GTPScores

logger = logging.getLogger(__name__)

a = URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')
dbp = rdflib.Namespace('http://dbpedia.org/resource/')
wikilink = URIRef('http://dbpedia.org/ontology/wikiPageWikiLink')

sparql = SPARQLWrapper.SPARQLWrapper(SPARQL_ENDPOINT)
try:
    timeout = max(5, calibrate_query_timeout(sparql))  # 5s for warmup
except IOError:
    from nose import SkipTest
    raise SkipTest("Can't establish connection to SPARQL_ENDPOINT:\n    %s\n"
                   "Skipping tests in\n    %s" % (SPARQL_ENDPOINT, __file__))
ground_truth_pairs = get_semantic_associations()
ground_truth_pairs, _ = split_training_test_set(ground_truth_pairs)
gtp_scores = GTPScores(ground_truth_pairs)


def test_evaluate():
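    # ?source (a dbpedia PopulatedPlace) links via wikiPageWikiLink to
    # ?target (a schema.org Country)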
    gp = GraphPattern(
        ((SOURCE_VAR, wikilink, TARGET_VAR),
         (SOURCE_VAR, a, URIRef('http://dbpedia.org/ontology/PopulatedPlace')),
         (TARGET_VAR, a, URIRef('http://schema.org/Country'))))
    res = evaluate(sparql, timeout, gtp_scores, gp)
Example #3
def _predict(source):
    from gp_query import calibrate_query_timeout
    from predict import predict
    timeout = TIMEOUT if TIMEOUT > 0 else calibrate_query_timeout(SPARQL)
    return predict(SPARQL, timeout, GPS, source, FUSION_METHODS, MAX_RESULTS,
                   MAX_TARGET_CANDIDATES_PER_GP)
Example #4
from collections import OrderedDict
import json
import logging
import sys
import time

import SPARQLWrapper
from rdflib.util import from_n3

import config
import utils
from predict import predict
from predict import multi_predict  # assumed: multi_predict lives alongside predict()
from utils import chunker  # assumed: batching helper from the project's utils module

logger = logging.getLogger(__name__)


def main(
        resdir,
        sparql_endpoint,
        max_queries,
        clustering_variant,
        fusion_methods,
        timeout,
        max_results,
        max_target_candidates_per_gp,
        batch_predict,
        drop_bad_uris,
        **_  # gulp remaining kwargs
):
    from gp_query import calibrate_query_timeout
    from serialization import load_results
    from serialization import find_last_result
    from cluster import cluster_gps_to_reduce_queries
    from gp_learner import init_workers

    # init workers
    init_workers()

    sparql = SPARQLWrapper.SPARQLWrapper(sparql_endpoint)
    timeout = timeout if timeout > 0 else calibrate_query_timeout(sparql)

    # load model
    last_res = find_last_result()
    if not last_res:
        logger.error('cannot find fully trained model in %s', resdir)
        sys.exit(1)
    result_patterns, coverage_counts, gtp_scores = load_results(last_res)
    gps = [gp for gp, _ in result_patterns]
    gps = cluster_gps_to_reduce_queries(
        gps, max_queries, gtp_scores, clustering_variant)

    processed = 0
    start = time.time()
    batch_size = config.BATCH_SIZE if batch_predict else 1
    # main loop
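    # read N3/N-Triples encoded source terms from stdin, batch_size lines at a time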
    for lines in chunker(sys.stdin, batch_size):
        batch = []
        for line in lines:
            line = line.strip()
            if not line:
                continue
            if drop_bad_uris:
                # noinspection PyBroadException
                try:
                    source = from_n3(line)
                    utils.curify(source)
                except Exception:
                    logger.warning(
                        'Warning: Could not curify URI %s! Skip.', line)
                    continue
            if line[0] not in '<"':
                logger.error(
                    'expected inputs to start with < or ", but got: %s', line)
                sys.exit(1)
            source = from_n3(line)
            batch.append(source)
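        # de-duplicate the batch while preserving input order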
        batch = list(OrderedDict.fromkeys(batch))

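        # predict for a single source or a whole batch and print one JSON
        # result per source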
        if len(batch) == 0:
            pass
        elif len(batch) == 1:
            res = predict(
                sparql, timeout, gps, batch[0], fusion_methods,
                max_results, max_target_candidates_per_gp
            )
            print(json.dumps(res))
            logger.info(
                'Predicted %d target candidates for %s',
                res['orig_result_length'], res['source']
            )
        else:
            res = multi_predict(
                sparql, timeout, gps, batch, fusion_methods,
                max_results, max_target_candidates_per_gp
            )
            for r in res:
                print(json.dumps(r))
            logger.info('\n'.join([
                'Predicted %d target candidates for %s' % (
                    r['orig_result_length'], r['source']
                ) for r in res
            ]))

        processed += len(batch)
        logger.info(
            'Have processed %d URIs now. Took %s sec',
            processed, time.time()-start)
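Example #5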
from graph_pattern import GraphPattern
from graph_pattern import SOURCE_VAR
from graph_pattern import TARGET_VAR
from ground_truth_tools import get_semantic_associations
from ground_truth_tools import split_training_test_set
from gtp_scores import GTPScores

logger = logging.getLogger(__name__)

a = URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')
dbp = rdflib.Namespace('http://dbpedia.org/resource/')
wikilink = URIRef('http://dbpedia.org/ontology/wikiPageWikiLink')

sparql = SPARQLWrapper.SPARQLWrapper(SPARQL_ENDPOINT)
try:
    timeout = max(5, calibrate_query_timeout(sparql))  # 5s for warmup
except IOError:
    from nose import SkipTest
    raise SkipTest(
        "Can't establish connection to SPARQL_ENDPOINT:\n    %s\n"
        "Skipping tests in\n    %s" % (SPARQL_ENDPOINT, __file__))
ground_truth_pairs = get_semantic_associations()
ground_truth_pairs, _ = split_training_test_set(ground_truth_pairs)
gtp_scores = GTPScores(ground_truth_pairs)


def test_evaluate():
    gp = GraphPattern((
        (SOURCE_VAR, wikilink, TARGET_VAR),
        (SOURCE_VAR, a, URIRef('http://dbpedia.org/ontology/PopulatedPlace')),
        (TARGET_VAR, a, URIRef('http://schema.org/Country'))
    ))
    res = evaluate(sparql, timeout, gtp_scores, gp)
Example #6
def main(
        resdir,
        sparql_endpoint,
        max_queries,
        clustering_variant,
        fusion_methods,
        timeout,
        max_results,
        max_target_candidates_per_gp,
        batch_predict,
        drop_bad_uris,
        **_  # gulp remaining kwargs
):
    from gp_query import calibrate_query_timeout
    from serialization import load_results
    from serialization import find_last_result
    from cluster import cluster_gps_to_reduce_queries
    from gp_learner import init_workers

    # init workers
    init_workers()

    sparql = SPARQLWrapper.SPARQLWrapper(sparql_endpoint)
    timeout = timeout if timeout > 0 else calibrate_query_timeout(sparql)

    # load model
    last_res = find_last_result()
    if not last_res:
        logger.error('cannot find fully trained model in %s', resdir)
        sys.exit(1)
    result_patterns, coverage_counts, gtp_scores = load_results(last_res)
    gps = [gp for gp, _ in result_patterns]
    gps = cluster_gps_to_reduce_queries(gps, max_queries, gtp_scores,
                                        clustering_variant)

    processed = 0
    start = time.time()
    batch_size = config.BATCH_SIZE if batch_predict else 1
    # main loop
    for lines in chunker(sys.stdin, batch_size):
        batch = []
        for line in lines:
            line = line.strip()
            if not line:
                continue
            if drop_bad_uris:
                # noinspection PyBroadException
                try:
                    source = from_n3(line)
                    utils.curify(source)
                except Exception:
                    logger.warning('Warning: Could not curify URI %s! Skip.',
                                   line)
                    continue
            if line[0] not in '<"':
                logger.error(
                    'expected inputs to start with < or ", but got: %s', line)
                sys.exit(1)
            source = from_n3(line)
            batch.append(source)
        batch = list(OrderedDict.fromkeys(batch))

        if len(batch) == 0:
            pass
        elif len(batch) == 1:
            res = predict(sparql, timeout, gps, batch[0], fusion_methods,
                          max_results, max_target_candidates_per_gp)
            print(json.dumps(res))
            logger.info('Predicted %d target candidates for %s',
                        res['orig_result_length'], res['source'])
        else:
            res = multi_predict(sparql, timeout, gps, batch, fusion_methods,
                                max_results, max_target_candidates_per_gp)
            for r in res:
                print(json.dumps(r))
            logger.info('\n'.join([
                'Predicted %d target candidates for %s' %
                (r['orig_result_length'], r['source']) for r in res
            ]))

        processed += len(batch)
        logger.info('Have processed %d URIs now. Took %s sec', processed,
                    time.time() - start)