Exemplo n.º 1
0
    def test_query_ok(self):
        from webui.cnmain.utils import get_virtuoso

        graph_pref = settings.TRIPLE_DATABASE['PREFIXES']['data_graph_mapped']
        get_virtuoso('master').ingest(self._get_test_file(
            'boardgamegeek-games-mapped.nt', 'scheduler'),
                                      graph=graph_pref + 'test_graph')

        response = self._test_query("""
def nodes() {
    return g.V('type', 'sd:BoardGame').id.collect{it}
}

def slice(nodes_id) {
    m = []
    nodes_id.each{ node_id ->
        g.v(node_id).transform{ node ->
            data = [acheneID: node['sd:acheneID']]
            data.provenance = node.out('bristle').out('source').name.collect{
                it}.join(',')

            return data
        }.fill(m)
    }
    return m
}
        """)

        self.assertEqual(response.status_code, 200)
Exemplo n.º 2
0
    def test_query_ok(self):
        from webui.cnmain.utils import get_virtuoso

        graph_pref = settings.TRIPLE_DATABASE['PREFIXES']['data_graph_mapped']
        get_virtuoso('master').ingest(
            self._get_test_file('boardgamegeek-games-mapped.nt', 'scheduler'),
            graph=graph_pref + 'test_graph'
        )

        response = self._test_query("""
def nodes() {
    return g.V('type', 'sd:BoardGame').id.collect{it}
}

def slice(nodes_id) {
    m = []
    nodes_id.each{ node_id ->
        g.v(node_id).transform{ node ->
            data = [acheneID: node['sd:acheneID']]
            data.provenance = node.out('bristle').out('source').name.collect{
                it}.join(',')

            return data
        }.fill(m)
    }
    return m
}
        """)

        self.assertEqual(response.status_code, 200)
Exemplo n.º 3
0
    def _clear_graphs():
        """Call ``clear_regex`` with the configured 'graph' prefix on both the
        'default' and the 'master' virtuoso instances, in that order.
        """
        from webui.cnmain.utils import get_virtuoso

        for instance_name in ('default', 'master'):
            virtuoso = get_virtuoso(instance_name)
            virtuoso.clear_regex(settings.TRIPLE_DATABASE['PREFIXES']['graph'])
Exemplo n.º 4
0
    def _clear_graphs():
        """Run ``clear_regex`` on the 'default' and then the 'master' virtuoso
        instance, using the 'graph' prefix from the triple-database settings.
        """
        from webui.cnmain.utils import get_virtuoso

        for router in ('default', 'master'):
            instance = get_virtuoso(router)
            instance.clear_regex(settings.TRIPLE_DATABASE['PREFIXES']['graph'])
Exemplo n.º 5
0
    def test_source_with_refine_rdf_rule(self):
        """Process the source, attach a refine rule, process it again and
        check the ``name`` property described for mapped rows "0" and "1".
        """
        source = Source.objects.get(name='BoardGameTournament (test)')
        process_source.delay(source)

        rule_path = self._get_test_file(
            "boardgametournament_refine_rules.json", "cnmain"
        )
        with open(rule_path) as rule_file:
            rule_text = rule_file.read()

        games = source.datasets.get(name="boardgametournament-games")
        item = games.archive_items.get()
        item.rule = RuleFactory(rule=rule_text, hash=item.file_hash)
        item.save(force_update=True)

        # Second run, this time with the rule attached to the archive item.
        process_source.delay(source)

        from webui.cnmain.utils import get_virtuoso
        virtuoso = get_virtuoso()

        name_property = "http://ontologies.venturi.eu/v1#name"
        for row_key, game_name in (("0", "Dominion"), ("1", "Carcassonne")):
            row_id = item.datagraph_mapped_row_id(row_key)
            self._assert_description(
                virtuoso, row_id, [(name_property, game_name)]
            )
Exemplo n.º 6
0
    def test_source_scraperwiki(self):
        """Process the 'trentinocultura' source from a clean slate, then check
        its single archive item and the source/dataset metadata descriptions.
        """
        Scheduler.objects.all().delete()
        ArchiveItem.objects.all().delete()
        source = Source.objects.get(name='trentinocultura')
        process_source.delay(source)

        dataset = source.datasets.get()
        archive_item = source.datasets.get().archive_items.get()
        expected_columns = (
            u'category', u'city', u'title', u'url', u'price',
            u'hours', u'website', u'phone', u'location', u'address', u'date',
            u'notes', u'email', u'organizer', u'other_info', u'fax',
        )
        self._assert_archive_item(archive_item, expected_columns, 49)

        from webui.cnmain.utils import get_virtuoso
        virtuoso = get_virtuoso()
        source_meta_id = source.metagraph_resource_id
        dataset_meta_id = dataset.metagraph_resource_id

        from rdflib import Namespace
        prefixes = settings.TRIPLE_DATABASE['PREFIXES']
        meta_ns = Namespace(prefixes['meta'])
        sdowl_ns = Namespace(prefixes['sdowl'])
        rdf_type = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'

        source_description = [
            (meta_ns['description'], source.description),
            (rdf_type, sdowl_ns['Source'], 'iri'),
        ]
        self._assert_description(virtuoso, source_meta_id, source_description)

        dataset_description = [
            (meta_ns['download'], dataset.download),
            (rdf_type, sdowl_ns['Dataset'], 'iri'),
            (sdowl_ns['belongs_to_source'], source_meta_id, 'iri'),
        ]
        self._assert_description(
            virtuoso, dataset_meta_id, dataset_description
        )
Exemplo n.º 7
0
    def test_query_ok(self):
        """Ingest the board-game fixture and exercise both slicing helpers.

        ``get_cleaned_sliced_data`` must yield a list header followed by at
        least one more item; ``get_sliced_data`` must yield dicts.
        """
        prefixes = settings.TRIPLE_DATABASE['PREFIXES']
        mapped_prefix = prefixes['data_graph_mapped']

        master = get_virtuoso('master')
        fixture = self._get_test_file(
            'boardgamegeek-games-mapped.nt', 'scheduler'
        )
        master.ingest(fixture, graph=mapped_prefix + 'test_graph')

        cleaned = get_cleaned_sliced_data(
            query=THE_QUERY,
            fields='acheneID,provenance',
            with_header=True
        )
        header = next(cleaned)
        self.assertIsInstance(header, list)
        remaining = list(cleaned)
        self.assertGreater(len(remaining), 0)

        raw_rows = list(get_sliced_data(
            query=THE_QUERY,
            fields='acheneID,provenance',
            with_header=False
        ))
        self.assertIsInstance(raw_rows[0], dict)
Exemplo n.º 8
0
 def handle(self, *args, **options):
     """
     entry point
     """
     for instance in ('default', 'master'):
         print "Installing on virtuoso", instance
         virtuoso = get_virtuoso(instance)
         virtuoso.install_extensions()
         print
Exemplo n.º 9
0
def main(args):
    """ the body of the script
    """
    tmpdir = mkdtemp()

    if args.file.startswith(('http://', 'https://')):
        print "The file is in the net, downloading it..."
        file_basename = os.path.basename(
            urllib2.urlparse.urlsplit(args.file).path
        )

        result = envoy.run('wget "{}" -O {}'.format(args.file, file_basename))

        if result.status_code:
            print_error_result(
                result, "Error while downloading RDF data {}. Aborting".format(
                    args.file
                )
            )
            exit(1)

        filename = os.path.join(
            tmpdir, file_basename
        )
    else:
        print "The file is  local, moving it..."
        shutil.copy(args.file, tmpdir)
        filename = os.path.join(tmpdir, os.path.basename(args.file))

    print "handling file", filename

    filename_cropped, extension = os.path.splitext(filename)
    if extension == '.bz2':
        print "Got a bz2 file, need to convert it with gzip"
        gzip_filename = filename_cropped + '.gz'

        result = envoy.run('bunzip2 "{}" -c | gzip > "{}'.format(
            filename, gzip_filename
        ))

        if result.status_code:
            print_error_result(result, "Error while converting file, aborting")
            exit(2)

        filename = gzip_filename
        print "File converted successfully, now handling", filename

    print "Ingesting file in virtuoso"
    virtuoso = get_virtuoso()
    virtuoso.clear(args.graph)
    print "Ingestion completed", virtuoso.ingest(filename, graph=args.graph)
Exemplo n.º 10
0
    def test_query_ok(self):
        """Load the fixture graph on 'master', then verify the output shape of
        ``get_cleaned_sliced_data`` (list header plus at least one more item)
        and of ``get_sliced_data`` (dict items).
        """
        prefix_map = settings.TRIPLE_DATABASE['PREFIXES']
        graph_name = prefix_map['data_graph_mapped']

        virtuoso_master = get_virtuoso('master')
        nt_file = self._get_test_file(
            'boardgamegeek-games-mapped.nt', 'scheduler'
        )
        virtuoso_master.ingest(nt_file, graph=graph_name + 'test_graph')

        cleaned_iter = get_cleaned_sliced_data(
            query=THE_QUERY, fields='acheneID,provenance', with_header=True
        )
        first_item = next(cleaned_iter)
        self.assertIsInstance(first_item, list)
        rest = list(cleaned_iter)
        self.assertGreater(len(rest), 0)

        sliced_iter = get_sliced_data(
            query=THE_QUERY, fields='acheneID,provenance', with_header=False
        )
        rows = list(sliced_iter)
        self.assertIsInstance(rows[0], dict)
Exemplo n.º 11
0
    def handle(self, *args, **options):
        """
        Resets the virtuoso graph for this project.
        """
        from webui.cnmain.utils import get_virtuoso

        got_graph_settings = self.get_graph_settings(*args, **options)
        if not got_graph_settings:
            raise CommandError("The --router option is mandatory")
            return

        virtuoso = get_virtuoso(self.router)
        cleared = virtuoso.clear_regex(r'.*')

        print "Cleared {} graphs".format(cleared)
Exemplo n.º 12
0
    def handle(self, *args, **options):
        """
        Resets the virtuoso graph for this project.
        """
        from webui.cnmain.utils import get_virtuoso

        got_graph_settings = self.get_graph_settings(*args, **options)
        if not got_graph_settings:
            raise CommandError("The --router option is mandatory")
            return

        virtuoso = get_virtuoso(self.router)
        cleared = virtuoso.clear_regex(r'.*')

        print "Cleared {} graphs".format(cleared)
Exemplo n.º 13
0
def refresh_sources(source_id=None):
    """Write source metadata and data to a .trig file and ingest it.

    With a truthy ``source_id`` only that source is exported, to
    ``source-<id>.trig``; otherwise every source goes to ``source-all.trig``.
    Metadata of sources, datasets and archive items goes to the meta graph;
    each archive item's data goes to its own raw data graph. Those data
    graphs are cleared in virtuoso before the file is ingested.

    Returns the number of quads written to the file.
    """
    if source_id:
        sources = [Source.objects.get(pk=source_id)]
    else:
        sources = Source.objects.all()

    if source_id:
        filename = 'source-{}.trig'.format(source_id)
    else:
        filename = 'source-{}.trig'.format('all')

    def write_all(graph, quads):
        """Add every quad to ``graph``; return how many were added."""
        written = 0
        for quad in quads:
            graph.add_triple(quad)
            written += 1
        return written

    n_triples = 0
    clear_graphs = []
    with closing(TrigFile(filename)) as trig:
        meta_graph = trig.add_graph(PREFIXES['meta_graph'])
        for source in sources:
            # source metadata
            n_triples += write_all(meta_graph, source_meta_quads(source))

            for dataset in source.datasets.all():
                # dataset metadata
                n_triples += write_all(
                    meta_graph, dataset_meta_quads(dataset)
                )

                for archive_item in dataset.archive_items.all():
                    # archive item metadata
                    n_triples += write_all(
                        meta_graph, archive_item_meta_quads(archive_item)
                    )
                    # archive item data, in a graph of its own
                    data_graph = trig.add_graph(
                        archive_item.datagraph_raw_name
                    )
                    clear_graphs.append(data_graph.name)
                    n_triples += write_all(
                        data_graph, archive_item_data_quads(archive_item)
                    )

    from webui.cnmain.utils import get_virtuoso
    virtuoso = get_virtuoso()
    logger.debug('ingesting {} into virtuoso'.format(filename))
    virtuoso.clear(clear_graphs)
    virtuoso.ingest(filename)

    return n_triples
Exemplo n.º 14
0
def main(args):
    """ the body of the script
    """
    tmpdir = mkdtemp()

    if args.file.startswith(('http://', 'https://')):
        print "The file is in the net, downloading it..."
        file_basename = os.path.basename(
            urllib2.urlparse.urlsplit(args.file).path)

        result = envoy.run('wget "{}" -O {}'.format(args.file, file_basename))

        if result.status_code:
            print_error_result(
                result, "Error while downloading RDF data {}. Aborting".format(
                    args.file))
            exit(1)

        filename = os.path.join(tmpdir, file_basename)
    else:
        print "The file is  local, moving it..."
        shutil.copy(args.file, tmpdir)
        filename = os.path.join(tmpdir, os.path.basename(args.file))

    print "handling file", filename

    filename_cropped, extension = os.path.splitext(filename)
    if extension == '.bz2':
        print "Got a bz2 file, need to convert it with gzip"
        gzip_filename = filename_cropped + '.gz'

        result = envoy.run('bunzip2 "{}" -c | gzip > "{}'.format(
            filename, gzip_filename))

        if result.status_code:
            print_error_result(result, "Error while converting file, aborting")
            exit(2)

        filename = gzip_filename
        print "File converted successfully, now handling", filename

    print "Ingesting file in virtuoso"
    virtuoso = get_virtuoso()
    virtuoso.clear(args.graph)
    print "Ingestion completed", virtuoso.ingest(filename, graph=args.graph)
Exemplo n.º 15
0
def refresh_sources(source_id=None):
    """Dump sources to a .trig file, ingest it into virtuoso, count quads.

    A truthy ``source_id`` restricts the export to that source (file
    ``source-<id>.trig``); otherwise all sources are exported (file
    ``source-all.trig``). The raw data graph of each exported archive item
    is cleared in virtuoso before the file is ingested.

    Returns the total number of quads added to the file.
    """
    if source_id:
        sources = [Source.objects.get(pk=source_id)]
    else:
        sources = Source.objects.all()

    file_label = source_id if source_id else 'all'
    filename = 'source-{}.trig'.format(file_label)

    n_triples = 0
    graphs_to_clear = []
    with closing(TrigFile(filename)) as trig_file:
        meta = trig_file.add_graph(PREFIXES['meta_graph'])
        for src in sources:
            # metadata describing the source itself
            for source_quad in source_meta_quads(src):
                meta.add_triple(source_quad)
                n_triples += 1

            for ds in src.datasets.all():
                # metadata describing the dataset
                for dataset_quad in dataset_meta_quads(ds):
                    meta.add_triple(dataset_quad)
                    n_triples += 1

                for item in ds.archive_items.all():
                    # metadata describing the archive item
                    for item_quad in archive_item_meta_quads(item):
                        meta.add_triple(item_quad)
                        n_triples += 1

                    # the archive item's data, in a graph of its own
                    raw_graph = trig_file.add_graph(item.datagraph_raw_name)
                    graphs_to_clear.append(raw_graph.name)
                    for data_quad in archive_item_data_quads(item):
                        raw_graph.add_triple(data_quad)
                        n_triples += 1

    from webui.cnmain.utils import get_virtuoso
    virtuoso = get_virtuoso()
    logger.debug('ingesting {} into virtuoso'.format(filename))
    virtuoso.clear(graphs_to_clear)
    virtuoso.ingest(filename)

    return n_triples
Exemplo n.º 16
0
    def test_source_archive(self):
        """Process the 'in-giro (locale)' source from a clean slate, then
        check its two archive items and the source/dataset metadata
        descriptions.
        """
        Scheduler.objects.all().delete()
        ArchiveItem.objects.all().delete()
        source = Source.objects.get(name='in-giro (locale)')
        dataset = source.datasets.get()
        process_source.delay(source)

        # Exactly two archive items are expected, ordered by file hash.
        items_by_hash = dataset.archive_items.all().order_by("file_hash")
        events_item, poi_event = items_by_hash

        poi_columns = (
            u'website', u'city', u'name', u'url', u'phone', u'address',
            u'location_type', u'description', u'province',
        )
        self._assert_archive_item(poi_event, poi_columns, 158)

        event_columns = (
            u'city', u'description', u'url', u'date', u'location',
            u'genre', u'location_url',
        )
        self._assert_archive_item(events_item, event_columns, 497)

        from webui.cnmain.utils import get_virtuoso
        virtuoso = get_virtuoso()
        source_meta_id = source.metagraph_resource_id
        dataset_meta_id = dataset.metagraph_resource_id

        from rdflib import Namespace
        prefixes = settings.TRIPLE_DATABASE['PREFIXES']
        meta_ns = Namespace(prefixes['meta'])
        sdowl_ns = Namespace(prefixes['sdowl'])
        rdf_type = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'

        source_description = [
            (meta_ns['description'], source.description),
            (rdf_type, sdowl_ns['Source'], 'iri'),
        ]
        self._assert_description(virtuoso, source_meta_id, source_description)

        dataset_description = [
            (meta_ns['download'], dataset.download),
            (rdf_type, sdowl_ns['Dataset'], 'iri'),
            (sdowl_ns['belongs_to_source'], source_meta_id, 'iri'),
        ]
        self._assert_description(
            virtuoso, dataset_meta_id, dataset_description
        )
Exemplo n.º 17
0
def __aggregator_process_archiveitem(aggregator_archive_item, scheduler,
                                     tmpdir, context):
    """Run the aggregator workflow for a single archive item.

    Steps, in order:

    1. if the aggregator has a silk rule, render the SILK configuration
       into ``tmpdir`` and run SILK, relaying its stderr to the logger;
    2. dump the archive item's mapped graph from the default virtuoso;
    3. load that dump into the 'master' virtuoso;
    4. if the aggregator has a silk rule, ingest the SILK output file into
       the configured ``silk_graph``;
    5. record the workflow success timestamps on ``aggregator_archive_item``.

    ``scheduler.status`` is set to INCOMPLETE when there is no silk rule and
    to FAIL when SILK reports a problem. ``context`` is used as the base of
    the template context for the SILK configuration.

    Returns None. The function returns early, skipping steps 2-5, when SILK
    printed an "Exception in thread" line.

    Raises Exception when the graph dump or the graph load reports an error.
    """
    import envoy
    from django.template.loader import render_to_string
    from webui.cnmain.utils import get_virtuoso

    virtuoso_simple = get_virtuoso()
    virtuoso_master = get_virtuoso('master')
    loggy = local.logger

    aggregator = aggregator_archive_item.aggregator
    archive_item = aggregator_archive_item.archiveitem

    #
    # PART 1: generate XML file
    #

    loggy.debug("Processing " + unicode(archive_item))

    # Stays None when there is no silk rule; PART 5 is skipped in that case.
    output_filename = None
    if not aggregator.silk_rule:
        loggy.warning('No silk rule found, skipping')
        scheduler.status = Scheduler.INCOMPLETE
    else:
        output_filename = os.path.join(tmpdir, archive_item.file_hash + '.nt')
        conf_filename = os.path.join(tmpdir,
                                     archive_item.file_hash + '_conf.xml')

        silk_conf_xml = render_to_string(
            'controller/aggregator/silk_rules.xml',
            dict(context,
                 archive_item=archive_item,
                 output_filename=output_filename))

        with open(conf_filename, 'w') as fconf:
            fconf.write(silk_conf_xml)

        #
        # PART 2: execute SILK
        #
        loggy.info("Executing SILK on %s", unicode(archive_item))
        result = envoy.connect(
            'java -Xmx{} -DconfigFile={} -Dthreads={} '
            '-cp "{}:{}/*" de.fuberlin.wiwiss.silk.Silk'.format(
                settings.SILK_SINGLE_MACHINE_HEAP,
                conf_filename,
                settings.SILK_SINGLE_MACHINE_THREADS,
                SILK_JAR_PATH,
                SILK_LIB_PATH,
            ))

        # `level` is sticky: it keeps the last level seen, so continuation
        # lines without a "LEVEL:" prefix are treated like the line before.
        level = None
        # 0 = clean, 1 = a LEVEL_OUT line was seen, 2 = an exception was seen
        status = 0
        titan_log_cnt = 0
        # pylint: disable=W0212
        # Relay stderr line by line for as long as the process is running.
        # NOTE(review): the loop stops as soon as poll() reports an exit, so
        # stderr lines still buffered at that point are never read.
        while result._process.poll() is None:
            # '%' is doubled, presumably so the line is safe to use as a
            # logging format string - confirm against the logger in use.
            line = result._process.stderr.readline()\
                         .strip().replace('%', '%%')

            if not line:
                continue

            tmplevel = line.split(":", 1)[0]
            if tmplevel in LEVEL_LIST:
                level = tmplevel
            if line.startswith("Exception in thread"):
                level = "EXCEPTION"

            if level == "EXCEPTION":
                status = 2
                loggy.error("S> " + line)
            elif level in LEVEL_OUT:
                status = 1
                loggy.warn("S> " + line)
            elif re.search(r"Finished writing \d+ entities", line) or \
                    re.search(r"Got \d+ vertices", line) or \
                    re.search(r"Wrote \d+ links", line):
                loggy.info("S> " + line)
            elif re.search(r"Getting data for vertices", line):
                # Only every 200th of these lines is logged.
                if titan_log_cnt % 200 == 0:
                    loggy.info("S> " + line)
                titan_log_cnt += 1
            # pylint: enable=W0212

        if status:
            loggy.error("SILK failed on %s", unicode(archive_item))
            scheduler.status = Scheduler.FAIL
            # Only an exception (status 2) aborts the workflow; with status 1
            # the dump/load steps below still run.
            if status == 2:
                return
        else:
            loggy.info("SILK executed successfully")
            # loggy.debug("Generated file: %s", output_filename)

    #
    # PART 3: dump graph data
    #
    dump_dir = '{}/'.format(archive_item.file_hash)
    loggy.info("Creating a dump of the namedgraph {}".format(
        archive_item.datagraph_mapped_name))

    error = virtuoso_simple.dump_graph(archive_item.datagraph_mapped_name,
                                       dump_dir,
                                       create_dir=True)

    if error:
        loggy.error("Dump failed:")
        for line in error:
            loggy.error(line)
        raise Exception("Dump of the namedgraph failed: {}".format(error))

    #
    # PART 4: load graph data in the master virtuoso instance
    #
    # we are assuming that the two virtuoso are on the same machine
    loggy.info("Loading dump in the master graph as {}".format(
        archive_item.datagraph_mapped_name))

    # clear the entire named database before ingesting the data
    # since we're on titan we don't want this anymore
    # virtuoso_master.clear(archive_item.datagraph_mapped_name)

    # loggy.warning("Leaving data dump available for testing purposes")
    # error = virtuoso_master.load_graphs(dump_dir, remove_dir=False)
    error = virtuoso_master.load_graphs(dump_dir, remove_dir=True)

    if error:
        loggy.error("Load failed:")
        # The error may be a single string or an iterable of lines.
        if isinstance(error, basestring):
            loggy.error(error)
        else:
            for line in error:
                loggy.error(line)
        raise Exception("Load of the namedgraph failed: {}".format(error))

    if aggregator.silk_rule:
        #
        # PART 5: load SILK generated tuples
        #
        loggy.info("Loading SILK generated tuples")
        virtuoso_master.ingest(
            output_filename,
            settings.TRIPLE_DATABASE['PREFIXES']['silk_graph'],
        )

    # Record the success; the first-success timestamp is only set once.
    now = timezone.now()
    aggregator_archive_item.last_workflow_success = now
    if aggregator_archive_item.first_workflow_success is None:
        aggregator_archive_item.first_workflow_success = now
    aggregator_archive_item.save()
Exemplo n.º 18
0
    def get_context_data(self, **kwargs):
        """Build the template context with statistics on the mapped graph.

        A set of SPARQL queries is run against the archive item's mapped
        data graph. Each query's rows (``fetchall()``) are stored in the
        context under the query's key: ``no_type``, ``poi_no_achene``,
        ``poi_no_category``, ``poi_old_style_category``,
        ``poi_latlon_and_geom``, ``poi_without_any_geometry``,
        ``poi_point_without_extra_info``, ``poi_complex_without_extra_info``,
        ``poi_no_label``, ``poi_no_name``, ``poi_no_isinnuts`` and
        ``poi_isinnuts_type``.

        The context also carries ``object`` (the archive item) and an empty
        ``archiveitems`` list.
        """
        from webui.cnmain.utils import get_virtuoso

        # pylint: disable=W0201
        self.object = archive_item = self.get_object()

        context = super(ArchiveItemMappedStatsView, self).get_context_data(
            **kwargs
        )
        context['archiveitems'] = []
        context['object'] = archive_item

        graph = archive_item.datagraph_mapped_name
        queries = []

        virtuoso = get_virtuoso()
        queries.append(('no_type', """
            SELECT (count(distinct ?resource) as ?count)
            WHERE
            {
                GRAPH <%s> {
                    ?resource ?b ?c .
                    OPTIONAL { ?resource rdf:type ?d . } .
                    FILTER (!BOUND(?d)) .
                }
            }
        """ % graph))

        queries.append(('poi_no_achene', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>

            SELECT (count(distinct ?resource) as ?count)
            WHERE
            {
                GRAPH <%s> {
                    ?resource rdf:type sd:POI .
                    OPTIONAL { ?resource sd:acheneID ?achene . } .
                    FILTER (!BOUND(?achene)) .
                }
            }
        """ % graph))

        # This query used to be appended twice with the same key. The second
        # copy only re-ran the identical query and overwrote the first
        # result, so it is registered once.
        queries.append(('poi_no_category', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>

            SELECT (count(distinct ?resource) as ?count)
            WHERE
            {
                GRAPH <%s> {
                    ?resource rdf:type sd:POI .
                    OPTIONAL { ?resource sd:category ?cat . } .
                    FILTER (!BOUND(?cat)) .
                }
            }
        """ % graph))

        queries.append(('poi_old_style_category', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>

            SELECT (count(distinct ?resource) as ?count)
            WHERE
            {
                GRAPH <%s> {
                    ?resource sd:category ?cat .
                    FILTER (0 = regex(?cat, "%s[0-9a-f]{40}")) .
                }
            }
        """ % (graph, settings.TRIPLE_DATABASE['PREFIXES']['sdres'])))

        queries.append(('poi_latlon_and_geom', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>

            SELECT (count(distinct ?resource) as ?count)
            WHERE
            {
                GRAPH <%s> {
                    { ?resource sd:latitude ?b }
                    UNION
                    { ?resource sd:longitude ?b }
                    UNION
                    { ?resource sd:geometry ?b }
                }
            }
        """ % graph))

        queries.append(('poi_without_any_geometry', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>

            SELECT (count(distinct ?resource) as ?count)
            WHERE
            {
                GRAPH <%s> {
                    ?resource a sd:POI .
                    OPTIONAL {?resource sd:geomPoint ?g1} .
                    OPTIONAL {?resource sd:geomComplex ?g2} .
                    FILTER (!BOUND(?g1))
                    FILTER (!BOUND(?g2))
                }
            }
        """ % graph))

        queries.append(('poi_point_without_extra_info', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>

            SELECT (count(distinct ?resource) as ?count)
            WHERE
            {
                GRAPH <%s> {
                    ?resource sd:geomPoint ?b .
                    OPTIONAL {?resource sd:geomPointProvenance ?prov} .
                    OPTIONAL {?resource sd:geomPointAccuracy ?acc} .
                    FILTER (!BOUND(?prov))
                    FILTER (!BOUND(?acc))
                }
            }
        """ % graph))

        queries.append(('poi_complex_without_extra_info', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>

            SELECT (count(distinct ?resource) as ?count)
            WHERE
            {
                GRAPH <%s> {
                    ?resource sd:geomComplex ?b .
                    OPTIONAL {?resource sd:geomComplexProvenance ?prov} .
                    OPTIONAL {?resource sd:geomComplexAccuracy ?acc} .
                    FILTER (!BOUND(?prov))
                    FILTER (!BOUND(?acc))
                }
            }
        """ % graph))

        # NOTE(review): unlike the other queries that use sd:, this one does
        # not declare the sd: prefix - confirm the server predefines it.
        queries.append(('poi_no_label', """
            SELECT (count(distinct ?resource) as ?count)
            WHERE
            {
                GRAPH <%s> {
                    ?resource rdf:type sd:POI .
                    OPTIONAL { ?resource rdfs:label ?label . } .
                    FILTER (!BOUND(?label)) .
                }
            }
        """ % graph))

        queries.append(('poi_no_name', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>

            SELECT (count(distinct ?resource) as ?count)
            WHERE
            {
                GRAPH <%s> {
                    ?resource rdf:type sd:POI .
                    OPTIONAL { ?resource sd:name ?name . } .
                    FILTER (!BOUND(?name)) .
                }
            }
        """ % graph))

        queries.append(('poi_no_isinnuts', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>

            SELECT (count(distinct ?resource) as ?count)
            WHERE
            {
                GRAPH <%s> {
                    ?resource rdf:type sd:POI .
                    OPTIONAL { ?resource sd:isInNUTS ?nuts . } .
                    FILTER (!BOUND(?nuts)) .
                }
            }
        """ % graph))

        queries.append(('poi_isinnuts_type', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>

            SELECT ?nutsType (count(distinct ?resource) AS ?cnt) WHERE {
                GRAPH <%s> { ?resource a sd:POI } .
                ?resource sd:isInNUTS ?nuts .
                OPTIONAL {?nuts a ?nutsType}
            } GROUP BY ?nutsType
        """ % graph))

        # Run every query and expose its rows in the context under its key.
        results = {
            key: virtuoso.client_query(query).fetchall()
            for key, query in queries
        }

        context.update(results)

        return context
Exemplo n.º 19
0
def __aggregator_process_archiveitem(
        aggregator_archive_item, scheduler, tmpdir, context):
    """Run the aggregation workflow for one aggregator/archive-item pair.

    The workflow performs, in order:

    1. render the SILK configuration XML into ``tmpdir``;
    2. execute SILK as a subprocess, relaying its stderr to the logger;
    3. dump the archive item's mapped named graph using the virtuoso
       instance returned by ``get_virtuoso()``;
    4. load that dump through the ``'master'`` virtuoso instance;
    5. ingest the file written by SILK into the configured silk graph.

    Steps 1, 2 and 5 only run when the aggregator has a silk rule;
    otherwise ``scheduler.status`` is set to ``Scheduler.INCOMPLETE`` and
    the workflow continues with the dump and load steps.

    :param aggregator_archive_item: object exposing ``aggregator`` and
        ``archiveitem``; its ``first_workflow_success`` and
        ``last_workflow_success`` fields are updated and saved at the end.
    :param scheduler: object whose ``status`` attribute is set to
        ``Scheduler.INCOMPLETE`` or ``Scheduler.FAIL`` when appropriate.
    :param tmpdir: directory where the SILK configuration and output
        files are written.
    :param context: base template context, passed to ``dict()`` together
        with ``archive_item`` and ``output_filename``.
    :returns: ``None``. Returns early, before the dump, when the SILK
        output contained an exception.
    :raises Exception: when the graph dump or the graph load reports an
        error.
    """
    import envoy
    from django.template.loader import render_to_string
    from webui.cnmain.utils import get_virtuoso

    virtuoso_simple = get_virtuoso()
    virtuoso_master = get_virtuoso('master')
    loggy = local.logger

    aggregator = aggregator_archive_item.aggregator
    archive_item = aggregator_archive_item.archiveitem

    #
    # PART 1: generate XML file
    #

    loggy.debug("Processing " + unicode(archive_item))

    output_filename = None
    if not aggregator.silk_rule:
        loggy.warning('No silk rule found, skipping')
        scheduler.status = Scheduler.INCOMPLETE
    else:
        output_filename = os.path.join(
            tmpdir, archive_item.file_hash + '.nt'
        )
        conf_filename = os.path.join(
            tmpdir, archive_item.file_hash + '_conf.xml'
        )

        silk_conf_xml = render_to_string(
            'controller/aggregator/silk_rules.xml',
            dict(context, archive_item=archive_item,
                 output_filename=output_filename)
        )

        with open(conf_filename, 'w') as fconf:
            fconf.write(silk_conf_xml)

        #
        # PART 2: execute SILK
        #
        loggy.info("Executing SILK on %s", unicode(archive_item))
        result = envoy.connect(
            'java -Xmx{} -DconfigFile={} -Dthreads={} '
            '-cp "{}:{}/*" de.fuberlin.wiwiss.silk.Silk'.format(
                settings.SILK_SINGLE_MACHINE_HEAP,
                conf_filename,
                settings.SILK_SINGLE_MACHINE_THREADS,
                SILK_JAR_PATH,
                SILK_LIB_PATH,
            )
        )

        # `level` deliberately persists across iterations: a line without
        # a recognised "LEVEL:" prefix is treated as a continuation of the
        # previous one (e.g. the frames of a stack trace).
        level = None
        status = 0
        titan_log_cnt = 0
        # pylint: disable=W0212
        process = result._process
        # Read stderr until EOF instead of stopping as soon as poll()
        # reports that the process has exited: lines still buffered in the
        # pipe at exit time (typically the final exception) would otherwise
        # be dropped and a failed run could be reported as successful.
        while True:
            raw_line = process.stderr.readline()
            if not raw_line:
                # readline() returns an empty string only at EOF
                break

            # NOTE(review): '%' is doubled presumably because the logger
            # %-formats its messages -- confirm against local.logger.
            line = raw_line.strip().replace('%', '%%')

            if not line:
                continue

            tmplevel = line.split(":", 1)[0]
            if tmplevel in LEVEL_LIST:
                level = tmplevel
            if line.startswith("Exception in thread"):
                level = "EXCEPTION"

            if level == "EXCEPTION":
                status = 2
                loggy.error("S> " + line)
            elif level in LEVEL_OUT:
                status = 1
                loggy.warn("S> " + line)
            elif re.search(r"Finished writing \d+ entities", line) or \
                    re.search(r"Got \d+ vertices", line) or \
                    re.search(r"Wrote \d+ links", line):
                loggy.info("S> " + line)
            elif re.search(r"Getting data for vertices", line):
                # very chatty message: only relay one line out of 200
                if titan_log_cnt % 200 == 0:
                    loggy.info("S> " + line)
                titan_log_cnt += 1

        # As before, do not go on until the SILK process has terminated.
        process.wait()
        # pylint: enable=W0212

        if status:
            loggy.error("SILK failed on %s", unicode(archive_item))
            scheduler.status = Scheduler.FAIL
            if status == 2:
                # an exception was logged by SILK: abort the workflow
                return
        else:
            loggy.info("SILK executed successfully")
            # loggy.debug("Generated file: %s", output_filename)

    #
    # PART 3: dump graph data
    #
    dump_dir = '{}/'.format(archive_item.file_hash)
    loggy.info("Creating a dump of the namedgraph {}".format(
        archive_item.datagraph_mapped_name))

    error = virtuoso_simple.dump_graph(
        archive_item.datagraph_mapped_name, dump_dir, create_dir=True)

    if error:
        loggy.error("Dump failed:")
        for line in error:
            loggy.error(line)
        raise Exception("Dump of the namedgraph failed: {}".format(
            error
        ))

    #
    # PART 4: load graph data in the master virtuoso instance
    #
    # we are assuming that the two virtuoso are on the same machine
    loggy.info("Loading dump in the master graph as {}".format(
        archive_item.datagraph_mapped_name))

    # clear the entire named database before ingesting the data
    # since we're on titan we don't want this anymore
    # virtuoso_master.clear(archive_item.datagraph_mapped_name)

    # loggy.warning("Leaving data dump available for testing purposes")
    # error = virtuoso_master.load_graphs(dump_dir, remove_dir=False)
    error = virtuoso_master.load_graphs(dump_dir, remove_dir=True)

    if error:
        loggy.error("Load failed:")
        # the error may be either a single message or a sequence of lines
        if isinstance(error, basestring):
            loggy.error(error)
        else:
            for line in error:
                loggy.error(line)
        raise Exception("Load of the namedgraph failed: {}".format(
            error
        ))

    if aggregator.silk_rule:
        #
        # PART 5: load SILK generated tuples
        #
        loggy.info("Loading SILK generated tuples")
        virtuoso_master.ingest(
            output_filename,
            settings.TRIPLE_DATABASE['PREFIXES']['silk_graph'],
        )

    # record the successful run; the first success is only stamped once
    now = timezone.now()
    aggregator_archive_item.last_workflow_success = now
    if aggregator_archive_item.first_workflow_success is None:
        aggregator_archive_item.first_workflow_success = now
    aggregator_archive_item.save()
Exemplo n.º 20
0
    def get_context_data(self, **kwargs):
        """Build the template context with statistics on the mapped graph.

        A set of SPARQL queries is run against the named graph
        ``archive_item.datagraph_mapped_name`` through the virtuoso
        instance returned by ``get_virtuoso()``. The rows returned by each
        query (``fetchall()``) are stored in the context under the query
        key (``no_type``, ``poi_no_achene``, ``poi_no_category``, ...).

        :param kwargs: forwarded unchanged to the parent
            ``get_context_data``.
        :returns: the context of the parent class, extended with
            ``archiveitems`` (an empty list), ``object`` (the archive
            item) and one entry per query.
        """
        from webui.cnmain.utils import get_virtuoso

        # pylint: disable=W0201
        self.object = archive_item = self.get_object()

        context = super(ArchiveItemMappedStatsView,
                        self).get_context_data(**kwargs)
        context['archiveitems'] = []
        context['object'] = archive_item

        graph = archive_item.datagraph_mapped_name
        queries = []

        virtuoso = get_virtuoso()
        queries.append(('no_type', """
            SELECT (count(distinct ?resource) as ?count)
            WHERE
            {
                GRAPH <%s> {
                    ?resource ?b ?c .
                    OPTIONAL { ?resource rdf:type ?d . } .
                    FILTER (!BOUND(?d)) .
                }
            }
        """ % graph))

        queries.append(('poi_no_achene', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>

            SELECT (count(distinct ?resource) as ?count)
            WHERE
            {
                GRAPH <%s> {
                    ?resource rdf:type sd:POI .
                    OPTIONAL { ?resource sd:acheneID ?achene . } .
                    FILTER (!BOUND(?achene)) .
                }
            }
        """ % graph))

        # This query used to be appended twice, byte for byte: the result
        # dict kept a single entry, so the copy only cost a second round
        # trip to the database.
        queries.append(('poi_no_category', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>

            SELECT (count(distinct ?resource) as ?count)
            WHERE
            {
                GRAPH <%s> {
                    ?resource rdf:type sd:POI .
                    OPTIONAL { ?resource sd:category ?cat . } .
                    FILTER (!BOUND(?cat)) .
                }
            }
        """ % graph))

        # Categories whose value does not match "<sdres prefix><40 hex>".
        queries.append(('poi_old_style_category', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>

            SELECT (count(distinct ?resource) as ?count)
            WHERE
            {
                GRAPH <%s> {
                    ?resource sd:category ?cat .
                    FILTER (0 = regex(?cat, "%s[0-9a-f]{40}")) .
                }
            }
        """ % (graph, settings.TRIPLE_DATABASE['PREFIXES']['sdres'])))

        # NOTE(review): despite the "and" in the key, the UNION counts
        # resources having any of latitude, longitude or geometry --
        # confirm which of the two is intended.
        queries.append(('poi_latlon_and_geom', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>

            SELECT (count(distinct ?resource) as ?count)
            WHERE
            {
                GRAPH <%s> {
                    { ?resource sd:latitude ?b }
                    UNION
                    { ?resource sd:longitude ?b }
                    UNION
                    { ?resource sd:geometry ?b }
                }
            }
        """ % graph))

        queries.append(('poi_without_any_geometry', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>

            SELECT (count(distinct ?resource) as ?count)
            WHERE
            {
                GRAPH <%s> {
                    ?resource a sd:POI .
                    OPTIONAL {?resource sd:geomPoint ?g1} .
                    OPTIONAL {?resource sd:geomComplex ?g2} .
                    FILTER (!BOUND(?g1))
                    FILTER (!BOUND(?g2))
                }
            }
        """ % graph))

        queries.append(('poi_point_without_extra_info', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>

            SELECT (count(distinct ?resource) as ?count)
            WHERE
            {
                GRAPH <%s> {
                    ?resource sd:geomPoint ?b .
                    OPTIONAL {?resource sd:geomPointProvenance ?prov} .
                    OPTIONAL {?resource sd:geomPointAccuracy ?acc} .
                    FILTER (!BOUND(?prov))
                    FILTER (!BOUND(?acc))
                }
            }
        """ % graph))

        queries.append(('poi_complex_without_extra_info', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>

            SELECT (count(distinct ?resource) as ?count)
            WHERE
            {
                GRAPH <%s> {
                    ?resource sd:geomComplex ?b .
                    OPTIONAL {?resource sd:geomComplexProvenance ?prov} .
                    OPTIONAL {?resource sd:geomComplexAccuracy ?acc} .
                    FILTER (!BOUND(?prov))
                    FILTER (!BOUND(?acc))
                }
            }
        """ % graph))

        # The sd: prefix is declared explicitly here as in every other
        # query that uses it, instead of relying on whatever namespace the
        # server binds to "sd" by default.
        queries.append(('poi_no_label', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>

            SELECT (count(distinct ?resource) as ?count)
            WHERE
            {
                GRAPH <%s> {
                    ?resource rdf:type sd:POI .
                    OPTIONAL { ?resource rdfs:label ?label . } .
                    FILTER (!BOUND(?label)) .
                }
            }
        """ % graph))

        queries.append(('poi_no_name', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>

            SELECT (count(distinct ?resource) as ?count)
            WHERE
            {
                GRAPH <%s> {
                    ?resource rdf:type sd:POI .
                    OPTIONAL { ?resource sd:name ?name . } .
                    FILTER (!BOUND(?name)) .
                }
            }
        """ % graph))

        queries.append(('poi_no_isinnuts', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>

            SELECT (count(distinct ?resource) as ?count)
            WHERE
            {
                GRAPH <%s> {
                    ?resource rdf:type sd:POI .
                    OPTIONAL { ?resource sd:isInNUTS ?nuts . } .
                    FILTER (!BOUND(?nuts)) .
                }
            }
        """ % graph))

        # Only the POI membership is restricted to the mapped graph; the
        # isInNUTS link and the NUTS type are matched outside the GRAPH
        # clause.
        queries.append(('poi_isinnuts_type', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>

            SELECT ?nutsType (count(distinct ?resource) AS ?cnt) WHERE {
                GRAPH <%s> { ?resource a sd:POI } .
                ?resource sd:isInNUTS ?nuts .
                OPTIONAL {?nuts a ?nutsType}
            } GROUP BY ?nutsType
        """ % graph))

        # run every query and expose its rows under the query key
        results = {
            key: virtuoso.client_query(query).fetchall()
            for key, query in queries
        }

        context.update(results)

        return context
Exemplo n.º 21
0
 def setUpClass(cls):
     from webui.cnmain.utils import get_virtuoso
     cls.virtuoso = get_virtuoso()
     cls.virtuoso_master = get_virtuoso('master')