Exemplo n.º 1
0
    def collect(details):
        """
        For this source, one SPARQL endpoint is given for a series of vocabs which are all separate ConceptSchemes

        'ga-jena-fuseki': {
            'source': VocabSource.SPARQL,
            'sparql_endpoint': 'http://dev2.nextgen.vocabs.ga.gov.au/fuseki/vocabs',
            'sparql_username': '******', # Optional username for SPARQL endpoint
            'sparql_password': '******', # Optional password for SPARQL endpoint
            #'uri_filter_regex': '.*', # Regular expression to filter vocabulary URIs - Everything
            #'uri_filter_regex': '^http(s?)://pid.geoscience.gov.au/def/voc/ga/', # Regular expression to filter vocabulary URIs - GA
            #'uri_filter_regex': '^https://gcmdservices.gsfc.nasa.gov', # Regular expression to filter vocabulary URIs - GCMD
            'uri_filter_regex': '^http(s?)://resource.geosciml.org/', # Regular expression to filter vocabulary URIs - CGI

        },
        """
        logging.debug("SPARQL collect()...")

        # Get all the ConceptSchemes from the SPARQL endpoint
        # Interpret each CS as a Vocab
        q = """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT * WHERE {{
    {{ GRAPH ?g {{
        ?cs a skos:ConceptScheme .
        OPTIONAL {{ ?cs skos:prefLabel ?title .
            FILTER(lang(?title) = "{language}" || lang(?title) = "") }}
        OPTIONAL {{ ?cs dcterms:created ?created }}
        OPTIONAL {{ ?cs dcterms:issued ?issued }}
        OPTIONAL {{ ?cs dcterms:modified ?modified }}
        OPTIONAL {{ ?cs owl:versionInfo ?version }}
        OPTIONAL {{ ?cs skos:definition ?description .
            FILTER(lang(?description) = "{language}" || lang(?description) = "") }}
    }} }}
    UNION
    {{
        ?cs a skos:ConceptScheme .
        OPTIONAL {{ ?cs skos:prefLabel ?title .
            FILTER(lang(?title) = "{language}" || lang(?title) = "") }}
        OPTIONAL {{ ?cs dcterms:created ?created }}
        OPTIONAL {{ ?cs dcterms:issued ?issued }}
        OPTIONAL {{ ?cs dcterms:modified ?modified }}
        OPTIONAL {{ ?cs owl:versionInfo ?version }}
        OPTIONAL {{ ?cs skos:definition ?description .
            FILTER(lang(?description) = "{language}" || lang(?description) = "") }}
    }}
}} 
ORDER BY ?title""".format(language=DEFAULT_LANGUAGE)
        # record just the IDs & title for the VocPrez in-memory vocabs list
        concept_schemes = Source.sparql_query(
            details["sparql_endpoint"],
            q,
            sparql_username=details.get("sparql_username"),
            sparql_password=details.get("sparql_password"),
        )
        assert concept_schemes is not None, "Unable to query conceptSchemes"

        sparql_vocabs = {}
        for cs in concept_schemes:
            # handling CS URIs that end with '/'
            vocab_id = cs["cs"]["value"].replace("/conceptScheme",
                                                 "").split("/")[-1]

            # TODO: Investigate putting regex into SPARQL query
            # print("re.search('{}', '{}')".format(details.get('uri_filter_regex'), cs['cs']['value']))
            if details.get("uri_filter_regex") and not re.search(
                    details["uri_filter_regex"], cs["cs"]["value"]):
                logging.debug("Skipping vocabulary {}".format(vocab_id))
                continue

            if len(vocab_id) < 2:
                vocab_id = cs["cs"]["value"].split("/")[-2]

            sparql_vocabs[vocab_id] = Vocabulary(
                vocab_id,
                cs["cs"]["value"],
                cs["title"].get("value") or vocab_id if cs.get("title") else
                vocab_id,  # Need string value for sorting, not None
                cs["description"].get("value")
                if cs.get("description") is not None else None,
                None,  # none of these SPARQL vocabs have creator info yet # TODO: add creator info to GSQ vocabs
                dateutil.parser.parse(cs.get("created").get("value"))
                if cs.get("created") is not None else None,
                # dct:issued not in Vocabulary
                # dateutil.parser.parse(cs.get('issued').get('value')) if cs.get('issued') is not None else None,
                dateutil.parser.parse(cs.get("modified").get("value"))
                if cs.get("modified") is not None else None,
                cs["version"].get("value")
                if cs.get("version") is not None else None,  # versionInfo
                config.VocabSource.SPARQL,
                cs["cs"]["value"],
                sparql_endpoint=details["sparql_endpoint"],
                sparql_username=details["sparql_username"],
                sparql_password=details["sparql_password"],
            )
        g.VOCABS = {**g.VOCABS, **sparql_vocabs}
        logging.debug("SPARQL collect() complete.")
Exemplo n.º 2
0
    def get_top_concepts(self):
        """
        Return the top concepts of this vocabulary's ConceptScheme as (?tc, ?pl) pairs.

        The vocabulary is looked up in g.VOCABS by self.vocab_id. Its SPARQL
        endpoint is asked for concepts linked to the scheme through
        skos:hasTopConcept or skos:topConceptOf. When that yields no pairs, a
        broader query (skos:hasTopConcept or skos:inScheme) is run through
        self.gr.query instead.

        Rows are read positionally: index 0 as ?tc and index 1 as ?pl.
        NOTE(review): this assumes Source.sparql_query returns positionally
        indexable rows - confirm against its implementation.

        :return: list of (tc, pl) tuples with at most one entry per pl value,
            or None when the endpoint query returned None.
        """
        vocabulary = g.VOCABS[self.vocab_id]
        endpoint_query = """
            PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
            SELECT DISTINCT ?tc ?pl
            WHERE {{
                {{ GRAPH ?g 
                    {{
                        {{
                            <{concept_scheme_uri}> skos:hasTopConcept ?tc .                
                        }}
                        UNION 
                        {{
                            ?tc skos:topConceptOf <{concept_scheme_uri}> .
                        }}
                        {{ ?tc skos:prefLabel ?pl .
                            FILTER(lang(?pl) = "{language}" || lang(?pl) = "") 
                        }}
                    }}
                }}
                UNION
                {{
                    {{
                        <{concept_scheme_uri}> skos:hasTopConcept ?tc .                
                    }}
                    UNION 
                    {{
                        ?tc skos:topConceptOf <{concept_scheme_uri}> .
                    }}
                    {{ ?tc skos:prefLabel ?pl .
                        FILTER(lang(?pl) = "{language}" || lang(?pl) = "")
                    }}
                }}
            }}
            ORDER BY ?pl
            """.format(concept_scheme_uri=vocabulary.concept_scheme_uri,
                       language=self.language)
        endpoint_rows = Source.sparql_query(
            vocabulary.sparql_endpoint,
            endpoint_query,
            vocabulary.sparql_username,
            vocabulary.sparql_password,
        )

        # Nothing to work with when the endpoint query produced no result at all
        if endpoint_rows is None:
            return None

        # Labels already emitted; keeping only the first row per label prevents
        # Concepts with sameAs properties appearing twice
        seen_labels = []
        unique_concepts = []

        def keep_first_per_label(rows):
            for row in rows:
                label = row[1]
                if label in seen_labels:
                    continue
                unique_concepts.append((row[0], label))
                seen_labels.append(label)

        keep_first_per_label(endpoint_rows)

        if not unique_concepts:
            # Fallback: also accept any concept that is merely skos:inScheme,
            # this time querying through self.gr
            fallback_query = """
                    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
                    SELECT DISTINCT ?tc ?pl
                    WHERE {{
                        {{ GRAPH ?g {{
                            {{
                                <{concept_scheme_uri}> skos:hasTopConcept ?tc .                
                            }}
                            UNION 
                            {{
                                ?tc skos:inScheme <{concept_scheme_uri}> .
                            }}
                            {{ ?tc skos:prefLabel ?pl .
                                FILTER(lang(?pl) = "{language}" || lang(?pl) = "") 
                            }}
                        }} }}
                        UNION
                        {{
                            {{
                                <{concept_scheme_uri}> skos:hasTopConcept ?tc .                
                            }}
                            UNION 
                            {{
                                ?tc skos:inScheme <{concept_scheme_uri}> .
                            }}
                            {{ ?tc skos:prefLabel ?pl .
                                FILTER(lang(?pl) = "{language}" || lang(?pl) = "")
                            }}
                        }}
                    }}
                    ORDER BY ?pl
                    """.format(concept_scheme_uri=vocabulary.concept_scheme_uri,
                               language=self.language)
            keep_first_per_label(self.gr.query(fallback_query))

        return unique_concepts
Exemplo n.º 3
0
    def collect(details):
        """
        Register every skos:ConceptScheme found at one SPARQL endpoint as a vocabulary.

        For this source, one SPARQL endpoint is given for a series of vocabs which are all separate ConceptSchemes

        'gsq-graphdb': {
            'source': VocabSource.SPARQL,
            'sparql_endpoint': 'http://graphdb.gsq.digital:7200/repositories/GSQ_Vocabularies_core'
        },

        :param details: source configuration dict. 'sparql_endpoint' is required;
            'sparql_username', 'sparql_password' and 'uri_filter_regex' are optional.
        :return: None. The collected vocabularies are merged into g.VOCABS, keyed by vocab ID.
            A falsy query result (e.g. None) collects nothing.
        """
        logging.debug('SPARQL collect()...')

        def binding_value(row, name):
            # A variable may be missing from a row (most are bound inside OPTIONAL clauses)
            binding = row.get(name)
            return binding.get('value') if binding else None

        def binding_date(row, name):
            # Parse a date binding, leaving absent values as None
            value = binding_value(row, name)
            return dateutil.parser.parse(value) if value is not None else None

        # Get all the ConceptSchemes from the SPARQL endpoint
        # Interpret each CS as a Vocab
        # Results are ordered by ?title, the variable the label is bound to; the
        # previous "ORDER BY ?l" named a variable this query never binds.
        q = '''
            PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
            PREFIX dcterms: <http://purl.org/dc/terms/>
            SELECT * WHERE {{
                GRAPH ?g {{
                    ?cs a skos:ConceptScheme .
                    OPTIONAL {{ ?cs skos:prefLabel ?title .
                        FILTER(lang(?title) = "{language}" || lang(?title) = "") }}
                    OPTIONAL {{ ?cs dcterms:created ?created }}
                    OPTIONAL {{ ?cs dcterms:issued ?issued }}
                    OPTIONAL {{ ?cs dcterms:modified ?modified }}
                    OPTIONAL {{ ?cs skos:definition ?description .
                        FILTER(lang(?description) = "{language}" || lang(?description) = "") }}
                }}
            }} 
            ORDER BY ?title
        '''.format(language=DEFAULT_LANGUAGE)

        # Credentials are optional (the example config above has none), so they
        # are always read with .get(); indexing them directly raised KeyError.
        sparql_endpoint = details['sparql_endpoint']
        sparql_username = details.get('sparql_username')
        sparql_password = details.get('sparql_password')

        # record just the IDs & title for the VocPrez in-memory vocabs list
        concept_schemes = Source.sparql_query(
            sparql_endpoint,
            q,
            sparql_username=sparql_username,
            sparql_password=sparql_password) or []

        # The filter is loop-invariant, so look it up and compile it once.
        uri_filter_regex = details.get('uri_filter_regex')
        uri_filter = re.compile(uri_filter_regex) if uri_filter_regex else None

        sparql_vocabs = {}
        for cs in concept_schemes:
            cs_uri = cs['cs']['value']
            vocab_id = cs_uri.replace('/conceptScheme', '').split('/')[-1]

            if uri_filter is not None and not uri_filter.search(cs_uri):
                logging.debug('Skipping vocabulary %s', vocab_id)
                continue

            # handling CS URIs that end with '/': the last path segment is then
            # empty, so fall back to the one before it
            if len(vocab_id) < 2:
                vocab_id = cs_uri.split('/')[-2]

            sparql_vocabs[vocab_id] = Vocabulary(
                vocab_id,
                cs_uri.replace('/conceptScheme', ''),
                # Need string value for sorting, not None
                binding_value(cs, 'title') or vocab_id,
                binding_value(cs, 'description'),
                None,  # none of these SPARQL vocabs have creator info yet # TODO: add creator info to GSQ vocabs
                binding_date(cs, 'created'),
                # dct:issued not in Vocabulary
                binding_date(cs, 'modified'),
                None,  # versionInfo
                config.VocabSource.SPARQL,
                cs_uri,
                sparql_endpoint=sparql_endpoint,
                sparql_username=sparql_username,
                sparql_password=sparql_password)
        g.VOCABS = {**g.VOCABS, **sparql_vocabs}
        logging.debug('SPARQL collect() complete.')