def sparql_query(
    request: Request,
    query: Optional[str] = "SELECT * WHERE { <https://identifiers.org/OMIM:246300> <https://w3id.org/biolink/vocab/treated_by> ?drug . }"
):
    # def sparql_query(query: Optional[str] = None):
    """Send a SPARQL query to be executed.

    - Example with a drug: https://identifiers.org/DRUGBANK:DB00394
    - Example with a disease: https://identifiers.org/OMIM:246300
    \f
    :param query: SPARQL query input.
    """
    if not query:
        # TODO: return the SPARQL endpoint service description
        return {"SPARQL Service": "description"}

    if request.headers.get('accept') == 'text/csv':
        # TODO: return in CSV format
        return Response('a,b,c', media_type='text/csv')
    else:
        parsed_query = translateQuery(Query.parseString(query, parseAll=True))
        query_operation = re.sub(r"(\w)([A-Z])", r"\1 \2", parsed_query.algebra.name)
        if query_operation != "Select Query":
            return JSONResponse(
                status_code=501,
                content={"message": str(query_operation) + " not implemented"})
        print(parsed_query)
        print(query_operation)
        predictions_list = query_classifier_from_sparql(parsed_query)
        return predictions_list
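# A minimal sketch of calling the handler above, assuming it is mounted on a
# FastAPI app; the '/sparql' path and the app wiring below are assumptions for
# illustration, not shown in the source.
from fastapi import FastAPI
from fastapi.testclient import TestClient

app = FastAPI()
app.add_api_route("/sparql", sparql_query, methods=["GET"])

client = TestClient(app)
# The handler branches on the Accept header; ask for CSV output here
resp = client.get(
    "/sparql",
    params={"query": "SELECT * WHERE { ?s ?p ?o } LIMIT 10"},
    headers={"accept": "text/csv"},
)
print(resp.status_code, resp.text)  # expect 200 with the 'a,b,c' placeholder CSV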
def get_metadata(rq):
    '''
    Returns the metadata 'exp' parsed from the raw query file 'rq'
    'exp' is one of: 'endpoint', 'tags', 'summary'
    '''
    yaml_string = "\n".join([row.lstrip('#+') for row in rq.split('\n') if row.startswith('#+')])
    query_string = "\n".join([row for row in rq.split('\n') if not row.startswith('#+')])

    query_metadata = yaml.load(yaml_string, Loader=yaml.FullLoader)
    # If there is no YAML string
    if query_metadata is None:
        query_metadata = {}
    query_metadata['query'] = query_string

    try:
        parsed_query = translateQuery(Query.parseString(rq, parseAll=True))
    except ParseException:
        app.logger.error("Could not parse query")
        app.logger.error(query_string)
        traceback.print_exc()
        # Return what we have so far; the query type could not be determined
        return query_metadata

    query_metadata['type'] = parsed_query.algebra.name
    if query_metadata['type'] == 'SelectQuery':
        query_metadata['variables'] = parsed_query.algebra['PV']

    return query_metadata
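# A small illustration of the '#+' decorator convention parsed by the version
# just above: rows prefixed with '#+' hold YAML metadata, everything else is
# the SPARQL query itself. The query content here is hypothetical.
rq_example = """#+ endpoint: http://dbpedia.org/sparql
#+ tags:
#+   - demo
#+ summary: List ten triples
SELECT * WHERE { ?s ?p ?o } LIMIT 10"""

example_metadata = get_metadata(rq_example)
# Expected shape of the result:
# {'endpoint': 'http://dbpedia.org/sparql', 'tags': ['demo'],
#  'summary': 'List ten triples', 'query': 'SELECT * WHERE ...',
#  'type': 'SelectQuery', 'variables': [...]}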
def get_metadata(rq, endpoint):
    '''
    Returns the metadata 'exp' parsed from the raw query file 'rq'
    'exp' is one of: 'endpoint', 'tags', 'summary', 'request', 'pagination', 'enumerate'
    '''
    query_metadata = get_yaml_decorators(rq)
    query_metadata['type'] = 'UNKNOWN'

    try:
        # THE PARSING
        # select, describe, construct, ask
        parsed_query = translateQuery(Query.parseString(rq, parseAll=True))
        query_metadata['type'] = parsed_query.algebra.name
        if query_metadata['type'] == 'SelectQuery':
            # Projection variables
            query_metadata['variables'] = parsed_query.algebra['PV']
            # Parameters
            query_metadata['parameters'] = get_parameters(rq, parsed_query.algebra['_vars'], endpoint, query_metadata)
        elif query_metadata['type'] == 'ConstructQuery':
            # Parameters
            query_metadata['parameters'] = get_parameters(rq, parsed_query.algebra['_vars'], endpoint, query_metadata)
        else:
            glogger.warning("Query type {} is currently unsupported and no metadata was parsed!".format(query_metadata['type']))
    except ParseException:
        glogger.warning("Could not parse regular SELECT, CONSTRUCT, DESCRIBE or ASK query")
        # glogger.warning(traceback.print_exc())

        # INSERT queries won't parse, so we regex
        # glogger.info("Trying to parse INSERT query")
        # if static.INSERT_PATTERN in rq:
        #     query_metadata['type'] = 'InsertQuery'
        #     query_metadata['parameters'] = [u'_g_iri']

        try:
            # Update query
            glogger.info("Trying to parse UPDATE query")
            parsed_query = UpdateUnit.parseString(rq, parseAll=True)
            glogger.info(parsed_query)
            query_metadata['type'] = parsed_query[0]['request'][0].name
            if query_metadata['type'] == 'InsertData':
                query_metadata['parameters'] = {
                    'g': {'datatype': None, 'enum': [], 'lang': None, 'name': 'g',
                          'original': '?_g_iri', 'required': True, 'type': 'iri'},
                    'data': {'datatype': None, 'enum': [], 'lang': None, 'name': 'data',
                             'original': '?_data', 'required': True, 'type': 'literal'}
                }
            glogger.info("Update query parsed with {}".format(query_metadata['type']))
            # if query_metadata['type'] == 'InsertData':
            #     query_metadata['variables'] = parsed_query.algebra['PV']
        except Exception:
            glogger.error("Could not parse query")
            glogger.error(query_metadata['query'])
            glogger.error(traceback.format_exc())

    glogger.debug("Finished parsing query of type {}".format(query_metadata['type']))
    glogger.debug("All parsed query metadata (from decorators and content): ")
    glogger.debug(pformat(query_metadata, indent=32))

    return query_metadata
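# A hedged illustration of the UPDATE fallback above: an INSERT DATA request
# fails the regular query parse, so it is typed via UpdateUnit and receives the
# fixed 'g'/'data' parameter schema. The endpoint URL here is a placeholder.
insert_rq = 'INSERT DATA { <http://example.org/s> <http://example.org/p> "o" }'
insert_metadata = get_metadata(insert_rq, 'http://example.org/sparql')
# Expected: insert_metadata['type'] == 'InsertData'
# Expected: sorted(insert_metadata['parameters'].keys()) == ['data', 'g']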
def get_parameters(rq):
    """
    ?_name The variable specifies the API mandatory parameter name.
           The value is incorporated in the query as plain literal.
    ?__name The parameter name is optional.
    ?_name_iri The variable is substituted with the parameter value as an IRI (also: number or literal).
    ?_name_en The parameter value is considered as literal with the language 'en' (e.g., en, it, es, etc.).
    ?_name_integer The parameter value is considered as literal and the XSD datatype 'integer' is added during substitution.
    ?_name_prefix_datatype The parameter value is considered as literal and the datatype 'prefix:datatype' is added during substitution.
                           The prefix must be specified according to the SPARQL syntax.
    """
    variables = translateQuery(Query.parseString(rq, parseAll=True)).algebra['_vars']

    ## Aggregates
    internal_matcher = re.compile(r"__agg_\d+__")
    ## Basil-style variables
    variable_matcher = re.compile(
        r"(?P<required>[_]{1,2})(?P<name>[^_]+)_?(?P<type>[a-zA-Z0-9]+)?_?(?P<userdefined>[a-zA-Z0-9]+)?.*$")

    parameters = {}
    for v in variables:
        if internal_matcher.match(v):
            continue
        match = variable_matcher.match(v)
        if match:
            vname = match.group('name')
            vrequired = True if match.group('required') == '_' else False
            vtype = 'iri'
            vlang = None
            vdatatype = None

            mtype = match.group('type')
            muserdefined = match.group('userdefined')

            if mtype in ['iri', 'number', 'literal']:
                vtype = mtype
            elif mtype:
                vtype = 'literal'
                if mtype in XSD_DATATYPES:
                    vdatatype = 'xsd:{}'.format(mtype)
                elif len(mtype) == 2:
                    vlang = mtype
                elif muserdefined:
                    vdatatype = '{}:{}'.format(mtype, muserdefined)

            parameters[vname] = {
                'original': '?{}'.format(v),
                'required': vrequired,
                'name': vname,
                'type': vtype,
                'datatype': vdatatype,
                'lang': vlang
            }

    return parameters
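# A worked example of the Basil-style naming decoded above; the query is
# hypothetical and XSD_DATATYPES is assumed to contain 'integer':
example_rq = """SELECT ?_label_en ?__year_integer WHERE {
  ?_person_iri <http://www.w3.org/2000/01/rdf-schema#label> ?_label_en ;
               <http://example.org/birthYear> ?__year_integer .
}"""
params = get_parameters(example_rq)
# ?_label_en      -> required literal with language tag 'en'
# ?__year_integer -> optional literal typed xsd:integer
# ?_person_iri    -> required IRI parameter
# e.g. params['label'] == {'original': '?_label_en', 'required': True,
#                          'name': 'label', 'type': 'literal',
#                          'datatype': None, 'lang': 'en'}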
def get_metadata(rq):
    '''
    Returns the metadata 'exp' parsed from the raw query file 'rq'
    'exp' is one of: 'endpoint', 'tags', 'summary', 'request', 'pagination', 'enumerate'
    '''
    query_metadata = get_yaml_decorators(rq)

    try:
        # select, describe, construct, ask
        parsed_query = translateQuery(Query.parseString(rq, parseAll=True))
        query_metadata['type'] = parsed_query.algebra.name
        if query_metadata['type'] == 'SelectQuery':
            query_metadata['variables'] = parsed_query.algebra['PV']
    except ParseException:
        glogger.warning("Could not parse SELECT, DESCRIBE, CONSTRUCT, ASK query")
        # glogger.warning(traceback.print_exc())

    try:
        # insert, update query
        glogger.info("Trying to parse update query")
        parsed_query = UpdateUnit.parseString(rq, parseAll=True)
        glogger.info(parsed_query)
        query_metadata['type'] = parsed_query[0]['request'][0].name
        glogger.info("Update query parsed with {}".format(query_metadata['type']))
        # if query_metadata['type'] == 'InsertData':
        #     query_metadata['variables'] = parsed_query.algebra['PV']
    except Exception:
        glogger.error("Could not parse UPDATE query")
        glogger.error(query_metadata['query'])
        glogger.error(traceback.format_exc())

    glogger.info("Finished parsing query of type {}".format(query_metadata.get('type', 'UNKNOWN')))

    return query_metadata
def process_shapes_file(shape_format, shapes_graph, rdf_file_path, repo_url, branch, repo_description):
    """Process a file, check its content and add an entry to the shapes graph

    Large function, contains parsing for all formats: RDF, OBO, ShEx, OpenAPI, etc.
    """
    relative_filepath = str(rdf_file_path)[12:]
    github_file_url = generate_github_file_url(repo_url, relative_filepath, branch)
    file_uri = URIRef(github_file_url)
    shape_found = False
    g = Graph()

    if shape_format == 'obo':
        # Get OBO ontologies
        try:
            graph = obonet.read_obo(github_file_url)
            for id_, data in graph.nodes(data=True):
                shape_found = True
                shapes_graph.add((file_uri, RDF.type, SCHEMA['SoftwareSourceCode']))
                shapes_graph.add((file_uri, RDF.type, SIO['SIO_000623']))  # OBO ontology
                shapes_graph.add((file_uri, RDFS.label, Literal(rdf_file_path.name)))
                shapes_graph.add((file_uri, SCHEMA.codeRepository, URIRef(repo_url)))
                shape_label = data.get('name')
                if not shape_label:
                    shape_label = id_
                shapes_graph.add((file_uri, DCTERMS.hasPart, Literal(shape_label)))
        except Exception as e:
            add_to_report('In repository: ' + repo_url + "\n> " + str(e), github_file_url)

    # Index OpenAPI files
    elif shape_format == 'openapi':
        try:
            parser = ResolvingParser(github_file_url)
            shape_found = True
            shapes_graph.add((file_uri, RDF.type, SCHEMA['SoftwareSourceCode']))
            shapes_graph.add((file_uri, RDF.type, SCHEMA['APIReference']))
            shapes_graph.add((file_uri, RDFS.label, Literal(rdf_file_path.name)))
            shapes_graph.add((file_uri, SCHEMA.codeRepository, URIRef(repo_url)))
            file_descriptions = []
            if parser.specification['info']['title']:
                file_descriptions.append(parser.specification['info']['title'])
            if parser.specification['info']['description']:
                file_descriptions.append(parser.specification['info']['description'])
            if len(file_descriptions) > 0:
                shapes_graph.add((file_uri, RDFS.comment, Literal(' - '.join(file_descriptions))))
            # if not shape_label:
            #     shape_label = id_
            # TODO: get operations hasPart?
            shapes_graph.add((file_uri, DCTERMS.hasPart, Literal('OpenAPI')))
        except Exception as e:
            pass
            # TODO: YARRML? Search for prefixes and mappings at the root of the YAML
            # add_to_report('In repository: ' + repo_url + "\n> "
            #               + str(e), github_file_url)

    # Search for ShEx files
    elif shape_format == 'shex':
        # No parsing possible for ShEx
        shape_found = True
        # TODO: use https://schema.org/SoftwareSourceCode ?
        shapes_graph.add((file_uri, RDF.type, SCHEMA['SoftwareSourceCode']))
        shapes_graph.add((file_uri, RDF.type, SHEX.Schema))
        shapes_graph.add((file_uri, RDFS.label, Literal(rdf_file_path.name)))
        shapes_graph.add((file_uri, DCTERMS.hasPart, Literal('ShEx model')))
        shapes_graph.add((file_uri, SCHEMA.codeRepository, URIRef(repo_url)))
        # Convert ShEx to RDF shex and parse it
        # shex_rdf = ''
        # if rdf_file_path.endswith('.shex'):
        #     with open(root / '../' + rdf_file_path, 'a') as f:
        #         shex_rdf = generate_shexj.parse(f.read())
        # # if rdf_file_path.endswith('.shexj'):
        # #     with open(root / '../' + rdf_file_path, 'a') as f:
        # #         shex_rdf = f.read()
        # logging.debug(shex_rdf)
        # # for shape in g.subjects(RDF.type, SHEX.ShapeAnd):
        # #     add_shape_to_graph(shapes_graph, rdf_file_path, github_file_url, repo_url, shape, SHEX.schema)
        # # for shape in g.subjects(RDF.type, SHEX.Shape):
        # #     add_shape_to_graph(shapes_graph, rdf_file_path, github_file_url, repo_url, shape, SHEX.schema)

    # Parse SPARQL query files
    elif shape_format == 'sparql':
        # TODO: sparql+queries search failing might be due to a test SPARQL query hanging for a long time
        shape_found = True
        shapes_graph.add((file_uri, RDF.type, SCHEMA['SoftwareSourceCode']))
        shapes_graph.add((file_uri, RDF.type, SH.SPARQLFunction))
        shapes_graph.add((file_uri, RDFS.label, Literal(rdf_file_path.name)))
        shapes_graph.add((file_uri, SCHEMA.codeRepository, URIRef(repo_url)))
        try:
            with open(rdf_file_path.absolute()) as file:
                sparql_query = file.read()
            # Fix for some malformed queries that use =+ instead of #+ for decorators:
            # rewrite those rows as #+ and keep all other rows unchanged
            sparql_query = "\n".join([
                '#+' + row.lstrip('=+') if row.startswith('=+') else row
                for row in sparql_query.split('\n')
            ])
            yaml_string = "\n".join([
                row.lstrip('#+') for row in sparql_query.split('\n')
                if row.startswith('#+')
            ])
            query_string = "\n".join([
                row for row in sparql_query.split('\n')
                if not row.startswith('#+')
            ])
            shapes_graph.add((file_uri, SCHEMA['query'], Literal(sparql_query)))

            grlc_metadata = {}
            try:  # Invalid YAML will produce empty metadata
                grlc_metadata = yaml.load(yaml_string, Loader=yaml.FullLoader)
            except Exception:
                pass
            # Get grlc query metadata
            if grlc_metadata:
                file_descriptions = []
                if 'endpoint' in grlc_metadata:
                    sparql_endpoint = grlc_metadata['endpoint']
                    try:
                        shapes_graph.add((file_uri, VOID.sparqlEndpoint, URIRef(sparql_endpoint)))
                        test_sparql_endpoint(sparql_endpoint)
                    except Exception as e:
                        logging.debug('Issue parsing SPARQL endpoint from .rq file')
                        logging.debug(e)
                if 'summary' in grlc_metadata and grlc_metadata['summary']:
                    file_descriptions.append(grlc_metadata['summary'])
                if 'description' in grlc_metadata and grlc_metadata['description']:
                    file_descriptions.append(grlc_metadata['description'])
                # Add the query description to the graph
                if len(file_descriptions) > 0:
                    shapes_graph.add((file_uri, RDFS.comment, Literal(' - '.join(file_descriptions))))
                # If default params are described for a grlc SPARQL query, add them as shapes
                if 'defaults' in grlc_metadata:
                    for args in grlc_metadata['defaults']:
                        for arg, default_label in args.items():
                            shapes_graph.add((file_uri, DCTERMS.hasPart, Literal(arg)))
            try:
                # Parse the query to get its operation (select, construct..)
                parsed_query = translateQuery(Query.parseString(query_string, parseAll=True))
                query_operation = re.sub(r"(\w)([A-Z])", r"\1 \2", parsed_query.algebra.name)
                shapes_graph.add((file_uri, DCTERMS.hasPart, Literal(query_operation)))
            except Exception:
                shapes_graph.add((file_uri, DCTERMS.hasPart, Literal('SPARQL Query')))
        except Exception:
            logging.error('❌️ Issue opening file: ' + str(rdf_file_path))

    # Parse RDF files
    else:
        try:
            if shape_format == 'trig':
                # A different graph object is required for TriG to parse
                g = ConjunctiveGraph()
            g.parse(str(rdf_file_path.absolute()), format=shape_format)
        except Exception as e:
            if shape_format == 'xml' and (str(rdf_file_path).endswith('.owl') or str(rdf_file_path).endswith('.rdf')):
                # Try parsing with Turtle for .owl and .rdf files
                try:
                    g.parse(str(rdf_file_path.absolute()), format='ttl')
                except Exception:
                    add_to_report('RDF parsed as ' + shape_format + ', in repository: '
                                  + repo_url + "\n> " + str(e), github_file_url)
            else:
                add_to_report('RDF parsed as ' + shape_format + ', in repository: '
                              + repo_url + "\n> " + str(e), github_file_url)

    # Search for SHACL shapes
    for shape in g.subjects(RDF.type, SH.NodeShape):
        shape_found = True
        shapes_graph.add((file_uri, RDF.type, SCHEMA['SoftwareSourceCode']))
        shapes_graph.add((file_uri, RDF.type, SH.Shape))
        shapes_graph.add((file_uri, RDFS.label, Literal(rdf_file_path.name)))
        shapes_graph.add((file_uri, SCHEMA.codeRepository, URIRef(repo_url)))
        shape_label = shape
        for label in g.objects(shape, RDFS.label):
            # Try to get the label of the shape
            shape_label = label
        shapes_graph.add((file_uri, DCTERMS.hasPart, Literal(shape_label)))

    # Search for CSV on the Web RDF (csvw)
    # https://medium.swirrl.com/how-to-publish-csv-on-the-web-csvw-4ea6cbb603b4
    # https://www.w3.org/ns/csvw
    for shape_file in g.subjects(RDF.type, CSVW.Schema):
        # for shape_file in g.objects(None, CSVW.tableSchema):
        shape_found = True
        shapes_graph.add((file_uri, RDF.type, SCHEMA['SoftwareSourceCode']))
        shapes_graph.add((file_uri, RDF.type, CSVW.Schema))
        shapes_graph.add((file_uri, RDFS.label, Literal(rdf_file_path.name)))
        shapes_graph.add((file_uri, SCHEMA.codeRepository, URIRef(repo_url)))
        # Get file label
        # for file_label in g.objects(shape_file, RDFS.label):
        #     shapes_graph.add((file_uri, RDFS.comment, Literal(str(file_label))))
        #     break
        # Get column labels
        for col_label in g.objects(shape_file, CSVW.column):
            shapes_graph.add((file_uri, DCTERMS.hasPart, Literal(str(col_label))))

    # Search for DCAT Datasets
    for shape_file in g.subjects(RDF.type, DCAT.Dataset):
        shape_found = True
        shapes_graph.add((file_uri, RDF.type, SCHEMA['SoftwareSourceCode']))
        shapes_graph.add((file_uri, RDF.type, DCAT.Dataset))
        shapes_graph.add((file_uri, RDFS.label, Literal(rdf_file_path.name)))
        shapes_graph.add((file_uri, SCHEMA.codeRepository, URIRef(repo_url)))
        # Get file label
        for file_label in g.objects(shape_file, RDFS.label):
            shapes_graph.add((file_uri, RDFS.comment, Literal(str(file_label))))
            break
        # shape_label = shape_file
        # for label in g.objects(shape_file, RDFS.label):
        #     # Try to get the label of the shape
        #     shape_label = label
        # shapes_graph.add((file_uri, DCTERMS.hasPart, Literal(shape_label)))

    # Search for nanopublication templates
    for shape_file in g.subjects(RDF.type, NP_TEMPLATE.AssertionTemplate):
        shape_found = True
        shapes_graph.add((file_uri, RDF.type, SCHEMA['SoftwareSourceCode']))
        shapes_graph.add((file_uri, RDF.type, NP_TEMPLATE.AssertionTemplate))
        shapes_graph.add((file_uri, RDFS.label, Literal(rdf_file_path.name)))
        shapes_graph.add((file_uri, SCHEMA.codeRepository, URIRef(repo_url)))
        # Get template label
        for template_label in g.objects(shape_file, RDFS.label):
            shapes_graph.add((file_uri, RDFS.comment, Literal(str(template_label))))
            break
        # TODO: get the shapes inside
        nanopub_inputs = [
            NP_TEMPLATE.GuidedChoicePlaceholder,
            NP_TEMPLATE.LiteralPlaceholder,
            NP_TEMPLATE.RestrictedChoicePlaceholder,
            NP_TEMPLATE.UriPlaceholder
        ]
        for np_input in nanopub_inputs:
            for shape in g.subjects(RDF.type, np_input):
                shape_label = shape
                for label in g.objects(shape, RDFS.label):
                    # Try to get the label of the shape
                    shape_label = label
                shapes_graph.add((file_uri, DCTERMS.hasPart, Literal(shape_label)))

    # Search for RML and R2RML mappings
    for shape in g.subjects(RDF.type, R2RML.SubjectMap):
        shape_found = True
        is_rml_mappings = False
        # Differentiate RML and R2RML mappings
        if (None, RML.logicalSource, None) in g:
            shapes_graph.add((file_uri, RDF.type, RML.LogicalSource))
        else:
            shapes_graph.add((file_uri, RDF.type, R2RML.TriplesMap))
        shapes_graph.add((file_uri, RDF.type, SCHEMA['SoftwareSourceCode']))
        shapes_graph.add((file_uri, RDFS.label, Literal(rdf_file_path.name)))
        shapes_graph.add((file_uri, SCHEMA.codeRepository, URIRef(repo_url)))
        shape_label = shape
        # Try to get the label or URI of the subjectMap
        for label in g.objects(shape, R2RML.template):
            shape_label = label
        for label in g.objects(shape, RDFS.label):
            shape_label = label
        shapes_graph.add((file_uri, DCTERMS.hasPart, Literal(shape_label)))

    # Search for OWL classes
    for shape in g.subjects(RDF.type, OWL.Class):
        shape_found = True
        shapes_graph.add((file_uri, RDF.type, SCHEMA['SoftwareSourceCode']))
        shapes_graph.add((file_uri, RDF.type, OWL.Ontology))
        shapes_graph.add((file_uri, RDFS.label, Literal(rdf_file_path.name)))
        shapes_graph.add((file_uri, SCHEMA.codeRepository, URIRef(repo_url)))
        shape_label = shape
        for label in g.objects(shape, RDFS.label):
            # Try to get the label of the class
            shape_label = label
        shapes_graph.add((file_uri, DCTERMS.hasPart, Literal(shape_label)))

    # Get rdfs:label of owl:Ontology and shaclTest:Validate for the file description
    file_descriptions = []
    for shape in g.subjects(RDF.type, OWL.Ontology):
        # Get one of the labels
        for ontology_label in g.objects(shape, RDFS.label):
            if len(file_descriptions) < 1:
                file_descriptions.append(str(ontology_label))
        if len(file_descriptions) == 0:
            for label in g.objects(shape, DC.title):
                file_descriptions.append(str(label))
        if len(file_descriptions) == 0:
            for label in g.objects(shape, DCTERMS.title):
                file_descriptions.append(str(label))
        # Now add the description
        for comment in g.objects(shape, RDFS.comment):
            file_descriptions.append(str(comment))
        for description in g.objects(shape, DCTERMS.description):
            file_descriptions.append(str(description))
    for shape in g.subjects(RDF.type, URIRef('http://www.w3.org/ns/shacl-test#Validate')):
        for ontology_label in g.objects(shape, RDFS.label):
            file_descriptions.append(str(ontology_label))
    if len(file_descriptions) > 0:
        shapes_graph.add((file_uri, RDFS.comment, Literal(' - '.join(file_descriptions))))

    # Get SKOS concepts and concept schemes
    for shape in g.subjects(RDF.type, SKOS.Concept):
        shape_found = True
        shapes_graph.add((file_uri, RDF.type, SCHEMA['SoftwareSourceCode']))
        shapes_graph.add((file_uri, RDF.type, SKOS.ConceptScheme))
        shapes_graph.add((file_uri, RDFS.label, Literal(rdf_file_path.name)))
        shapes_graph.add((file_uri, SCHEMA.codeRepository, URIRef(repo_url)))
        shape_label = shape
        for label in g.objects(shape, SKOS.prefLabel):
            # Try to get the label of the concept
            shape_label = label
        shapes_graph.add((file_uri, DCTERMS.hasPart, Literal(shape_label)))
    for shape in g.subjects(RDF.type, SKOS.ConceptScheme):
        # Get one of the labels
        for ontology_label in g.objects(shape, RDFS.label):
            if len(file_descriptions) < 1:
                file_descriptions.append(str(ontology_label))
        if len(file_descriptions) == 0:
            for label in g.objects(shape, DC.title):
                file_descriptions.append(str(label))
        if len(file_descriptions) == 0:
            for label in g.objects(shape, DCTERMS.title):
                file_descriptions.append(str(label))
        # Now add the description
        for comment in g.objects(shape, RDFS.comment):
            file_descriptions.append(str(comment))
        for description in g.objects(shape, DCTERMS.description):
            file_descriptions.append(str(description))

    # Search for ShEx Shapes and ShapeAnd
    # TODO: Improve
    for shape in g.subjects(RDF.type, SHEX.ShapeAnd):
        shape_found = True
        shapes_graph.add((file_uri, RDF.type, SCHEMA['SoftwareSourceCode']))
        shapes_graph.add((file_uri, RDF.type, SHEX.Schema))
        shapes_graph.add((file_uri, RDFS.label, Literal(rdf_file_path.name)))
        shapes_graph.add((file_uri, SCHEMA.codeRepository, URIRef(repo_url)))
        shape_label = shape
        for label in g.objects(shape, RDFS.label):
            # Try to get the label of the shape
            shape_label = label
        shapes_graph.add((file_uri, DCTERMS.hasPart, Literal(shape_label)))
    for shape in g.subjects(RDF.type, SHEX.Shape):
        shape_found = True
        shapes_graph.add((file_uri, RDF.type, SCHEMA['SoftwareSourceCode']))
        shapes_graph.add((file_uri, RDF.type, SHEX.Schema))
        shapes_graph.add((file_uri, RDFS.label, Literal(rdf_file_path.name)))
        shapes_graph.add((file_uri, SCHEMA.codeRepository, URIRef(repo_url)))
        shape_label = shape
        for label in g.objects(shape, RDFS.label):
            # Try to get the label of the shape
            shape_label = label
        shapes_graph.add((file_uri, DCTERMS.hasPart, Literal(shape_label)))

    # Add the git repo to the graph
    if shape_found:
        logging.debug('[' + datetime.now().strftime("%m/%d/%Y, %H:%M:%S") + '] '
                      + "✔️ Shape found in file " + github_file_url)
        shapes_graph.add((URIRef(repo_url), RDF.type, SCHEMA['DataCatalog']))
        shapes_graph.add((URIRef(repo_url), RDFS.label, Literal(repo_url.rsplit('/', 1)[1])))
        if repo_description:
            shapes_graph.add((URIRef(repo_url), RDFS.comment, Literal(repo_description)))
    return shapes_graph
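# `test_sparql_endpoint` is called above but not shown here. A minimal sketch,
# assuming it mirrors the inline SPARQLWrapper check used by the other version
# of this function further below (the name, probe query and threshold are
# assumptions):
from SPARQLWrapper import SPARQLWrapper, JSON

def test_sparql_endpoint(sparql_endpoint):
    """Send a probe query and raise if the endpoint does not answer properly."""
    sparql = SPARQLWrapper(sparql_endpoint)
    sparql.setReturnFormat(JSON)
    sparql.setQuery('SELECT * WHERE { ?s ?p ?o } LIMIT 10')
    results = sparql.query().convert()
    # Consider the endpoint alive if it returns at least a few bindings
    if len(results["results"]["bindings"]) < 5:
        raise Exception('SPARQL endpoint returned too few results: ' + sparql_endpoint)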
def get_parameters(rq, endpoint):
    """
    ?_name The variable specifies the API mandatory parameter name.
           The value is incorporated in the query as plain literal.
    ?__name The parameter name is optional.
    ?_name_iri The variable is substituted with the parameter value as an IRI (also: number or literal).
    ?_name_en The parameter value is considered as literal with the language 'en' (e.g., en, it, es, etc.).
    ?_name_integer The parameter value is considered as literal and the XSD datatype 'integer' is added during substitution.
    ?_name_prefix_datatype The parameter value is considered as literal and the datatype 'prefix:datatype' is added during substitution.
                           The prefix must be specified according to the SPARQL syntax.
    """
    variables = translateQuery(Query.parseString(rq, parseAll=True)).algebra['_vars']

    ## Aggregates
    internal_matcher = re.compile(r"__agg_\d+__")
    ## Basil-style variables
    variable_matcher = re.compile(
        r"(?P<required>[_]{1,2})(?P<name>[^_]+)_?(?P<type>[a-zA-Z0-9]+)?_?(?P<userdefined>[a-zA-Z0-9]+)?.*$"
    )

    parameters = {}
    for v in variables:
        if internal_matcher.match(v):
            continue
        match = variable_matcher.match(v)
        # TODO: currently only one parameter per triple pattern is supported
        tpattern_matcher = re.compile(
            r".*FROM\s+(?P<gnames>.*)\s+WHERE.*[\.\{][\n\t\s]*(?P<tpattern>.*\?" + re.escape(v) + r".*)\..*",
            flags=re.DOTALL)
        tp_match = tpattern_matcher.match(rq)
        if match:
            vcodes = []  # Enumerated values for this parameter (empty if no triple pattern matched)
            if tp_match:
                vtpattern = tp_match.group('tpattern')
                gnames = tp_match.group('gnames')
                glogger.debug("Matched triple pattern with parameter")
                # glogger.debug(vtpattern)
                # glogger.debug(gnames)
                codes_subquery = re.sub(r"SELECT.*\{.*\}",
                                        "SELECT DISTINCT ?" + v + " FROM " + gnames
                                        + " WHERE { " + vtpattern + " . }",
                                        rq, flags=re.DOTALL)
                headers = {'Accept': 'application/json'}
                data = {'query': codes_subquery}
                data_encoded = urllib.parse.urlencode(data).encode('utf-8')
                req = urllib.request.Request(endpoint, data_encoded, headers)
                glogger.debug("Sending code subquery request: " + req.full_url + "?" + data_encoded.decode('utf-8'))
                response = urllib.request.urlopen(req)
                codes_json = json.loads(response.read())
                # glogger.debug(codes_json)
                for code in codes_json['results']['bindings']:
                    vcodes.append(list(code.values())[0]["value"])
                # glogger.debug(vcodes)

            vname = match.group('name')
            vrequired = True if match.group('required') == '_' else False
            vtype = 'literal'
            vlang = None
            vdatatype = None

            mtype = match.group('type')
            muserdefined = match.group('userdefined')

            if mtype in ['iri', 'number', 'literal']:
                vtype = mtype
            elif mtype:
                vtype = 'literal'
                if mtype in static.XSD_DATATYPES:
                    vdatatype = 'xsd:{}'.format(mtype)
                elif len(mtype) == 2:
                    vlang = mtype
                elif muserdefined:
                    vdatatype = '{}:{}'.format(mtype, muserdefined)

            parameters[vname] = {
                'original': '?{}'.format(v),
                'required': vrequired,
                'name': vname,
                'enum': sorted(vcodes),
                'type': vtype,
                'datatype': vdatatype,
                'lang': vlang
            }

    return parameters
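# A small, hypothetical illustration of the enum-extraction rewrite above: for a
# parameter ?_code, the FROM/WHERE regex pulls out the graph name and the triple
# pattern, and the query is rewritten into a DISTINCT subquery over that pattern.
import re

demo_rq = ("PREFIX ex: <http://example.org/> "
           "SELECT ?_code FROM <http://example.org/graph> WHERE { ?s ex:code ?_code . }")
demo_v = "_code"
demo_matcher = re.compile(
    r".*FROM\s+(?P<gnames>.*)\s+WHERE.*[\.\{][\n\t\s]*(?P<tpattern>.*\?" + re.escape(demo_v) + r".*)\..*",
    flags=re.DOTALL)
demo_match = demo_matcher.match(demo_rq)
demo_subquery = re.sub(r"SELECT.*\{.*\}",
                       "SELECT DISTINCT ?" + demo_v + " FROM " + demo_match.group('gnames')
                       + " WHERE { " + demo_match.group('tpattern') + " . }",
                       demo_rq, flags=re.DOTALL)
print(demo_subquery)
# -> roughly: PREFIX ex: <http://example.org/> SELECT DISTINCT ?_code
#    FROM <http://example.org/graph> WHERE { ?s ex:code ?_code . }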
def process_shapes_file(shape_format, shapes_graph, rdf_file_path, repo_url, branch, repo_description):
    """Process a shapes file, check its content and add an entry to the shapes graph

    Large function, contains parsing for all formats: RDF, OBO, ShEx, OpenAPI...
    """
    relative_filepath = str(rdf_file_path)[12:]
    github_file_url = generate_github_file_url(repo_url, relative_filepath, branch)
    file_uri = URIRef(github_file_url)
    shape_found = False
    g = Graph()

    if shape_format == 'obo':
        # Get OBO ontologies
        try:
            graph = obonet.read_obo(github_file_url)
            for id_, data in graph.nodes(data=True):
                shape_found = True
                shapes_graph.add((file_uri, RDF.type, SCHEMA['SoftwareSourceCode']))
                shapes_graph.add((file_uri, RDF.type, SIO['SIO_000623']))  # OBO ontology
                shapes_graph.add((file_uri, RDFS.label, Literal(rdf_file_path.name)))
                shapes_graph.add((file_uri, DC.source, URIRef(repo_url)))
                shape_label = data.get('name')
                if not shape_label:
                    shape_label = id_
                shapes_graph.add((file_uri, DCTERMS.hasPart, Literal(shape_label)))
        except Exception as e:
            # print('[' + datetime.now().strftime("%m/%d/%Y, %H:%M:%S") + '] 🗑 Issue with OBO parser for file ' + github_file_url)
            add_to_report('File: ' + github_file_url + "\n\n"
                          + 'In repository: ' + repo_url + "\n> "
                          + str(e) + "\n\n---\n")

    # Index OpenAPI files
    elif shape_format == 'openapi':
        try:
            parser = ResolvingParser(github_file_url)
            shape_found = True
            shapes_graph.add((file_uri, RDF.type, SCHEMA['SoftwareSourceCode']))
            shapes_graph.add((file_uri, RDF.type, SCHEMA['WebAPI']))
            shapes_graph.add((file_uri, RDFS.label, Literal(rdf_file_path.name)))
            shapes_graph.add((file_uri, DC.source, URIRef(repo_url)))
            file_descriptions = []
            if parser.specification['info']['title']:
                file_descriptions.append(parser.specification['info']['title'])
            if parser.specification['info']['description']:
                file_descriptions.append(parser.specification['info']['description'])
            if len(file_descriptions) > 0:
                shapes_graph.add((file_uri, DC.description, Literal(' - '.join(file_descriptions))))
            # if not shape_label:
            #     shape_label = id_
            # TODO: get operations hasPart?
            shapes_graph.add((file_uri, DCTERMS.hasPart, Literal('OpenAPI')))
        except Exception as e:
            pass
            # print('[' + datetime.now().strftime("%m/%d/%Y, %H:%M:%S") + '] 🗑 Issue with OpenAPI parser for file ' + github_file_url)
            # print(e)
            # add_to_report('File: ' + github_file_url + "\n\n"
            #               + 'In repository: ' + repo_url + "\n> "
            #               + str(e) + "\n\n---\n")

    # Search for ShEx files
    elif shape_format == 'shex':
        # No parsing possible for ShEx
        shape_found = True
        # TODO: use https://schema.org/SoftwareSourceCode ?
        shapes_graph.add((file_uri, RDF.type, SCHEMA['SoftwareSourceCode']))
        shapes_graph.add((file_uri, RDF.type, SHEX.Schema))
        shapes_graph.add((file_uri, RDFS.label, Literal(rdf_file_path.name)))
        shapes_graph.add((file_uri, DCTERMS.hasPart, Literal('ShEx model')))
        shapes_graph.add((file_uri, DC.source, URIRef(repo_url)))
        # Convert ShEx to RDF shex and parse it
        # shex_rdf = ''
        # if rdf_file_path.endswith('.shex'):
        #     with open(root / '../' + rdf_file_path, 'a') as f:
        #         shex_rdf = generate_shexj.parse(f.read())
        # # if rdf_file_path.endswith('.shexj'):
        # #     with open(root / '../' + rdf_file_path, 'a') as f:
        # #         shex_rdf = f.read()
        # print(shex_rdf)
        # # for shape in g.subjects(RDF.type, SHEX.ShapeAnd):
        # #     add_shape_to_graph(shapes_graph, rdf_file_path, github_file_url, repo_url, shape, SHEX.schema)
        # # for shape in g.subjects(RDF.type, SHEX.Shape):
        # #     add_shape_to_graph(shapes_graph, rdf_file_path, github_file_url, repo_url, shape, SHEX.schema)

    # Parse SPARQL query files
    elif shape_format == 'sparql':
        shape_found = True
        shapes_graph.add((file_uri, RDF.type, SCHEMA['SoftwareSourceCode']))
        shapes_graph.add((file_uri, RDF.type, SH.SPARQLFunction))
        shapes_graph.add((file_uri, RDFS.label, Literal(rdf_file_path.name)))
        shapes_graph.add((file_uri, DC.source, URIRef(repo_url)))
        with open(rdf_file_path.absolute()) as file:
            sparql_query = file.read()
        # Parse the SPARQL query: #+ rows hold grlc YAML metadata, the rest is the query
        yaml_string = "\n".join([
            row.lstrip('#+') for row in sparql_query.split('\n')
            if row.startswith('#+')
        ])
        query_string = "\n".join([
            row for row in sparql_query.split('\n')
            if not row.startswith('#+')
        ])
        shapes_graph.add((file_uri, SCHEMA['query'], Literal(query_string)))

        grlc_metadata = {}
        try:  # Invalid YAML will produce empty metadata
            grlc_metadata = yaml.load(yaml_string, Loader=yaml.FullLoader)
        except Exception:
            pass
        # Get grlc-style query metadata
        if grlc_metadata:
            file_descriptions = []
            if 'endpoint' in grlc_metadata:
                sparql_endpoint = grlc_metadata['endpoint']
                shapes_graph.add((file_uri, VOID.sparqlEndpoint, Literal(sparql_endpoint)))
                # TODO: check in the hashes of already tested endpoints (valid and failing)
                # Test the endpoint with SPARQLWrapper, add it to the hash of valid or failing endpoints
                # Then, like repos, add them as schema:EntryPoint
                if sparql_endpoint not in VALID_ENDPOINTS and sparql_endpoint not in FAILED_ENDPOINTS:
                    sparql_test_query = 'SELECT * WHERE { ?s ?p ?o } LIMIT 10'
                    sparql = SPARQLWrapper(sparql_endpoint)
                    sparql.setReturnFormat(JSON)
                    sparql.setQuery(sparql_test_query)
                    try:
                        results = sparql.query().convert()
                        # Check the SPARQL query sent back at least 5 results
                        results_array = results["results"]["bindings"]
                        if len(results_array) > 4:
                            VALID_ENDPOINTS[sparql_endpoint] = {'label': sparql_endpoint}
                        else:
                            FAILED_ENDPOINTS[sparql_endpoint] = 'failed'
                    except Exception as e:
                        add_to_report('SPARQL endpoint failed: ' + sparql_endpoint + "\n\n"
                                      + str(e) + "\n\n---\n")
            if 'summary' in grlc_metadata and grlc_metadata['summary']:
                file_descriptions.append(grlc_metadata['summary'])
            if 'description' in grlc_metadata and grlc_metadata['description']:
                file_descriptions.append(grlc_metadata['description'])
            if len(file_descriptions) > 0:
                shapes_graph.add((file_uri, DC.description, Literal(' - '.join(file_descriptions))))
            # If default params are described for a grlc SPARQL query, add them as shapes
            if 'defaults' in grlc_metadata:
                for args in grlc_metadata['defaults']:
                    for arg, default_label in args.items():
                        shapes_graph.add((file_uri, DCTERMS.hasPart, Literal(arg)))
        try:
            # Parse the query to get its operation (select, construct..)
            parsed_query = translateQuery(Query.parseString(query_string, parseAll=True))
            query_operation = re.sub(r"(\w)([A-Z])", r"\1 \2", parsed_query.algebra.name)
            shapes_graph.add((file_uri, DCTERMS.hasPart, Literal(query_operation)))
        except Exception:
            shapes_graph.add((file_uri, DCTERMS.hasPart, Literal('SPARQL Query')))

    # Parse RDF files
    else:
        try:
            g.parse(str(rdf_file_path.absolute()), format=shape_format)
        except Exception as e:
            print('[' + datetime.now().strftime("%m/%d/%Y, %H:%M:%S") + '] 🗑 RDF parser for '
                  + shape_format + ' did not work for the file ' + github_file_url)
            if not str(rdf_file_path).endswith('.xml') and not str(rdf_file_path).endswith('.json'):
                add_to_report('File: ' + github_file_url + " parsed as " + shape_format + "\n\n"
                              + 'In repository: ' + repo_url + "\n> "
                              + str(e) + "\n\n---\n")

    # Search for SHACL shapes
    for shape in g.subjects(RDF.type, SH.NodeShape):
        # add_shape_to_graph(shapes_graph, rdf_file_path, github_file_url, repo_url, shape_uri, shape_type)
        shape_found = True
        shapes_graph.add((file_uri, RDF.type, SCHEMA['SoftwareSourceCode']))
        shapes_graph.add((file_uri, RDF.type, SH.Shape))
        shapes_graph.add((file_uri, RDFS.label, Literal(rdf_file_path.name)))
        shapes_graph.add((file_uri, DC.source, URIRef(repo_url)))
        shape_label = shape
        for label in g.objects(shape, RDFS.label):
            # Try to get the label of the shape
            shape_label = label
        shapes_graph.add((file_uri, DCTERMS.hasPart, Literal(shape_label)))

    # Search for OWL classes, limit to max 300 classes/concepts retrieved
    classes_limit = 300
    classes_count = 0
    for shape in g.subjects(RDF.type, OWL.Class):
        # add_shape_to_graph(shapes_graph, rdf_file_path, github_file_url, repo_url, shape_uri, shape_type)
        shape_found = True
        shapes_graph.add((file_uri, RDF.type, SCHEMA['SoftwareSourceCode']))
        shapes_graph.add((file_uri, RDF.type, OWL.Ontology))
        shapes_graph.add((file_uri, RDFS.label, Literal(rdf_file_path.name)))
        shapes_graph.add((file_uri, DC.source, URIRef(repo_url)))
        shape_label = shape
        for label in g.objects(shape, RDFS.label):
            # Try to get the label of the class
            shape_label = label
        shapes_graph.add((file_uri, DCTERMS.hasPart, Literal(shape_label)))
        classes_count += 1
        if classes_count >= classes_limit:
            break

    # Get rdfs:label of owl:Ontology and shaclTest:Validate for the file description
    file_descriptions = []
    for shape in g.subjects(RDF.type, OWL.Ontology):
        # Get one of the labels
        for ontology_label in g.objects(shape, RDFS.label):
            if len(file_descriptions) < 1:
                file_descriptions.append(str(ontology_label))
        if len(file_descriptions) == 0:
            for label in g.objects(shape, DC.title):
                file_descriptions.append(str(label))
        if len(file_descriptions) == 0:
            for label in g.objects(shape, DCTERMS.title):
                file_descriptions.append(str(label))
        # Now add the description
        for comment in g.objects(shape, RDFS.comment):
            file_descriptions.append(str(comment))
        for label in g.objects(shape, DC.description):
            file_descriptions.append(str(label))
        for description in g.objects(shape, DCTERMS.description):
            file_descriptions.append(str(description))
    for shape in g.subjects(RDF.type, URIRef('http://www.w3.org/ns/shacl-test#Validate')):
        for ontology_label in g.objects(shape, RDFS.label):
            file_descriptions.append(str(ontology_label))
    if len(file_descriptions) > 0:
        shapes_graph.add((file_uri, DC.description, Literal(' - '.join(file_descriptions))))

    # Get SKOS concepts and concept schemes
    classes_count = 0
    for shape in g.subjects(RDF.type, SKOS.Concept):
        # add_shape_to_graph(shapes_graph, rdf_file_path, github_file_url, repo_url, shape_uri, shape_type)
        shape_found = True
        shapes_graph.add((file_uri, RDF.type, SCHEMA['SoftwareSourceCode']))
        shapes_graph.add((file_uri, RDF.type, SKOS.ConceptScheme))
        shapes_graph.add((file_uri, RDFS.label, Literal(rdf_file_path.name)))
        shapes_graph.add((file_uri, DC.source, URIRef(repo_url)))
        shape_label = shape
        for label in g.objects(shape, SKOS.prefLabel):
            # Try to get the label of the concept
            shape_label = label
        shapes_graph.add((file_uri, DCTERMS.hasPart, Literal(shape_label)))
        classes_count += 1
        if classes_count >= classes_limit:
            break
    for shape in g.subjects(RDF.type, SKOS.ConceptScheme):
        # Get one of the labels
        for ontology_label in g.objects(shape, RDFS.label):
            if len(file_descriptions) < 1:
                file_descriptions.append(str(ontology_label))
        if len(file_descriptions) == 0:
            for label in g.objects(shape, DC.title):
                file_descriptions.append(str(label))
        if len(file_descriptions) == 0:
            for label in g.objects(shape, DCTERMS.title):
                file_descriptions.append(str(label))
        # Now add the description
        for comment in g.objects(shape, RDFS.comment):
            file_descriptions.append(str(comment))
        for label in g.objects(shape, DC.description):
            file_descriptions.append(str(label))
        for description in g.objects(shape, DCTERMS.description):
            file_descriptions.append(str(description))

    # Search for ShEx Shapes and ShapeAnd
    # TODO: Improve
    for shape in g.subjects(RDF.type, SHEX.ShapeAnd):
        shape_found = True
        shapes_graph.add((file_uri, RDF.type, SCHEMA['SoftwareSourceCode']))
        shapes_graph.add((file_uri, RDF.type, SHEX.Schema))
        shapes_graph.add((file_uri, RDFS.label, Literal(rdf_file_path.name)))
        shapes_graph.add((file_uri, DC.source, URIRef(repo_url)))
        shape_label = shape
        for label in g.objects(shape, RDFS.label):
            # Try to get the label of the shape
            shape_label = label
        shapes_graph.add((file_uri, DCTERMS.hasPart, Literal(shape_label)))
    for shape in g.subjects(RDF.type, SHEX.Shape):
        shape_found = True
        shapes_graph.add((file_uri, RDF.type, SCHEMA['SoftwareSourceCode']))
        shapes_graph.add((file_uri, RDF.type, SHEX.Schema))
        shapes_graph.add((file_uri, RDFS.label, Literal(rdf_file_path.name)))
        shapes_graph.add((file_uri, DC.source, URIRef(repo_url)))
        shape_label = shape
        for label in g.objects(shape, RDFS.label):
            # Try to get the label of the shape
            shape_label = label
        shapes_graph.add((file_uri, DCTERMS.hasPart, Literal(shape_label)))

    # Add the repository RDF
    if shape_found:
        shapes_graph.add((URIRef(repo_url), RDF.type, SCHEMA['codeRepository']))
        # TODO: change, schema:codeRepository is a property, not a class, but there is not much else available..
        shapes_graph.add((URIRef(repo_url), RDFS.label, Literal(repo_url.rsplit('/', 1)[1])))
        if repo_description:
            shapes_graph.add((URIRef(repo_url), RDFS.comment, Literal(repo_description)))
    return shapes_graph
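# A minimal sketch of driving the indexer above on one file of a cloned
# repository. The namespaces, helpers (generate_github_file_url, add_to_report)
# and globals (VALID_ENDPOINTS, FAILED_ENDPOINTS) are defined elsewhere in the
# real pipeline; the local path and repo URL below are placeholders (note the
# function strips a fixed 12-character clone-directory prefix from the path):
from pathlib import Path
from rdflib import Graph

shapes_graph = Graph()
shapes_graph = process_shapes_file(
    shape_format='ttl',
    shapes_graph=shapes_graph,
    rdf_file_path=Path('cloned_repos/example-repo/shapes.ttl'),
    repo_url='https://github.com/example/example-repo',
    branch='main',
    repo_description='Example repository of SHACL shapes',
)
print(shapes_graph.serialize(format='turtle'))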