]  # Additional files that should be scanned can be added here

# Look at all Turtle files
inputFiles = [
    path.join(root, name)
    for root, dirs, files in walk(ttlFolder)
    for name in files
    if name.endswith(".ttl")
]

# Add additional files
inputFiles.extend(additionalFiles)

# Identify LOC identifiers by looking at triples where the object is a LOC identifier
locIdentifiers = []
for file in tqdm(inputFiles):
    if file.endswith('.trig'):
        g = rdflib.Dataset()
        g.parse(file)
        queryResults = g.query("""SELECT DISTINCT ?loc WHERE {
            GRAPH ?g { ?s ?p ?loc . }
            FILTER(REGEX(STR(?loc), "http://id.loc.gov/"))
        }""")
    else:
        g = rdflib.Graph()
        g.parse(file)
        queryResults = g.query("""SELECT DISTINCT ?loc WHERE {
            ?s ?p ?loc .
            FILTER(REGEX(STR(?loc), "http://id.loc.gov/"))
        }""")
    for row in queryResults:
from testharness import *
from sdoutil import *
import api
from apimarkdown import Markdown
import StringIO

rdflib.plugin.register("json-ld", Parser, "rdflib_jsonld.parser", "JsonLDParser")
rdflib.plugin.register("json-ld", Serializer, "rdflib_jsonld.serializer", "JsonLDSerializer")

ATTIC = 'attic'
VOCAB = None
VOCABLEN = 0
ALTVOCAB = "https://schema.org"
STORE = rdflib.Dataset()

# Namespace mapping
nss = {'core': 'http://schema.org/'}
revNss = {}
NSSLoaded = False
allLayersList = []

context_data = "data/internal-context"  # Local file containing the context to be used when loading .jsonld files

RDFLIBLOCK = threading.Lock()  # rdflib uses generators which are not threadsafe

from rdflib.namespace import RDFS, RDF, OWL

SCHEMA = rdflib.Namespace('http://schema.org/')
QUERYGRAPH = None
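# Illustrative sketch (not part of the original module; file name and graph URI are
# placeholders): with the "json-ld" plugin registered above, a .jsonld vocabulary
# file can be parsed into a named graph of the shared STORE Dataset.
layer = STORE.graph(rdflib.URIRef("http://example.org/layers/core"))
layer.parse("data/example.jsonld", format="json-ld")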
store = getMasterStore()
read_schemas(loadExtensions=True)
read_extensions(sdoapp.ENABLED_EXTENSIONS)
graphs = list(store.graphs())

from rdflib.namespace import RDFS


def MdComments(g):
    # Process Markdown
    for s, p, o in list(g.triples((None, RDFS.comment, None))):
        no = MD.parse(o)
        # g.remove((s, p, o))
        g.set((s, p, Literal(no)))


outGraph = rdflib.Dataset()
simpleFormat = False
if args.format == "xml" or args.format == "nt" or args.format == "turtle":
    simpleFormat = True
    outGraph = rdflib.Graph()

gs = sorted(list(store.graphs()), key=lambda u: u.identifier)

for g in gs:  # Put core first
    if str(g.identifier) == "http://schema.org/":
        gs.remove(g)
        gs.insert(0, g)
        break

for g in gs:
    id = str(g.identifier)
def main():
    """Main entry point for the dfs CLI."""
    args = docopt(__doc__, version=__version__)
    csvfile = args["FILE"]

    PROV = Namespace("http://www.w3.org/ns/prov#")
    QUDT = Namespace("http://qudt.org/schema/qudt#")
    UNIT = Namespace("http://qudt.org/1.1/vocab/unit#")
    COMPONENT = Namespace("http://crc.nd.edu/schema/component#")

    ds = rdflib.Dataset(default_union=True)
    ds.bind("prov", PROV)
    ds.bind("qudt", QUDT)
    ds.bind("component", COMPONENT)

    with open(csvfile, 'rb') as f:
        reader = csv.reader(f)
        # id[0], namegbxml[1], namearch[2], iswindow[3],
        # thickness[4], embodiedenergy[5], eeunit_id[6],
        # matdensityarch[7], matdensitygbxml[8], densityunit_id[9],
        # unitcostmat[10], unitcostmle[11], unitcostttl[12],
        # financialunit_id[13], lifeexpectancy[14], maintenancefactor[15]
        # infosource[16], confidence[17]
        for row in reader:
            # Generate a new UUID for the component
            componentid = 'urn:green-matdb:' + str(uuid.uuid4())
            ds.add((URIRef(componentid), RDF.type, COMPONENT.Component))
            ds.add((URIRef(componentid), COMPONENT.gbxmlname, Literal(row[1])))
            ds.add((URIRef(componentid), COMPONENT.archname, Literal(row[2])))

            # Check to see if this component is a window
            if row[3] == '1':
                ds.add((URIRef(componentid), RDF.type, COMPONENT.Window))

            # Check to see if we have a thickness
            # thicknessid = 'urn:green-matdb:' + str(uuid.uuid4())
            thicknessid = BNode()
            ds.add((URIRef(componentid), COMPONENT.hasThickness, thicknessid))
            ds.add((thicknessid, RDF.type, QUDT.QuantityValue))
            ds.add((thicknessid, QUDT.numericValue, Literal(row[4], datatype=XSD.float)))
            ds.add((thicknessid, QUDT.unit, UNIT.Inch))

            embodiedenergy = BNode()
            ds.add((URIRef(componentid), COMPONENT.hasEmbodiedEnergy, embodiedenergy))
            ds.add((embodiedenergy, RDF.type, QUDT.QuantityValue))
            ds.add((embodiedenergy, QUDT.numericValue, Literal(row[5], datatype=XSD.float)))
            if row[6] == '1':
                ds.add((embodiedenergy, QUDT.unit, UNIT.BtuPerPound))
            elif row[6] == '2':
                # This QUDT unit doesn't exist. Unit is JoulePerKilogram.
                # Need to create a new derived unit.
                ds.add((embodiedenergy, QUDT.unit, UNIT.MegaJoulePerKilogram))

            materialdensityArch = BNode()
            ds.add((URIRef(componentid), COMPONENT.hasMaterialDensity, materialdensityArch))
            ds.add((materialdensityArch, COMPONENT.hasSource, COMPONENT.Archsource))
            ds.add((materialdensityArch, RDF.type, QUDT.QuantityValue))
            ds.add((materialdensityArch, QUDT.numericValue, Literal(row[7], datatype=XSD.float)))
            if row[9] == '1':
                ds.add((materialdensityArch, QUDT.unit, UNIT.KilogramPerCubicMeter))
            elif row[9] == '2':
                ds.add((materialdensityArch, QUDT.unit, UNIT.PoundPerCubicFoot))

            materialdensitygbxml = BNode()
            ds.add((URIRef(componentid), COMPONENT.hasMaterialDensity, materialdensitygbxml))
            # Attach the gbXML source to the gbXML density node
            ds.add((materialdensitygbxml, COMPONENT.hasSource, COMPONENT.gbxmlsource))
            ds.add((materialdensitygbxml, RDF.type, QUDT.QuantityValue))
            ds.add((materialdensitygbxml, QUDT.numericValue, Literal(row[8], datatype=XSD.float)))
            if row[9] == '1':
                ds.add((materialdensitygbxml, QUDT.unit, UNIT.KilogramPerCubicMeter))
            elif row[9] == '2':
                ds.add((materialdensitygbxml, QUDT.unit, UNIT.PoundPerCubicFoot))

            unitcostmat = BNode()
            ds.add((URIRef(componentid), COMPONENT.hasUnitCost, unitcostmat))
            ds.add((unitcostmat, RDF.type, QUDT.QuantityValue))
            ds.add((unitcostmat, QUDT.numericValue, Literal(row[10], datatype=XSD.float)))
            unitcostMLE = BNode()

    print(ds.serialize(format="turtle"))
def mix_datasets(base_ds: ConjunctiveLike,
                 extra_ds: GraphLike,
                 target_ds: Optional[Union[ConjunctiveLike, str]] = None):
    """
    Make a clone of base_ds (dataset) and add in the triples from extra_ds (dataset).

    :param base_ds:
    :type base_ds: rdflib.Dataset
    :param extra_ds:
    :type extra_ds: rdflib.Dataset
    :param target_ds:
    :type target_ds: rdflib.Dataset|str|NoneType
    :return: The cloned Dataset with mixed in triples from extra_ds
    :rtype: rdflib.Dataset
    """
    default_union = base_ds.default_union
    base_named_graphs = list(base_ds.contexts())
    if target_ds is None:
        target_ds = rdflib.Dataset(default_union=default_union)
    elif isinstance(target_ds, rdflib.ConjunctiveGraph):
        raise RuntimeError(
            "Cannot mix new graphs into a ConjunctiveGraph, use Dataset instead.")
    elif target_ds == "inplace":
        pass  # do nothing here
    elif not isinstance(target_ds, rdflib.Dataset):
        raise RuntimeError(
            "Cannot mix datasets if target_ds passed in is not a Dataset itself.")
    if isinstance(extra_ds, (rdflib.Dataset, rdflib.ConjunctiveGraph)):
        mixin_graphs = list(extra_ds.contexts())
    else:
        mixin_graphs = [extra_ds]
    if target_ds == "inplace":
        target_ds = base_ds
        for mg in mixin_graphs:
            mod_named_graphs = {
                g.identifier: mix_graphs(g, mg, target_graph="inplace")
                for g in base_named_graphs
            }
    elif isinstance(target_ds, str):
        raise RuntimeError("target_ds cannot be a string (unless it is 'inplace')")
    else:
        mixed_graphs = {}
        for mg in mixin_graphs:
            mod_named_graphs = {
                g.identifier: mix_graphs(
                    g, mg,
                    target_graph=rdflib.Graph(store=target_ds.store,
                                              identifier=g.identifier))
                for g in base_named_graphs
            }
            mixed_graphs.update(mod_named_graphs)
        default_context_id = target_ds.default_context.identifier
        for i, m in mixed_graphs.items():
            if i == default_context_id:
                target_ds.store.remove_graph(target_ds.default_context)
                target_ds.default_context = m
            target_ds.add_graph(m)
    return target_ds
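# Illustrative usage sketch (not part of the module above; assumes mix_graphs and
# the other helpers it relies on are importable from the same module). Graph names
# and triples are placeholders.
import rdflib

EX = rdflib.Namespace("urn:example:")
base = rdflib.Dataset()
extra = rdflib.Dataset()
base.graph(EX.g1).add((EX.s, rdflib.RDF.type, EX.Thing))
extra.graph(EX.g1).add((EX.s, rdflib.RDFS.label, rdflib.Literal("thing")))

mixed = mix_datasets(base, extra)                 # fresh Dataset, inputs untouched
mix_datasets(base, extra, target_ds="inplace")    # or mix directly into base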
def load_from_source(
    source: Union[GraphLike, BufferedIOBase, TextIOBase, BinaryIO, Union[str, bytes]],
    g: Optional[GraphLike] = None,
    rdf_format: Optional[str] = None,
    multigraph: bool = False,
    do_owl_imports: Union[bool, int] = False,
    import_chain: Optional[List[Union[rdflib.URIRef, str]]] = None,
):
    """
    :param source:
    :param g:
    :type g: rdflib.Graph | None
    :param rdf_format:
    :type rdf_format: str
    :param multigraph:
    :type multigraph: bool
    :param do_owl_imports:
    :type do_owl_imports: bool|int
    :param import_chain:
    :type import_chain: list | None
    :return:
    """
    source_is_graph = False
    open_source: Optional[Union[BufferedIOBase, BinaryIO]] = None
    source_was_open: bool = False
    source_as_file: Optional[Union[BufferedIOBase, BinaryIO]] = None
    source_as_filename: Optional[str] = None
    source_as_bytes: Optional[bytes] = None
    filename = None
    public_id = None
    uri_prefix = None
    is_imported_graph = do_owl_imports and isinstance(
        do_owl_imports, int) and do_owl_imports > 1
    if isinstance(source, (rdflib.Graph, rdflib.ConjunctiveGraph, rdflib.Dataset)):
        source_is_graph = True
        if g is None:
            g = source
        else:
            raise RuntimeError(
                "Cannot pass in both target=rdflib.Graph/Dataset and g=graph.")
    elif isinstance(source, (BufferedIOBase, TextIOBase)):
        if hasattr(source, 'name'):
            filename = source.name  # type: ignore
            public_id = Path(filename).resolve().as_uri() + "#"
        if isinstance(source, TextIOBase):
            buf = getattr(source, "buffer")  # type: BufferedIOBase
            source_as_file = source = buf
        else:
            source_as_file = source
        if hasattr(source, 'closed'):
            if not bool(source.closed):
                open_source = source
                source_was_open = True
        else:
            # Assume it is open now and it was open when we started.
            open_source = source
            source_was_open = True
    elif isinstance(source, str):
        pid = os.getpid()
        fd0 = "/proc/{}/fd/0".format(str(pid))
        if is_windows and source.startswith('file:///'):
            public_id = source
            filename = source[8:]
            source_as_filename = filename
        elif not is_windows and source.startswith('file://'):
            public_id = source
            filename = source[7:]
            source_as_filename = filename
        elif source.startswith('http:') or source.startswith('https:'):
            public_id = source
            try:
                resp, rdf_format = get_rdf_from_web(source)
            except HTTPError:
                if is_imported_graph:
                    return g
                else:
                    raise
            if rdf_format == 'graph':
                source = resp
                source_is_graph = True
            else:
                filename = resp.geturl()
                fp = resp.fp  # type: BufferedIOBase
                source_was_open = False
                source = open_source = fp
        else:
            first_char = source[0]
            if is_windows and (first_char == '\\' or
                               (len(source) > 3 and source[1:3] == ":\\")):
                filename = source
                source_as_filename = filename
            elif first_char == '/' or (len(source) > 2 and source[0:2] == "./"):
                filename = source
                source_as_filename = filename
            elif (first_char == '#' or first_char == '@'
                  or first_char == '<' or first_char == '\n'
                  or first_char == '{' or first_char == '['):
                # Contains some JSON or XML or Turtle chars, it's not a path
                source_as_file = None
                source_as_filename = None
            elif len(source) >= 32 and '\n' in source[:32]:
                # Contains a new line near the start of the file, can't be a path
                source_as_file = None
                source_as_filename = None
            elif len(source) < 140:
                filename = source
                source_as_filename = filename
        if source_as_filename and filename:
            if filename == "stdin" or filename == "/dev/stdin" or filename == "-" or filename == fd0:
                source = source_as_file = open_source = sys.stdin.buffer
                source_was_open = True
            else:
                try:
                    filename = os.readlink(filename)
                    if filename == fd0 or filename == "/dev/stdin":
                        source = source_as_file = open_source = sys.stdin.buffer
                        source_was_open = True
                except OSError:
                    pass
        # TODO: Do we still need this? Not sure why this was added, but works better without it
        # if public_id and not public_id.endswith('#'):
        #     public_id = "{}#".format(public_id)
        if not source_as_file and not source_as_filename and not open_source and isinstance(source, str):
            # source is raw RDF data.
            source_as_bytes = source = source.encode('utf-8')
    elif isinstance(source, bytes):
        if source.startswith(b'file:') or source.startswith(b'http:') or source.startswith(b'https:'):
            raise ValueError(
                "file:// and http:// strings should be given as str, not bytes.")
        first_char_b: bytes = source[0:1]
        if (first_char_b == b'#' or first_char_b == b'@'
                or first_char_b == b'<' or first_char_b == b'\n'
                or first_char_b == b'{' or first_char_b == b'['):
            # Contains some JSON or XML or Turtle stuff
            source_as_file = None
            source_as_filename = None
        elif len(source) < 140:
            filename = source.decode('utf-8')
            source_as_filename = filename
        if not source_as_file and not source_as_filename and not open_source:
            source_as_bytes = source
    else:
        raise ValueError("Cannot determine the format of the input graph")
    if g is None:
        if source_is_graph:
            target_g: Union[rdflib.Graph, rdflib.ConjunctiveGraph, rdflib.Dataset] = source  # type: ignore
        else:
            target_g = rdflib.Dataset() if multigraph else rdflib.Graph()
    else:
        if not isinstance(g, (rdflib.Graph, rdflib.Dataset, rdflib.ConjunctiveGraph)):
            raise RuntimeError("Passing in 'g' must be a rdflib Graph or Dataset.")
        target_g = g
    if filename:
        if filename.endswith('.ttl'):
            rdf_format = rdf_format or 'turtle'
        elif filename.endswith('.nt'):
            rdf_format = rdf_format or 'nt'
        elif filename.endswith('.n3'):
            rdf_format = rdf_format or 'n3'
        elif filename.endswith('.json'):
            rdf_format = rdf_format or 'json-ld'
        elif filename.endswith('.nq') or filename.endswith('.nquads'):
            rdf_format = rdf_format or 'nquads'
        elif filename.endswith('.trig'):
            rdf_format = rdf_format or 'trig'
        elif filename.endswith('.xml') or filename.endswith('.rdf'):
            rdf_format = rdf_format or 'xml'
    if source_as_filename and filename is not None and not open_source:
        filename = str(Path(filename).resolve())
        if not public_id:
            public_id = Path(filename).as_uri() + "#"
        source = open_source = open(filename, mode='rb')
    if not open_source and source_as_bytes:
        source = open_source = BytesIO(source_as_bytes)  # type: ignore
    if open_source:
        _source = open_source
        # Check if we can seek
        try:
            _source.seek(0)  # type: ignore
        except (AttributeError, UnsupportedOperation):
            # Read it all into memory
            new_bytes = BytesIO(_source.read())
            if not source_was_open:
                _source.close()
            source = _source = new_bytes
            source_was_open = False
        if rdf_format is None:
            line = _source.readline().lstrip()
            if len(line) > 15:
                line = line[:15]
            line = line.lower()
            if line.startswith(b"<!doctype html") or line.startswith(b"<html"):
                raise RuntimeError("Attempted to load a HTML document as RDF.")
            if line.startswith(b"<?xml") or line.startswith(b"<xml") or line.startswith(b"<rdf:"):
                rdf_format = "xml"
            if line.startswith(b"@base ") or line.startswith(b"@prefix ") or line.startswith(b"PREFIX "):
                rdf_format = "turtle"
            try:
                _source.seek(0)
            except (AttributeError, UnsupportedOperation):
                raise RuntimeError("Seek failed while identifying file type.")
            except ValueError:
                raise RuntimeError("File closed while identifying file type.")
        if rdf_format == 'turtle' or rdf_format == 'n3':
            # SHACL Shapes files and Data files can have extra RDF Metadata in the
            # Top header block, including #BaseURI and #Prefix.
            # The @base line is not read here, but it is parsed in the n3 parser
            while True:
                try:
                    line = _source.readline()
                    assert line is not None and len(line) > 0
                except AssertionError:
                    break
                # Strip line from start
                while len(line) > 0 and line[0:1] in b' \t\n\r\x0B\x0C\x85\xA0':
                    line = line[1:]
                # We reached the end of the line, check the next line
                if len(line) < 1:
                    continue
                # If this is not a comment, then this is the first non-comment line, we're done.
                if not line[0:1] == b'#':
                    break
                # Strip from start again, but now removing hashes too.
                while len(line) > 0 and line[0:1] in b'# \t\xA0':
                    line = line[1:]
                # Strip line from end
                while len(line) > 0 and line[-1:] in b' \t\n\r\x0B\x0C\x85\xA0':
                    line = line[:-1]
                spl = line.split(b':', 1)
                if len(spl) < 2:
                    continue
                keyword = spl[0].lower()
                # Strip keyword end
                while len(keyword) > 0 and keyword[-1:] in b' \t\n\r\x0B\x0C\x85\xA0':
                    keyword = keyword[:-1]
                if len(keyword) < 1:
                    continue
                wordval = spl[1]
                # Strip wordval start
                while len(wordval) > 0 and wordval[0:1] in b' \t\n\r\x0B\x0C\x85\xA0':
                    wordval = wordval[1:]
                if len(wordval) < 1:
                    continue
                wordval_str = wordval.decode('utf-8')
                if keyword == b"baseuri":
                    public_id = wordval_str
                elif keyword == b"prefix":
                    uri_prefix = wordval_str
            try:
                _source.seek(0)
            except (AttributeError, UnsupportedOperation):
                raise RuntimeError("Seek failed while pre-parsing Turtle File.")
            except ValueError:
                raise RuntimeError("File closed while pre-parsing Turtle File.")
        target_g.parse(source=_source, format=rdf_format, publicID=public_id)
        # If the target was open to begin with, leave it open.
        if not source_was_open:
            _source.close()
        elif hasattr(_source, 'seek'):
            try:
                _source.seek(0)
            except (AttributeError, UnsupportedOperation):
                pass
            except ValueError:
                # The parser closed our file!
                pass
        source_is_graph = True
    elif source_is_graph and (target_g != source):
        # clone source into g
        if isinstance(target_g, (rdflib.Dataset, rdflib.ConjunctiveGraph)) and isinstance(
                source, (rdflib.Dataset, rdflib.ConjunctiveGraph)):
            clone_dataset(source, target_g)
        elif isinstance(target_g, rdflib.Graph) and isinstance(
                source, (rdflib.Dataset, rdflib.ConjunctiveGraph)):
            raise RuntimeError("Cannot load a Dataset source into a Graph target.")
        elif isinstance(target_g, (rdflib.Dataset, rdflib.ConjunctiveGraph)) and isinstance(
                source, rdflib.Graph):
            target = rdflib.Graph(store=target_g.store, identifier=public_id)
            clone_graph(source, target)
        elif isinstance(target_g, rdflib.Graph) and isinstance(source, rdflib.Graph):
            clone_graph(source, target_g)
        else:
            raise RuntimeError("Cannot merge source graph into target graph.")
    if not source_is_graph:
        raise RuntimeError("Error opening graph from source.")
    if public_id:
        if uri_prefix:
            if is_imported_graph and uri_prefix == '':
                # Don't reassign blank prefix, when importing subgraph
                pass
            else:
                has_named_prefix = target_g.store.namespace(uri_prefix)
                if not has_named_prefix:
                    target_g.namespace_manager.bind(uri_prefix, public_id)
        elif not is_imported_graph:
            existing_blank_prefix = target_g.store.namespace('')
            if not existing_blank_prefix:
                target_g.namespace_manager.bind('', public_id)
    if do_owl_imports:
        if isinstance(do_owl_imports, int):
            if do_owl_imports > 3:
                return target_g
        else:
            do_owl_imports = 1
        if import_chain is None:
            import_chain = []
        if public_id and (public_id.endswith('#') or public_id.endswith('/')):
            root_id: Union[rdflib.URIRef, None] = rdflib.URIRef(public_id[:-1])
        else:
            root_id = rdflib.URIRef(public_id) if public_id else None
        done_imports = 0
        if root_id is not None:
            if isinstance(target_g, (rdflib.ConjunctiveGraph, rdflib.Dataset)):
                gs = list(target_g.contexts())
            else:
                gs = [target_g]
            for ng in gs:
                owl_imports = list(ng.objects(root_id, rdflib.OWL.imports))
                if len(owl_imports) > 0:
                    import_chain.append(root_id)
                for o in owl_imports:
                    if o in import_chain:
                        continue
                    load_from_source(
                        o,
                        g=target_g,
                        multigraph=multigraph,
                        do_owl_imports=do_owl_imports + 1,
                        import_chain=import_chain,
                    )
                    done_imports += 1
        if done_imports < 1 and public_id is not None and root_id != public_id:
            public_id_uri = rdflib.URIRef(public_id)
            if isinstance(target_g, (rdflib.ConjunctiveGraph, rdflib.Dataset)):
                gs = list(target_g.contexts())
            else:
                gs = [target_g]
            for ng in gs:
                owl_imports = list(ng.objects(public_id_uri, rdflib.OWL.imports))
                if len(owl_imports) > 0:
                    import_chain.append(public_id_uri)
                for o in owl_imports:
                    if o in import_chain:
                        continue
                    load_from_source(
                        o,
                        g=target_g,
                        multigraph=multigraph,
                        do_owl_imports=do_owl_imports + 1,
                        import_chain=import_chain,
                    )
                    done_imports += 1
        if done_imports < 1:
            if isinstance(target_g, (rdflib.ConjunctiveGraph, rdflib.Dataset)):
                gs = list(target_g.contexts())
            else:
                gs = [target_g]
            for ng in gs:
                ontologies = ng.subjects(rdflib.RDF.type, rdflib.OWL.Ontology)
                for ont in ontologies:
                    if ont == root_id or ont == public_id:
                        continue
                    if ont in import_chain:
                        continue
                    owl_imports = list(ng.objects(ont, rdflib.OWL.imports))
                    if len(owl_imports) > 0:
                        import_chain.append(ont)
                    for o in owl_imports:
                        if o in import_chain:
                            continue
                        load_from_source(
                            o,
                            g=target_g,
                            multigraph=multigraph,
                            do_owl_imports=do_owl_imports + 1,
                            import_chain=import_chain,
                        )
                        done_imports += 1
    return target_g
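# Illustrative usage sketch (not part of the function above; the file path is a
# placeholder): with multigraph=True the loader is expected to build an
# rdflib.Dataset, so named graphs in a TriG source stay separate, and
# do_owl_imports also follows owl:imports statements it finds.
import rdflib

ds = load_from_source("./data/example.trig", multigraph=True, do_owl_imports=True)
assert isinstance(ds, rdflib.Dataset)
for ctx in ds.contexts():
    print(ctx.identifier, len(ctx))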
def main(fp='data/onstage.nt'):
    # If there was no format issue in the streets data, this function would
    # work. Instead, download the data yourself and point to it:
    # datasets = downloadDatasets(datasets=(GEBOUWEN, PERSONS, WIJKEN))

    dsG = rdflib.Dataset()  # rdflib Dataset
    rdfSubject.db = dsG  # hook onto rdfAlchemy

    TITLE = ["ONSTAGE"]
    DESCRIPTION = [
        Literal(
            """Online Datasystem of Theatre in Amsterdam from the Golden Age to the present.
This is your address for questions about the repertoire, performances, popularity and
revenues of the cultural program in Amsterdam’s public theatre during the period
1637 - 1772. All data provided in this system links to archival source materials in
contemporary administration.

The [Shows page](http://www.vondel.humanities.uva.nl/onstage/shows/) gives you access
by date to chronological lists of the theater program, and the plays staged per day.
At the [Plays page](http://www.vondel.humanities.uva.nl/onstage/plays/) you have access
to the repertoire by title, and for each play you will find its performances and
revenues throughout time. At the [Persons page](http://www.vondel.humanities.uva.nl/onstage/persons/)
you can access the data for playwrights, actors and actresses, and translators involved
in the rich national and international variety of the Amsterdam Theater productions.

Go see your favorite play!""",
            lang='en')
    ]
    DATE = Literal(datetime.datetime.now().strftime('%Y-%m-%d'),
                   datatype=XSD.datetime)

    ds = Dataset(
        create.term('id/onstage/'),
        label=TITLE,
        name=TITLE,
        dctitle=TITLE,
        description=DESCRIPTION,
        dcdescription=DESCRIPTION,
        image=URIRef("http://www.vondel.humanities.uva.nl/onstage/images/logo.png"),
        url=[URIRef("http://www.vondel.humanities.uva.nl/onstage/")],
        temporalCoverage=[Literal("1637-01-01/1772-12-31")],
        spatialCoverage=[Literal("Amsterdam")],
        dateModified=DATE,
        dcdate=DATE,
        dcmodified=DATE,
        licenseprop=URIRef("https://creativecommons.org/publicdomain/zero/1.0/"))

    # Add the datasets as separate graphs. Metadata on these graphs is in the
    # default graph.
    guri = create.term('id/onstage/')

    # download = DataDownload(None,
    #                         contentUrl=URIRef(uri),
    #                         encodingFormat="application/turtle")

    g = rdflib.Graph(identifier=guri)
    g.bind('schema', schema)
    g.bind('foaf', foaf)
    g.bind('dcterms', dcterms)
    g.bind('owl', OWL)
    g.bind('pnv', Namespace('https://w3id.org/pnv#'))
    g.bind('onstage',
           Namespace('http://www.vondel.humanities.uva.nl/onstage/lod/vocab/#'))
    g.bind('bio', Namespace('http://purl.org/vocab/bio/0.1/'))
    g.bind('sem', Namespace('http://semanticweb.cs.vu.nl/2009/11/sem/#'))
    g.bind('skos', Namespace('http://www.w3.org/2004/02/skos/core#'))
    g.bind('time', Namespace('http://www.w3.org/2006/time#'))

    g.parse(fp, format='nt')
    dsG.add_graph(g)

    ds.triples = sum(1 for i in g.subjects())

    dsG.bind('void', void)
    dsG.bind('dcterms', dcterms)
    dsG.bind('schema', schema)

    print("Serializing!")
    dsG.serialize('datasets/onstage.trig', format='trig')
def load_from_source(source,
                     g=None,
                     rdf_format=None,
                     multigraph=False,
                     do_owl_imports=False,
                     import_chain=None):
    """
    :param source:
    :param g:
    :type g: rdflib.Graph
    :param rdf_format:
    :type rdf_format: str
    :param multigraph:
    :type multigraph: bool
    :param do_owl_imports:
    :type do_owl_imports: bool|int
    :param import_chain:
    :type import_chain: dict
    :return:
    """
    source_is_graph = False
    source_is_open = False
    source_was_open = False
    source_is_file = False
    source_is_bytes = False
    filename = None
    public_id = None
    uri_prefix = None
    is_imported_graph = do_owl_imports and isinstance(do_owl_imports, int) \
        and do_owl_imports > 1
    if isinstance(source, (rdflib.Graph, rdflib.ConjunctiveGraph, rdflib.Dataset)):
        source_is_graph = True
        if g is None:
            g = source
        else:
            raise RuntimeError(
                "Cannot pass in both target=rdflib.Graph/Dataset and g=graph.")
    elif isinstance(source, IOBase) and hasattr(source, 'read'):
        source_is_file = True
        if hasattr(source, 'closed'):
            source_is_open = not bool(source.closed)
            source_was_open = source_is_open
        else:
            # Assume it is open now and it was open when we started.
            source_is_open = True
            source_was_open = True
        filename = source.name
        public_id = Path(filename).resolve().as_uri() + "#"
    elif isinstance(source, str):
        if is_windows and source.startswith('file:///'):
            public_id = source
            source_is_file = True
            filename = source[8:]
        elif not is_windows and source.startswith('file://'):
            public_id = source
            source_is_file = True
            filename = source[7:]
        elif source.startswith('http:') or source.startswith('https:'):
            public_id = source
            try:
                source, rdf_format = get_rdf_from_web(source)
            except HTTPError:
                if is_imported_graph:
                    return g
                else:
                    raise
            source_is_open = True
            filename = source.geturl()
        else:
            first_char = source[0]
            if is_windows and (first_char == '\\' or
                               (len(source) > 3 and source[1:3] == ":\\")):
                source_is_file = True
                filename = source
            elif first_char == '/' or source[0:2] == "./":
                source_is_file = True
                filename = source
            elif first_char == '#' or first_char == '@' \
                    or first_char == '<' or first_char == '\n' \
                    or first_char == '{' or first_char == '[':
                # Contains some JSON or XML or Turtle stuff
                source_is_file = False
            elif len(source) < 140:
                source_is_file = True
                filename = source
        if public_id and not public_id.endswith('#'):
            public_id = "{}#".format(public_id)
        if not source_is_file and not source_is_open:
            source = source.encode('utf-8')
            source_is_bytes = True
    elif isinstance(source, bytes):
        if (is_windows and source.startswith(b'file:///')) or \
                (not is_windows and source.startswith(b'file://')) or \
                source.startswith(b'http:') or source.startswith(b'https:'):
            raise ValueError(
                "file:// and http:// strings should be given as str, not bytes.")
        first_char = source[0:1]
        if first_char == b'#' or first_char == b'@' \
                or first_char == b'<' or first_char == b'\n' \
                or first_char == b'{' or first_char == b'[':
            # Contains some JSON or XML or Turtle stuff
            source_is_file = False
        elif len(source) < 140:
            source_is_file = True
            filename = source.decode('utf-8')
        if not source_is_file:
            source_is_bytes = True
    else:
        raise ValueError("Cannot determine the format of the input graph")
    if g is None:
        g = rdflib.Dataset() if multigraph else rdflib.Graph()
    else:
        if not isinstance(g, (rdflib.Graph, rdflib.Dataset, rdflib.ConjunctiveGraph)):
            raise RuntimeError("Passing in g must be a Graph.")
    if filename:
        if filename.endswith('.ttl'):
            rdf_format = rdf_format or 'turtle'
        elif filename.endswith('.nt'):
            rdf_format = rdf_format or 'nt'
        elif filename.endswith('.n3'):
            rdf_format = rdf_format or 'n3'
        elif filename.endswith('.json'):
            rdf_format = rdf_format or 'json-ld'
        elif filename.endswith('.nq') or filename.endswith('.nquads'):
            rdf_format = rdf_format or 'nquads'
        elif filename.endswith('.trig'):
            rdf_format = rdf_format or 'trig'
        elif filename.endswith('.xml') or filename.endswith('.rdf'):
            rdf_format = rdf_format or 'xml'
    if source_is_file and filename and not source_is_open:
        filename = Path(filename).resolve()
        if not public_id:
            public_id = Path(filename).as_uri() + "#"
        source = open(filename, mode='rb')
        source_is_open = True
    if source_is_open:
        data = source.read()
        # If the target was open to begin with, leave it open.
        if not source_was_open:
            source.close()
        elif hasattr(source, 'seek'):
            try:
                source.seek(0)
            except Exception:
                pass
        source = data
        source_is_bytes = True
    if source_is_bytes:
        source = BytesIO(source)
        if (rdf_format == "json-ld" or rdf_format == "json") and not has_json_ld:
            raise RuntimeError(
                "Cannot load a JSON-LD file if rdflib_jsonld is not installed.")
        if rdf_format == 'turtle' or rdf_format == 'n3':
            # SHACL Shapes files and Data files can have extra RDF Metadata in the
            # Top header block, including #BaseURI and #Prefix.
            while True:
                try:
                    l = source.readline()
                    assert l is not None and len(l) > 0
                except AssertionError:
                    break
                # Strip line from start
                while len(l) > 0 and l[0:1] in b' \t\n\r\x0B\x0C\x85\xA0':
                    l = l[1:]
                # We reached the end of the line, check the next line
                if len(l) < 1:
                    continue
                # If this is not a comment, then this is the first non-comment line, we're done.
                if not l[0:1] == b'#':
                    break
                # Strip from start again, but now removing hashes too.
                while len(l) > 0 and l[0:1] in b'# \t\xA0':
                    l = l[1:]
                # Strip line from end
                while len(l) > 0 and l[-1:] in b' \t\n\r\x0B\x0C\x85\xA0':
                    l = l[:-1]
                spl = l.split(b':', 1)
                if len(spl) < 2:
                    continue
                keyword = spl[0].lower()
                # Strip keyword end
                while len(keyword) > 0 and keyword[-1:] in b' \t\n\r\x0B\x0C\x85\xA0':
                    keyword = keyword[:-1]
                if len(keyword) < 1:
                    continue
                wordval = spl[1]
                # Strip wordval start
                while len(wordval) > 0 and wordval[0:1] in b' \t\n\r\x0B\x0C\x85\xA0':
                    wordval = wordval[1:]
                if len(wordval) < 1:
                    continue
                wordval = wordval.decode('utf-8')
                if keyword == b"baseuri":
                    public_id = wordval
                elif keyword == b"prefix":
                    uri_prefix = wordval
        source.seek(0)
        g.parse(source=source, format=rdf_format, publicID=public_id)
        source_is_graph = True
    if not source_is_graph:
        raise RuntimeError("Error opening graph from source.")
    if public_id:
        if uri_prefix:
            if is_imported_graph and uri_prefix == '':
                # Don't reassign blank prefix, when importing subgraph
                pass
            else:
                has_named_prefix = g.store.namespace(uri_prefix)
                if not has_named_prefix:
                    g.namespace_manager.bind(uri_prefix, public_id)
        elif not is_imported_graph:
            existing_blank_prefix = g.store.namespace('')
            if not existing_blank_prefix:
                g.namespace_manager.bind('', public_id)
    if do_owl_imports:
        if isinstance(do_owl_imports, int):
            if do_owl_imports > 3:
                return g
        else:
            do_owl_imports = 1
        if import_chain is None:
            import_chain = []
        if public_id and (public_id.endswith('#') or public_id.endswith('/')):
            root_id = rdflib.URIRef(public_id[:-1])
        else:
            root_id = rdflib.URIRef(public_id) if public_id else None
        done_imports = 0
        if root_id is not None:
            if isinstance(g, (rdflib.ConjunctiveGraph, rdflib.Dataset)):
                gs = list(g.contexts())
            else:
                gs = [g]
            for ng in gs:
                owl_imports = list(ng.objects(root_id, rdflib.OWL.imports))
                if len(owl_imports) > 0:
                    import_chain.append(root_id)
                for o in owl_imports:
                    if o in import_chain:
                        continue
                    load_from_source(o,
                                     g=g,
                                     multigraph=multigraph,
                                     do_owl_imports=do_owl_imports + 1,
                                     import_chain=import_chain)
                    done_imports += 1
        if done_imports < 1 and public_id is not None and root_id != public_id:
            public_id_uri = rdflib.URIRef(public_id)
            if isinstance(g, (rdflib.ConjunctiveGraph, rdflib.Dataset)):
                gs = list(g.contexts())
            else:
                gs = [g]
            for ng in gs:
                owl_imports = list(ng.objects(public_id_uri, rdflib.OWL.imports))
                if len(owl_imports) > 0:
                    import_chain.append(public_id_uri)
                for o in owl_imports:
                    if o in import_chain:
                        continue
                    load_from_source(o,
                                     g=g,
                                     multigraph=multigraph,
                                     do_owl_imports=do_owl_imports + 1,
                                     import_chain=import_chain)
                    done_imports += 1
        if done_imports < 1:
            if isinstance(g, (rdflib.ConjunctiveGraph, rdflib.Dataset)):
                gs = list(g.contexts())
            else:
                gs = [g]
            for ng in gs:
                ontologies = ng.subjects(rdflib.RDF.type, rdflib.OWL.Ontology)
                for ont in ontologies:
                    if ont == root_id or ont == public_id:
                        continue
                    if ont in import_chain:
                        continue
                    owl_imports = list(ng.objects(ont, rdflib.OWL.imports))
                    if len(owl_imports) > 0:
                        import_chain.append(ont)
                    for o in owl_imports:
                        if o in import_chain:
                            continue
                        load_from_source(o,
                                         g=g,
                                         multigraph=multigraph,
                                         do_owl_imports=do_owl_imports + 1,
                                         import_chain=import_chain)
                        done_imports += 1
    return g
def main():
    # If there was no format issue in the streets data, this function would
    # work. Instead, download the data yourself and point to it:
    # datasets = downloadDatasets(datasets=(GEBOUWEN, PERSONS, WIJKEN))
    datasets = [
        ('https://adamlink.nl/data/rdf/streets', 'data/adamlinkstraten.ttl'),
        ('https://adamlink.nl/data/rdf/buildings', 'data/adamlinkgebouwen.ttl'),
        ('https://adamlink.nl/data/rdf/districts', 'data/adamlinkbuurten.ttl'),
        ('https://adamlink.nl/data/rdf/persons', 'data/adamlinkpersonen.ttl')
    ]

    dsG = rdflib.Dataset()  # rdflib Dataset
    rdfSubject.db = dsG  # hook onto rdfAlchemy

    TITLE = ["Adamlink"]
    DESCRIPTION = [
        Literal(
            """Adamlink, een project van [Stichting AdamNet](http://www.adamnet.nl), wil
Amsterdamse collecties verbinden en als LOD beschikbaar maken.

Om collecties te verbinden hebben we identifiers
([URIs](https://nl.wikipedia.org/wiki/Uniform_resource_identifier)) voor concepten als
straten, personen en gebouwen nodig. Vaak zijn die al beschikbaar, bijvoorbeeld in de
[BAG](https://nl.wikipedia.org/wiki/Basisregistraties_Adressen_en_Gebouwen),
[RKDartists](https://rkd.nl/nl/explore/artists) of [Wikidata](https://www.wikidata.org).

Hier voegen we onze eigen Adamlink URIs aan die identifiers toe. Niet omdat we die beter
vinden dan BAG, RKDartists of Wikidata, maar omdat bepaalde concepten - verdwenen straten
bijvoorbeeld - niet in genoemde authority sets terug te vinden zijn. En omdat we op
Adamlink allerlei naamvarianten van concepten bijeen kunnen brengen.

We proberen Adamlink als hub laten fungeren, door bijvoorbeeld bij een straat naar zowel
BAG als Wikidata te verwijzen. Regelmatig nemen we data eerst op Adamlink op, bijvoorbeeld
alle geportretteerden die we in de beeldbank van het Stadsarchief tegenkomen, om die
personen vervolgens (zowel scriptsgewijs als handmatig) te verbinden met bestaande
authority sets als Wikidata, Ecartico of RKDartists.

Maakt en publiceert u data met (historische) straat-, gebouw- of persoonsnamen? Gebruik
dan altijd een identifier die door zoveel mogelijk anderen ook gebruikt wordt. U heeft dan
toegang tot alle andere informatie die over zo'n concept beschikbaar is, zoals
naamsvarianten of de locatie of de tijd waarin het concept leefde of bestond. En u
verbindt uw data ook met de collecties van Amsterdamse erfgoedinstellingen.""",
            lang='nl'),
        Literal("Reference data for Amsterdam collections.", lang='en')
    ]
    DATE = Literal(datetime.datetime.now().strftime('%Y-%m-%d'),
                   datatype=XSD.datetime)

    ds = Dataset(create.term('id/adamlink/'),
                 label=TITLE,
                 name=TITLE,
                 dctitle=TITLE,
                 description=DESCRIPTION,
                 dcdescription=DESCRIPTION,
                 image=URIRef("https://adamlink.nl/img/footerimg.jpg"),
                 url=[URIRef("https://www.adamlink.nl/")],
                 temporalCoverage=[Literal("1275-10-27/..")],
                 spatialCoverage=[Literal("Amsterdam")],
                 dateModified=DATE,
                 dcdate=DATE,
                 dcmodified=DATE)

    subdatasets = []

    # Add the datasets as separate graphs. Metadata on these graphs is in the
    # default graph.
    for uri, fp in datasets:
        graphtype = uri.replace(PREFIX, '')
        guri = create.term('id/adamlink/' + graphtype + '/')

        TITLE = [f"Adamlink {graphtype.title()}"]
        DESCRIPTION = [
            Literal(
                f"Data over {graphtype} uit Adamlink - Referentiedata voor Amsterdamse collecties.",
                lang='nl'),
            Literal(
                f"Data on {graphtype} from Adamlink - Reference data for Amsterdam collections.",
                lang='en')
        ]

        download = DataDownload(None,
                                contentUrl=URIRef(uri),
                                encodingFormat="application/turtle")

        subds = Dataset(guri,
                        label=TITLE,
                        name=TITLE,
                        dctitle=TITLE,
                        description=DESCRIPTION,
                        dcdescription=DESCRIPTION,
                        url=[URIRef("https://www.adamlink.nl/")],
                        temporalCoverage=[Literal("1275-10-27/..")],
                        spatialCoverage=[Literal("Amsterdam")],
                        distribution=[download])

        # Add data to the respective graph
        print("Parsing", uri)
        subgraph = rdflib.Graph(identifier=guri)
        subgraph.parse(fp, format='turtle')
        dsG.add_graph(subgraph)

        # Keep each subdataset paired with its own graph so the counts below
        # are computed per graph rather than from the last parsed graph only.
        subdatasets.append((subds, subgraph))

    print("Adding more meta data and dataset relations")
    for subds, subgraph in subdatasets:
        subds.isPartOf = ds
        subds.inDataset = ds
        subds.triples = sum(1 for i in subgraph.subjects())

    ds.hasPart = [subds for subds, _ in subdatasets]
    ds.subset = [subds for subds, _ in subdatasets]
    ds.triples = sum(
        1 for i in dsG.graph(identifier=create.term('id/adamlink/')).subjects())

    dsG.bind('void', void)
    dsG.bind('dcterms', dcterms)
    dsG.bind('schema', schema)

    print("Serializing!")
    dsG.serialize('datasets/adamlink.trig', format='trig')
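# Quick verification sketch (not part of the original script): re-load the TriG file
# written above into a fresh rdflib.Dataset and list the named graphs it contains,
# to confirm each subdataset ended up in its own graph.
import rdflib

check = rdflib.Dataset()
check.parse('datasets/adamlink.trig', format='trig')
for ctx in check.contexts():
    print(ctx.identifier, len(ctx))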