def process_dataset_description(self):
    log.info("# Processing dataset description.")
    input_data = self.dataset_description
    log.debug(f"Input data: {input_data}")
    if not dataset_description_validator(input_data):
        log.error(
            "The dataset description document did not validate. These errors were "
            "reported:"
        )
        log.error(pformat(dataset_description_validator.errors))
        raise SystemExit(1)
    log.debug("Input data validated.")

    self.data_provider = URIRef(input_data["data_provider"])
    self.file_namespace = file_namespace = input_data["file_namespace"]
    self.creation_uuid_ns = uuid.uuid5(uuid.NAMESPACE_URL, self.file_namespace)

    # initialize graph
    self.graph_uuid = uuid.uuid5(uuid.NAMESPACE_URL, file_namespace)
    graph_iri = f"{ENTITIES_NAMESPACE}{self.graph_uuid}"
    graph = self.graph = Graph(identifier=graph_iri)

    # describe graph
    s = URIRef(graph_iri)
    for p, o in [
        (RDF.type, m4p0.RDFGraph),
        (RDFS.label, Literal(f"{file_namespace} @ {self.import_time_string}")),
        (edm.dataProvider, self.data_provider),
        (dc.date, Literal(self.import_time)),
    ]:
        graph.add((s, p, o))
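# A minimal sketch of the dataset description consumed above (values are invented;
# only the two keys read here are certain, any further fields demanded by
# dataset_description_validator are not shown):
#
#     data_provider: https://example.org/museum
#     file_namespace: https://example.org/museum/collection-1/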
def __init__(self, path: Path, config: SimpleNamespace):
    log.info(f"Setting up import from {path}")
    self.config = config
    self.import_time = datetime.now()
    self.import_time_string = self.import_time.isoformat(timespec="seconds")

    log_folder = path / "logs"
    try:
        log_folder.mkdir(exist_ok=True)
    except FileNotFoundError:
        log.error(f"The import folder '{path}' doesn't exist. Aborting.")
        raise SystemExit(1)
    set_file_log_handler(log_folder / f"{self.import_time_string}.log")

    self.dataset_description = yaml.load(
        (path / "dataset.yml").read_text(), Loader=yaml.SafeLoader
    )

    self.source_files: Dict[str, Path] = {}
    for name in ("images", "audio_video", "3d", "entities"):
        data_file_path = path / f"{name}.csv"
        if data_file_path.exists():
            self.source_files[name] = data_file_path
    if not any(x in self.source_files for x in ("images", "audio_video", "3d")):
        log.error(
            "At least one of 'images.csv', 'audio_video.csv' or '3d.csv' "
            "must be present in an import folder."
        )
        raise SystemExit(1)

    self.graph: Optional[Graph] = None
    self.creation_iris: Set[URIRef] = set()
    self.creation_uuid_ns: Optional[uuid.UUID] = None
    self.encountered_filenames: Set[str] = set()
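# Sketch of an import folder as probed above (file names are taken from the code;
# dataset.yml plus at least one of the three media CSVs is required, entities.csv
# is optional and the logs folder is created on demand):
#
#     <import-folder>/
#         dataset.yml
#         images.csv
#         audio_video.csv
#         3d.csv
#         entities.csv
#         logs/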
def process_3d_data(self):
    source_file = self.source_files.get("3d")
    if source_file is None:
        log.debug("No 3D objects' metadata found.")
        return

    log.info("# Processing 3D objects' metadata.")
    self.process_metadata_file(source_file, self.add_3d_fields, _3d_validator)
    log.info("Done.")
def process_images_data(self):
    source_file = self.source_files.get("images")
    if source_file is None:
        log.debug("No images' metadata found.")
        return

    log.info("# Processing images' metadata.")
    self.process_metadata_file(source_file, self.add_core_fields, coreset_validator)
    log.info("Done.")
def process_entities_data(self):
    if self.source_files.get("entities") is None:
        log.debug("No entities' metadata found.")
        return

    log.info("# Processing entities' metadata.")
    graph = self.graph

    with self.source_files["entities"].open("rt", newline="") as f:
        csv_reader = csv.DictReader(f)
        for row in csv_reader:
            identifier = row.get("Identifier")
            if not entity_validator(row):
                log.error(
                    "An entity description did not validate. These errors "
                    f"were reported for the identifier {identifier}:"
                )
                log.error(pformat(entity_validator.errors))
                raise SystemExit(1)

            s = self.create_related_entity_iri(identifier)
            if len(set(graph.triples((None, m4p0.refersToMuseumObject, s)))) < 1:
                log.error(
                    "This identifier is not referenced in the metadata of any "
                    f"digital object in the created graph: {identifier}"
                )
                raise SystemExit(1)

            label = Literal(row["Bezeichnung"])
            graph.add((s, RDF.type, m4p0.MuseumObject))
            graph.add((s, m4p0.museumObjectTitle, label))
            graph.add((s, RDFS.label, label))
            if row.get("URL"):  # guard against an empty cell in the URL column
                graph.add((s, edm.isShownAt, URIRef(row["URL"])))

            arbitrary_fields = {
                k: v for k, v in row.items() if k not in entity_description_schema
            }
            if arbitrary_fields:
                blank_node = BNode()
                graph.add((blank_node, RDF.type, m4p0.JSONObject))
                graph.add(
                    (
                        blank_node,
                        m4p0.jsonData,
                        Literal(json.dumps(arbitrary_fields)),
                    )
                )
                graph.add((s, m4p0.isDescribedBy, blank_node))

    log.info("Done.")
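# Sketch of an entities.csv row as consumed above (column names come from the code,
# the values are invented): "Identifier" must already be referenced by some digital
# object via m4p0:refersToMuseumObject, "Bezeichnung" becomes the object's label,
# "URL" becomes edm:isShownAt, and any additional columns end up in a JSON blob
# attached with m4p0:isDescribedBy.
#
#     Identifier,Bezeichnung,URL,Material
#     obj-0001,Bronzestatuette eines Pferdes,https://example.org/objekte/obj-0001,Bronze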
def process_audio_video_data(self):
    source_file = self.source_files.get("audio_video")
    if source_file is None:
        log.debug("No audios' or videos' metadata found.")
        return

    log.info("# Processing audios' and videos' metadata")
    self.process_metadata_file(
        source_file, self.add_audio_video_fields, audio_video_validator
    )
    log.info("Done.")
def post_query(self, query: str):
    # the update is sent as the raw request body ("update via POST directly" in the
    # SPARQL 1.1 protocol), hence the explicit Content-Type header
    response = httpx.post(
        self.config.sparql_endpoint,
        auth=(self.config.sparql_user, self.config.sparql_pass),
        data=query.encode(),
        headers={
            "Content-Type": "application/sparql-update; charset=UTF-8",
            "Accept": "text/boolean",
        },
    )
    try:
        response.raise_for_status()
    except Exception:
        log.exception("Something went wrong")
        raise SystemExit(1)
    else:
        log.info(f"Received response: {response.content.decode()}")
def submit(self):
    # submit the triples to the SPARQL-endpoint
    log.info("# Submitting graph data via SPARQL.")
    graph_iri = self.graph.identifier
    turtle_representation = (
        self.graph.serialize(format="turtle").decode().splitlines()
    )

    # delete inside the named graph so that the statement also clears the graph on
    # stores that follow SPARQL 1.1 Update semantics strictly
    deletion_query = f"""\
DELETE {{ GRAPH <{graph_iri}> {{?s ?p ?o}} }}
WHERE {{ GRAPH <{graph_iri}> {{?s ?p ?o}} }}
"""

    # turn the Turtle preamble into SPARQL PREFIX declarations
    prefixes = []
    for i, line in enumerate(turtle_representation):
        if line.startswith("@prefix "):
            prefixes.append("PREFIX " + line[8:-2])
        else:
            break
    prefixes_header = "\n".join(prefixes) + "\n"
    statements = "\n".join(turtle_representation[i + 1 :])  # noqa: E203

    insert_query = f"""\
{prefixes_header}
INSERT {{ GRAPH <{graph_iri}> {{
{statements}
}} }} WHERE {{}}
"""

    if self.config.review:
        pager(insert_query)
        review_passed = input("Proceed? [yN]: ").lower()
        if not review_passed or review_passed[0] != "y":
            log.critical("User aborted after reviewing the SPARQL query.")
            raise SystemExit(1)

    log.debug("Generated SPARQL Query:")
    log.debug(insert_query)

    log.info(f"Deleting all existing triples from the graph <{graph_iri}>.")
    self.post_query(deletion_query)
    log.info(
        f"Posting generated triples to {self.config.sparql_endpoint} as "
        f"{self.config.sparql_user}."
    )
    self.post_query(insert_query)
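# For orientation, the update generated above has roughly this shape (prefixes,
# graph IRI and triples are illustrative placeholders, not output of a real run):
#
#     PREFIX rdfs: <...>
#     PREFIX edm: <...>
#
#     INSERT { GRAPH <https://example.org/entities/<graph-uuid>> {
#     <https://example.org/entities/<some-uuid>> rdfs:label "collection-1 @ 2023-01-01T12:00:00" .
#     } } WHERE {}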