def find_arcp(base_path):
    """Return the arcp URI stored as the bag's External-Identifier.

    Opens the BagIt bag rooted at ``base_path`` and looks up the
    ``External-Identifier`` entry of its ``info`` mapping.

    Raises:
        Exception: if the entry is missing or is not an arcp URI.
    """
    # First try to find External-Identifier
    bag = bagit.Bag(base_path)
    ext_id = bag.info.get("External-Identifier")
    # ``get`` yields None when the entry is absent; do not hand None to
    # arcp.is_arcp_uri(), fall through to the error below instead.
    if ext_id and arcp.is_arcp_uri(ext_id):
        return ext_id
    raise Exception("Can't find External-Identifier")
def find_arcp(self):
    """Return the bag's arcp External-Identifier, or a random arcp URI.

    Opens the BagIt bag in ``self.folder`` and looks up the
    ``External-Identifier`` entry of its ``info`` mapping. When the entry
    is missing or is not an arcp URI, the result of ``arcp.arcp_random()``
    is returned instead.
    """
    # First try to find External-Identifier
    bag = bagit.Bag(self.folder)
    ext_id = bag.info.get("External-Identifier")
    # ``get`` yields None when the entry is absent; skip the arcp check in
    # that case rather than handing None to arcp.is_arcp_uri().
    if ext_id and arcp.is_arcp_uri(ext_id):
        return ext_id
    return arcp.arcp_random()
def format_id(self, identifier):
    """Normalise ``identifier`` into a URI or a ``#``-prefixed fragment.

    arcp URIs, absolute URLs (scheme, network location and path all
    present) and values already starting with ``#`` are returned unchanged;
    anything else gets ``#`` prepended.
    """
    if is_arcp_uri(identifier):
        return identifier

    # check if it's an absolute URL
    parsed = urlparse(identifier)
    is_absolute_url = bool(parsed.scheme and parsed.netloc and parsed.path)
    if is_absolute_url or identifier.startswith('#'):
        return identifier
    return '#' + identifier
def check_ro(self):
    """Check the research object manifest of the bag in ``self.folder``.

    Parses ``metadata/manifest.json`` as JSON-LD and asserts that:

    * a research object described by the manifest exists and conforms to
      the expected cwlprov profile,
    * every aggregated arcp URI maps to an existing local file,
    * at least one arcp aggregate and one external aggregate are listed,
    * the primary provenance files and workflow files are aggregated.
    """
    manifest_file = os.path.join(self.folder, "metadata", "manifest.json")
    self.assertTrue(os.path.isfile(manifest_file),
                    "Can't find " + manifest_file)
    arcp_root = self.find_arcp()
    base = urllib.parse.urljoin(arcp_root, "metadata/manifest.json")
    g = Graph()
    with open(manifest_file, "rb") as manifest:
        # Note: This will use https://w3id.org/bundle/context
        g.parse(file=manifest, format="json-ld", publicID=base)
    print("Parsed manifest:\n\n")
    g.serialize(sys.stdout, format="nt")

    # Only the first matching research object / profile is examined.
    ro = next(iter(g.subjects(ORE.isDescribedBy, URIRef(base))), None)
    self.assertTrue(ro, "Can't find RO with ore:isDescribedBy")

    profile = next(iter(g.objects(ro, DCTERMS.conformsTo)), None)
    self.assertTrue(profile, "Can't find profile with dct:conformsTo")
    # assertEqual: the assertEquals alias was removed in Python 3.12.
    self.assertEqual(profile, URIRef("https://w3id.org/cwl/prov/0.3.0"),
                     "Unexpected cwlprov version " + profile)

    paths = []
    externals = []
    for aggregate in g.objects(ro, ORE.aggregates):
        print(aggregate)
        if not arcp.is_arcp_uri(aggregate):
            externals.append(aggregate)
            # Won't check external URIs existence here
            # TODO: Check they are not relative!
            continue
        # arcp URIs - assume they are local to our RO
        path = arcp.parse_arcp(aggregate).path[1:]  # Strip first /
        paths.append(path)
        # Convert to local path, in case it uses \ on Windows
        lpath = provenance._convert_path(path, posixpath, os.path)
        lfile = os.path.join(self.folder, lpath)
        self.assertTrue(os.path.isfile(lfile),
                        "Can't find aggregated " + lfile)

    self.assertTrue(paths, "Didn't find any arcp aggregates")
    self.assertTrue(externals, "Didn't find any data URIs")

    for ext in ["provn", "xml", "json", "jsonld", "nt", "ttl"]:
        prov_file = "metadata/provenance/primary.cwlprov.%s" % ext
        self.assertIn(prov_file, paths,
                      "provenance file missing " + prov_file)

    for wf_file in ["workflow/primary-job.json", "workflow/packed.cwl"]:
        self.assertIn(wf_file, paths, "workflow file missing " + wf_file)
def check_bagit(self):
    """Assert that ``self.folder`` holds a complete, valid BagIt bag.

    Checks that the expected tag and manifest files exist, that the bag
    has an Oxum, that manifests and file system agree, that no optional
    tag files are missing, that ``bag.validate()`` does not raise, and
    that the bag's External-Identifier is an arcp URI.
    """
    # check bagit structure
    expected_files = (
        "bagit.txt",
        "bag-info.txt",
        "manifest-sha1.txt",
        "tagmanifest-sha1.txt",
        "tagmanifest-sha256.txt",
    )
    for name in expected_files:
        self.assertTrue(os.path.isfile(os.path.join(self.folder, name)))

    bag = bagit.Bag(self.folder)
    self.assertTrue(bag.has_oxum())

    manifest_only, fs_only = bag.compare_manifests_with_fs()
    self.assertFalse(list(manifest_only), "Some files only in manifest")
    self.assertFalse(list(fs_only), "Some files only on file system")

    absent_tagfiles = list(bag.missing_optional_tagfiles())
    self.assertFalse(absent_tagfiles, "Some files only in tagmanifest")

    bag.validate()
    # TODO: Check other bag-info attributes
    external_id = bag.info.get("External-Identifier")
    self.assertTrue(arcp.is_arcp_uri(external_id))
def resolve_path(self, uri_path):
    """Resolve a bag-relative path or arcp URI to an absolute local path.

    An arcp URI must share this bag's ``self.root_uri``; its leading ``/``
    is stripped to obtain a relative path. Any other value is treated as a
    POSIX-style relative path. The relative path must be listed in
    ``self.bag.entries``.

    Returns:
        ``pathlib.Path`` built from ``self.root_path`` and the relative path.

    Raises:
        IOError: if the path is not listed in ``self.bag.entries``.
        AssertionError: if the URI base differs from ``self.root_uri`` or
            the path is absolute.
    """
    if not arcp.is_arcp_uri(str(uri_path)):
        relative = pathlib.PurePosixPath(uri_path)
    else:
        parsed = arcp.parse_arcp(uri_path)
        # Ensure same base URI meaning this bagit
        assert urllib.parse.urljoin(uri_path, "/") == self.root_uri
        # Strip initial / so path is relative
        relative = pathlib.PurePosixPath(parsed.path[1:])
    assert not relative.is_absolute()

    if str(relative) not in self.bag.entries:
        raise IOError("Not found in bag manifest/tagmanifest: %s" % uri_path)

    # resolve as OS-specific path
    resolved = pathlib.Path(self.root_path, relative)
    # ensure it did not climb out (will throw ValueError if not)
    assert resolved.relative_to(self.root_path)
    return resolved
def check_bagit(base_path):
    """Assert that ``base_path`` holds a complete, valid BagIt bag.

    Checks that the expected tag and manifest files exist, that the bag
    has an Oxum, that manifests and file system agree, that no optional
    tag files are missing, that ``bag.validate()`` does not raise, and
    that the bag's External-Identifier is an arcp URI.
    """
    # check bagit structure
    for name in ("bagit.txt",
                 "bag-info.txt",
                 "manifest-sha1.txt",
                 "tagmanifest-sha1.txt",
                 "tagmanifest-sha256.txt"):
        assert os.path.isfile(os.path.join(base_path, name))

    bag = bagit.Bag(base_path)
    assert bag.has_oxum()

    manifest_only, fs_only = bag.compare_manifests_with_fs()
    assert not list(manifest_only), "Some files only in manifest"
    assert not list(fs_only), "Some files only on file system"

    absent_tagfiles = list(bag.missing_optional_tagfiles())
    assert not absent_tagfiles, "Some files only in tagmanifest"

    bag.validate()
    # TODO: Check other bag-info attributes
    external_id = bag.info.get("External-Identifier")
    assert arcp.is_arcp_uri(external_id)
def check_ro(base_path, nested=False):
    """Assert that the research object manifest under ``base_path`` is sound.

    Parses ``metadata/manifest.json`` as JSON-LD (with the bundle context
    swapped for a local copy) and asserts on the research object profile,
    the aggregated files, and the OA annotations. With ``nested`` set, also
    requires provenance annotations that target something other than this
    run's UUID.
    """
    manifest_file = os.path.join(base_path, "metadata", "manifest.json")
    assert os.path.isfile(manifest_file), "Can't find " + manifest_file
    arcp_root = find_arcp(base_path)
    base = urllib.parse.urljoin(arcp_root, "metadata/manifest.json")

    # Swap the remote JSON-LD context https://w3id.org/bundle/context for a
    # local file:/// URI so this test works offline.
    context = Path(get_data("tests/bundle-context.jsonld")).as_uri()
    with open(manifest_file, "r", encoding="UTF-8") as manifest:
        jsonld = manifest.read().replace(
            "https://w3id.org/bundle/context", context)
    g = Graph()
    g.parse(data=jsonld, format="json-ld", publicID=base)
    if os.environ.get("DEBUG"):
        print("Parsed manifest:\n\n")
        g.serialize(sys.stdout, format="ttl")

    # Only the first research object / profile found is examined.
    ro = next(iter(g.subjects(ORE.isDescribedBy, URIRef(base))), None)
    assert ro is not None, "Can't find RO with ore:isDescribedBy"

    profile = next(iter(g.objects(ro, DCTERMS.conformsTo)), None)
    assert profile is not None, "Can't find profile with dct:conformsTo"
    assert profile == URIRef(provenance.CWLPROV_VERSION), \
        "Unexpected cwlprov version " + profile

    paths = []
    externals = []
    for aggregate in g.objects(ro, ORE.aggregates):
        if arcp.is_arcp_uri(aggregate):
            local_file = _arcp2file(base_path, aggregate)
            paths.append(os.path.relpath(local_file, base_path))
            assert os.path.isfile(local_file), \
                "Can't find aggregated " + local_file
        else:
            # Won't check external URIs existence here
            # TODO: Check they are not relative!
            externals.append(aggregate)

    assert paths, "Didn't find any arcp aggregates"
    assert externals, "Didn't find any data URIs"

    for ext in ["provn", "xml", "json", "jsonld", "nt", "ttl"]:
        prov_file = "metadata/provenance/primary.cwlprov.%s" % ext
        assert prov_file in paths, "provenance file missing " + prov_file

    workflow_files = [
        "workflow/primary-job.json",
        "workflow/packed.cwl",
        "workflow/primary-output.json",
    ]
    for wf_file in workflow_files:
        assert wf_file in paths, "workflow file missing " + wf_file
    # Can't test snapshot/ files directly as their name varies

    # TODO: check urn:hash::sha1 thingies
    # TODO: Check OA annotations

    packed = urllib.parse.urljoin(arcp_root, "/workflow/packed.cwl")
    primary_job = urllib.parse.urljoin(
        arcp_root, "/workflow/primary-job.json")
    primary_prov_nt = urllib.parse.urljoin(
        arcp_root, "/metadata/provenance/primary.cwlprov.nt")
    run_uuid = arcp.parse_arcp(arcp_root).uuid

    highlights = set(g.subjects(OA.motivatedBy, OA.highlighting))
    assert highlights, "Didn't find highlights"
    for highlight in highlights:
        assert (highlight, OA.hasTarget, URIRef(packed)) in g

    for description in set(g.subjects(OA.motivatedBy, OA.describing)):
        assert (description, OA.hasBody, URIRef(arcp_root)) in g
        assert (description, OA.hasTarget, URIRef(run_uuid.urn)) in g

    for link in set(g.subjects(OA.motivatedBy, OA.linking)):
        assert (link, OA.hasBody, URIRef(packed)) in g
        assert (link, OA.hasBody, URIRef(primary_job)) in g
        assert (link, OA.hasTarget, URIRef(run_uuid.urn)) in g

    has_provenance = set(g.subjects(OA.hasBody, URIRef(primary_prov_nt)))
    for annotation in has_provenance:
        assert (annotation, OA.hasTarget, URIRef(run_uuid.urn)) in g
        assert (annotation, OA.motivatedBy, PROV.has_provenance) in g
        # Check all prov elements are listed
        formats = set()
        for prov in g.objects(annotation, OA.hasBody):
            assert (prov, DCTERMS.conformsTo,
                    URIRef(provenance.CWLPROV_VERSION)) in g
            # NOTE: DC.format is a Namespace method and does not resolve
            # like other terms
            formats.update(g.objects(prov, DC["format"]))
        assert formats, "Could not find media types"
        expected = {
            Literal(media_type)
            for media_type in (
                "application/json",
                "application/ld+json",
                "application/n-triples",
                'text/provenance-notation; charset="UTF-8"',
                'text/turtle; charset="UTF-8"',
                "application/xml",
            )
        }
        assert formats == expected, "Did not match expected PROV media types"

    if nested:
        # Check for additional PROVs
        # Let's try to find the other wf run ID
        otherRuns = set()
        for annotation in g.subjects(OA.motivatedBy, PROV.has_provenance):
            if (annotation, OA.hasTarget, URIRef(run_uuid.urn)) in g:
                continue
            otherRuns.update(g.objects(annotation, OA.hasTarget))
        assert otherRuns, "Could not find nested workflow run prov annotations"
def _find_arcp(self):
    """Return the bag's arcp External-Identifier, or a random arcp URI.

    Reads ``External-Identifier`` from ``self.bag.info``; when it is
    missing, empty, or not an arcp URI, ``arcp.arcp_random()`` is returned
    instead.
    """
    external_id = self.bag.info.get("External-Identifier")
    if not external_id or not arcp.is_arcp_uri(external_id):
        return arcp.arcp_random()
    return external_id
def check_ro(base_path, nested=False):
    """Validate the research object manifest of the bag at ``base_path``.

    The manifest ``metadata/manifest.json`` is parsed as JSON-LD against a
    local copy of the bundle context. Assertions cover the cwlprov profile,
    the existence of every arcp-aggregated file, the presence of the
    primary provenance and workflow files, and the OA annotations. When
    ``nested`` is true, provenance annotations targeting something other
    than this run's UUID must also exist.
    """
    manifest_file = os.path.join(base_path, "metadata", "manifest.json")
    assert os.path.isfile(manifest_file), "Can't find " + manifest_file
    arcp_root = find_arcp(base_path)
    base = urllib.parse.urljoin(arcp_root, "metadata/manifest.json")
    graph = Graph()

    # Avoid resolving JSON-LD context https://w3id.org/bundle/context
    # so this test works offline
    local_context = Path(get_data("tests/bundle-context.jsonld")).as_uri()
    with open(manifest_file, "r", encoding="UTF-8") as handle:
        raw_manifest = handle.read()
    # replace with file:/// URI
    patched_manifest = raw_manifest.replace(
        "https://w3id.org/bundle/context", local_context)
    graph.parse(data=patched_manifest, format="json-ld", publicID=base)
    if os.environ.get("DEBUG"):
        print("Parsed manifest:\n\n")
        graph.serialize(sys.stdout, format="ttl")

    # Take the first subject described by the manifest, if any.
    ro = None
    for candidate in graph.subjects(ORE.isDescribedBy, URIRef(base)):
        ro = candidate
        break
    assert ro is not None, "Can't find RO with ore:isDescribedBy"

    # Take the first declared profile, if any.
    profile = None
    for declared in graph.objects(ro, DCTERMS.conformsTo):
        profile = declared
        break
    assert profile is not None, "Can't find profile with dct:conformsTo"
    assert profile == URIRef(provenance.CWLPROV_VERSION), \
        "Unexpected cwlprov version " + profile

    paths = []
    externals = []
    for aggregate in graph.objects(ro, ORE.aggregates):
        if not arcp.is_arcp_uri(aggregate):
            # Won't check external URIs existence here
            # TODO: Check they are not relative!
            externals.append(aggregate)
            continue
        aggregated_file = _arcp2file(base_path, aggregate)
        paths.append(os.path.relpath(aggregated_file, base_path))
        assert os.path.isfile(aggregated_file), \
            "Can't find aggregated " + aggregated_file
    assert paths, "Didn't find any arcp aggregates"
    assert externals, "Didn't find any data URIs"

    for extension in ["provn", "xml", "json", "jsonld", "nt", "ttl"]:
        expected_prov = "metadata/provenance/primary.cwlprov.%s" % extension
        assert expected_prov in paths, \
            "provenance file missing " + expected_prov

    for expected_wf in ["workflow/primary-job.json",
                        "workflow/packed.cwl",
                        "workflow/primary-output.json"]:
        assert expected_wf in paths, "workflow file missing " + expected_wf
    # Can't test snapshot/ files directly as their name varies

    # TODO: check urn:hash::sha1 thingies
    # TODO: Check OA annotations

    packed = urllib.parse.urljoin(arcp_root, "/workflow/packed.cwl")
    primary_job = urllib.parse.urljoin(
        arcp_root, "/workflow/primary-job.json")
    primary_prov_nt = urllib.parse.urljoin(
        arcp_root, "/metadata/provenance/primary.cwlprov.nt")
    uuid = arcp.parse_arcp(arcp_root).uuid

    highlights = set(graph.subjects(OA.motivatedBy, OA.highlighting))
    assert highlights, "Didn't find highlights"
    for node in highlights:
        assert (node, OA.hasTarget, URIRef(packed)) in graph

    describes = set(graph.subjects(OA.motivatedBy, OA.describing))
    for node in describes:
        assert (node, OA.hasBody, URIRef(arcp_root)) in graph
        assert (node, OA.hasTarget, URIRef(uuid.urn)) in graph

    linked = set(graph.subjects(OA.motivatedBy, OA.linking))
    for node in linked:
        assert (node, OA.hasBody, URIRef(packed)) in graph
        assert (node, OA.hasBody, URIRef(primary_job)) in graph
        assert (node, OA.hasTarget, URIRef(uuid.urn)) in graph

    has_provenance = set(
        graph.subjects(OA.hasBody, URIRef(primary_prov_nt)))
    for node in has_provenance:
        assert (node, OA.hasTarget, URIRef(uuid.urn)) in graph
        assert (node, OA.motivatedBy, PROV.has_provenance) in graph
        # Check all prov elements are listed
        formats = set()
        for body in graph.objects(node, OA.hasBody):
            assert (body, DCTERMS.conformsTo,
                    URIRef(provenance.CWLPROV_VERSION)) in graph
            # NOTE: DC.format is a Namespace method and does not resolve
            # like other terms
            for media_type in graph.objects(body, DC["format"]):
                formats.add(media_type)
        assert formats, "Could not find media types"
        expected = set(map(Literal, (
            "application/json",
            "application/ld+json",
            "application/n-triples",
            'text/provenance-notation; charset="UTF-8"',
            'text/turtle; charset="UTF-8"',
            "application/xml",
        )))
        assert formats == expected, "Did not match expected PROV media types"

    if nested:
        # Check for additional PROVs
        # Let's try to find the other wf run ID
        otherRuns = set()
        for node in graph.subjects(OA.motivatedBy, PROV.has_provenance):
            if (node, OA.hasTarget, URIRef(uuid.urn)) not in graph:
                otherRuns.update(set(graph.objects(node, OA.hasTarget)))
        assert otherRuns, "Could not find nested workflow run prov annotations"
def check_ro(self, nested=False):
    """Check the research object manifest of the bag in ``self.folder``.

    Parses ``metadata/manifest.json`` as JSON-LD and asserts on the cwlprov
    profile, the existence of every arcp-aggregated file, the presence of
    the primary provenance and workflow files, and the OA annotations.

    Args:
        nested: when true, additionally require provenance annotations
            that target something other than this run's UUID.
    """
    manifest_file = os.path.join(self.folder, "metadata", "manifest.json")
    self.assertTrue(os.path.isfile(manifest_file),
                    "Can't find " + manifest_file)
    arcp_root = self.find_arcp()
    base = urllib.parse.urljoin(arcp_root, "metadata/manifest.json")
    g = Graph()
    with open(manifest_file, "rb") as manifest:
        # Note: This will use https://w3id.org/bundle/context
        g.parse(file=manifest, format="json-ld", publicID=base)
    if os.environ.get("DEBUG"):
        print("Parsed manifest:\n\n")
        g.serialize(sys.stdout, format="nt")

    # Only the first matching research object / profile is examined.
    ro = next(iter(g.subjects(ORE.isDescribedBy, URIRef(base))), None)
    self.assertTrue(ro, "Can't find RO with ore:isDescribedBy")

    profile = next(iter(g.objects(ro, DCTERMS.conformsTo)), None)
    self.assertTrue(profile, "Can't find profile with dct:conformsTo")
    # assertEqual: the assertEquals alias was removed in Python 3.12.
    self.assertEqual(profile, URIRef(provenance.CWLPROV_VERSION),
                     "Unexpected cwlprov version " + profile)

    paths = []
    externals = []
    for aggregate in g.objects(ro, ORE.aggregates):
        if not arcp.is_arcp_uri(aggregate):
            externals.append(aggregate)
            # Won't check external URIs existence here
            # TODO: Check they are not relative!
            continue
        lfile = self._arcp2file(aggregate)
        paths.append(os.path.relpath(lfile, self.folder))
        self.assertTrue(os.path.isfile(lfile),
                        "Can't find aggregated " + lfile)

    self.assertTrue(paths, "Didn't find any arcp aggregates")
    self.assertTrue(externals, "Didn't find any data URIs")

    for ext in ["provn", "xml", "json", "jsonld", "nt", "ttl"]:
        prov_file = "metadata/provenance/primary.cwlprov.%s" % ext
        self.assertIn(prov_file, paths,
                      "provenance file missing " + prov_file)

    for wf_file in ["workflow/primary-job.json", "workflow/packed.cwl"]:
        self.assertIn(wf_file, paths, "workflow file missing " + wf_file)
    # Can't test snapshot/ files directly as their name varies

    # TODO: check urn:hash::sha1 thingies
    # TODO: Check OA annotations

    packed = urllib.parse.urljoin(arcp_root, "/workflow/packed.cwl")
    primary_job = urllib.parse.urljoin(
        arcp_root, "/workflow/primary-job.json")
    primary_prov_nt = urllib.parse.urljoin(
        arcp_root, "/metadata/provenance/primary.cwlprov.nt")
    uuid = arcp.parse_arcp(arcp_root).uuid

    highlights = set(g.subjects(OA.motivatedBy, OA.highlighting))
    self.assertTrue(highlights, "Didn't find highlights")
    for highlight in highlights:
        self.assertIn((highlight, OA.hasTarget, URIRef(packed)), g)

    describes = set(g.subjects(OA.motivatedBy, OA.describing))
    for description in describes:
        self.assertIn((description, OA.hasBody, URIRef(arcp_root)), g)
        self.assertIn((description, OA.hasTarget, URIRef(uuid.urn)), g)

    linked = set(g.subjects(OA.motivatedBy, OA.linking))
    for link in linked:
        self.assertIn((link, OA.hasBody, URIRef(packed)), g)
        self.assertIn((link, OA.hasBody, URIRef(primary_job)), g)
        self.assertIn((link, OA.hasTarget, URIRef(uuid.urn)), g)

    has_provenance = set(g.subjects(OA.hasBody, URIRef(primary_prov_nt)))
    for p in has_provenance:
        self.assertIn((p, OA.hasTarget, URIRef(uuid.urn)), g)
        self.assertIn((p, OA.motivatedBy, PROV.has_provenance), g)
        # Check all prov elements are listed
        formats = set()
        for prov in g.objects(p, OA.hasBody):
            self.assertIn(
                (prov, DCTERMS.conformsTo,
                 URIRef(provenance.CWLPROV_VERSION)), g)
            # NOTE: DC.format is a Namespace method and does not resolve
            # like other terms
            formats.update(g.objects(prov, DC["format"]))
        self.assertTrue(formats, "Could not find media types")
        expected = {
            Literal(media_type)
            for media_type in (
                "application/json",
                "application/ld+json",
                "application/n-triples",
                'text/provenance-notation; charset="UTF-8"',
                'text/turtle; charset="UTF-8"',
                "application/xml",
            )
        }
        self.assertEqual(formats, expected,
                         "Did not match expected PROV media types")

    if nested:
        # Check for additional PROVs
        # Let's try to find the other wf run ID
        otherRuns = set()
        for p in g.subjects(OA.motivatedBy, PROV.has_provenance):
            if (p, OA.hasTarget, URIRef(uuid.urn)) in g:
                continue
            otherRuns.update(g.objects(p, OA.hasTarget))
        self.assertTrue(
            otherRuns, "Could not find nested workflow run prov annotations")