def autoid_report_error(id, blob):
    """Convert ``id`` with ``idlib.Auto``, logging instead of raising on bad input.

    Args:
        id: the raw identifier handed to ``idlib.Auto``.
        blob: mapping whose ``'id'`` entry is included in the error message
            so the bad identifier can be traced back to its record.

    Returns:
        The result of ``idlib.Auto(id)``, or ``None`` when ``idlib`` raises
        ``MalformedIdentifierError`` (the failure is reported on ``logd``).
    """
    try:
        converted = idlib.Auto(id)
    except idlib.exc.MalformedIdentifierError:
        # report which record carried the bad identifier, then signal failure
        logd.error(f'{blob["id"]} bad id: {id}')
        return None

    return converted
def export_identifier_metadata(self, dump_path, dataset_blobs):
    """Build (or reuse) the identifier-metadata blob and dump it as JSON.

    When ``self.latest`` is truthy and ``self.latest_id_met_path`` exists,
    ``self.latest_id_met`` is reused unchanged.  Otherwise the identifiers
    found under ``meta.protocol_url_or_doi``, ``meta.originating_article_doi``
    and ``meta.doi`` of every dataset blob are collected, metadata is fetched
    for those that are ``idlib.Doi`` instances, and a fresh blob is assembled.

    Either way the blob is written to
    ``dump_path / 'identifier-metadata.json'`` and returned.

    Args:
        dump_path: path-like object supporting ``/`` that names the output
            directory.
        dataset_blobs: iterable of dataset blobs; each one is indexed as
            ``blob['meta']['doi']`` (the value may be ``None``).

    Returns:
        The identifier-metadata blob (a dict in the freshly built case).
    """
    if (self.latest and self.latest_id_met_path.exists()):
        blob_id_met = self.latest_id_met
    else:
        def fetch(id):  # FIXME error proof version ...
            """Return the metadata for ``id``, or None if the request failed."""
            try:
                metadata = id.metadata()
                metadata['id'] = id.identifier  # FIXME normalization ...
                return metadata
            except requests.exceptions.HTTPError as e:
                logd.error(e)
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.SSLError) as e:
                log.error(e)

        # retrieve doi metadata and materialize it in the dataset
        # Raw ids go through autoid_report_error so that one malformed
        # identifier is logged and skipped instead of aborting the export;
        # the resulting None is dropped by the Doi filter below.
        _dois = {
            raw_id
            if isinstance(raw_id, idlib.Stream) else
            autoid_report_error(raw_id, blob)
            for blob in dataset_blobs
            for raw_id in chain(
                adops.get(blob, ['meta', 'protocol_url_or_doi'], on_failure=[]),
                adops.get(blob, ['meta', 'originating_article_doi'], on_failure=[]),
                # TODO data["links"]?
                [blob['meta']['doi']])
            if raw_id is not None
        }

        dois = [d for d in _dois if isinstance(d, idlib.Doi)]
        fetched = Async(rate=10)(deferred(fetch)(d) for d in dois)

        # Split results in a single pass so the fetched sequence is only
        # iterated once.
        metadatas = []
        bads = []
        for doi, metadata in zip(dois, fetched):
            if metadata is None:
                # TODO more granular reporting e.g. 404
                bads.append({'id': doi, 'reason': 'no metadata'})
            else:
                metadatas.append(metadata)

        blob_id_met = {
            'id': 'identifier-metadata',  # TODO is this ok ?
            'identifier_metadata': metadatas,
            'errors': bads,
            'meta': {'count': len(metadatas)},
            'prov': {
                'timestamp_export_start': self.timestamp,
                'export_system_identifier': Path.sysid,
                'export_hostname': gethostname(),
                'export_project_path': self.export_source_path.cache.anchor,
            },
        }

    with open(dump_path / 'identifier-metadata.json', 'wt') as f:
        json.dump(blob_id_met, f, sort_keys=True, indent=2, cls=JEncode)

    return blob_id_met
def triples(self):
    """Yield RDF triples for the records in ``self.data['identifier_metadata']``.

    For every record whose ``'source'`` is ``'Crossref'`` this yields:

    * an ``owl.sameAs`` link from the identifier's URI to the subject of the
      prism ``doi`` triple in the record's own turtle, followed by every
      triple of that parsed graph (only when such a triple exists), and
    * ``rdf.type``, ``dc.publisher``, ``dc.title`` and ``dc.date`` triples
      built from the record, skipping values that are ``None``.

    Records that cannot be handled (no ``asUri`` on the identifier, no
    ``'source'`` key, a source other than Crossref, or no turtle data) are
    logged and skipped; the remaining records are still processed.
    """
    crossref_doi_pred = rdflib.term.URIRef(
        'http://prismstandard.org/namespaces/basic/2.1/doi')

    for blob in self.data['identifier_metadata']:
        ident = blob['id']
        if not isinstance(ident, idlib.Stream):
            ident = idlib.Auto(ident)

        if not hasattr(ident, 'asUri'):
            # previously a stray breakpoint(); never stop for a debugger in
            # an export run, just report the record and move on
            log.error(f'identifier has no asUri, skipping: {ident!r}')
            continue

        s = ident.asUri(rdflib.URIRef)

        if 'source' not in blob:
            msg = f'dont know what to do with {blob} for {ident.identifier}'
            log.error(msg)
            #raise NotImplementedError(msg)
            continue

        source = blob['source']  # FIXME we need to wrap this in our normalized representation
        if source != 'Crossref':  # FIXME CrossrefConvertor etc. OR put it in idlib as a an alternate ttl
            msg = f'dont know what to do with {source}'
            log.error(msg)
            #raise NotImplementedError(msg)
            continue

        pos = (
            (rdf.type, owl.NamedIndividual),
            (rdf.type, TEMP[blob['type']]),
            (dc.publisher, blob['publisher']),
            #(dc.type, blob['type']),  # FIXME semantify
            (dc.title, blob['title']),
            (dc.date, self.published_online(blob)),  # FIXME .... dangerzone
        )

        g = OntGraph()
        # FIXME idlib streams need to recognize their own type in __new__
        doi = ident if isinstance(ident, idlib.Doi) else idlib.Doi(ident)
        data = doi.ttl()
        if data is None:  # blackfynn has some bad settings on their doi records ...
            # skip only this record rather than ending the whole generator
            continue

        try:
            g.parse(data=data, format='ttl')  # FIXME network bad
        except Exception as e:
            # Exception, not BaseException: KeyboardInterrupt and SystemExit
            # must still propagate
            loge.exception(e)

        their_subjects = [ts for ts, tp, to in g if tp == crossref_doi_pred]
        if their_subjects:
            yield s, owl.sameAs, their_subjects[0]
            yield from g
        else:
            g.debug()
            log.critical('No crossref doi section in graph!')

        for p, oraw in pos:
            if oraw is not None:
                o = oraw if isinstance(oraw, rdflib.URIRef) else rdflib.Literal(oraw)
                yield s, p, o
def triples(self):
    """Yield RDF triples for the records in ``self.data['identifier_metadata']``.

    For every record whose ``'source'`` is ``'Crossref'`` this yields:

    * an ``owl.sameAs`` link from the identifier (converted with
      ``asType(rdflib.URIRef)``) to the subject of the prism ``doi`` triple
      in the record's own turtle, followed by every triple of that parsed
      graph (only when such a triple exists), and
    * ``rdf.type``, ``dc.publisher``, ``dc.title`` and ``dc.date`` triples
      built from the record, skipping values that are ``None``.

    Records that cannot be handled (no ``'source'`` key, a source other than
    Crossref, or no turtle data) are logged and skipped; the remaining
    records are still processed.
    """
    # built once instead of once per triple inside the comprehension
    crossref_doi_pred = rdflib.term.URIRef(
        'http://prismstandard.org/namespaces/basic/2.1/doi')

    for blob in self.data['identifier_metadata']:
        ident = blob['id']
        if not isinstance(ident, idlib.Stream):
            ident = idlib.Auto(ident)

        s = ident.asType(rdflib.URIRef)

        if 'source' not in blob:
            msg = f'dont know what to do with {blob} for {ident.identifier}'
            log.error(msg)
            #raise NotImplementedError(msg)
            continue

        source = blob['source']  # FIXME we need to wrap this in our normalized representation
        if source != 'Crossref':  # FIXME CrossrefConvertor etc. OR put it in idlib as a an alternate ttl
            msg = f'dont know what to do with {source}'
            log.error(msg)
            #raise NotImplementedError(msg)
            continue

        pos = (
            (rdf.type, owl.NamedIndividual),
            (rdf.type, TEMP[blob['type']]),
            (dc.publisher, blob['publisher']),
            #(dc.type, blob['type']),  # FIXME semantify
            (dc.title, blob['title']),
            (dc.date, self.published_online(blob)),  # FIXME .... dangerzone
        )

        g = OntGraph()
        # FIXME idlib streams need to recognize their own type in __new__
        doi = ident if isinstance(ident, idlib.Doi) else idlib.Doi(ident)
        data = doi.ttl()
        if data is None:
            # nothing to parse for this record; skip it instead of handing
            # None to the parser
            log.error(f'no ttl data for {ident.identifier}')
            continue

        g.parse(data=data, format='ttl')  # FIXME network bad

        their_subjects = [ts for ts, tp, to in g if tp == crossref_doi_pred]
        if their_subjects:
            yield s, owl.sameAs, their_subjects[0]
            yield from g
        else:
            # an unguarded [0] here would raise IndexError
            log.critical('No crossref doi section in graph!')

        for p, oraw in pos:
            if oraw is not None:
                o = oraw if isinstance(oraw, rdflib.URIRef) else rdflib.Literal(oraw)
                yield s, p, o