def fix_reference(timestamp, accept, citing, cited, reference):
    rf, of, cp, cdh = create_resources()
    s = Storer(cp.graph_set(), context_map={context_path: context_file_path},
               dir_split=dir_split_number, n_file_item=items_per_file,
               default_dir=default_dir)

    r_text = unquote(reference)

    g_add_be = Graph(identifier=base_iri + "be/")
    g_remove_be = Graph(identifier=base_iri + "be/")
    g_add_br = Graph(identifier=base_iri + "br/")
    g_remove_br = Graph(identifier=base_iri + "br/")

    # Replace the stored reference text with the (decoded) input text
    ref_res = rf.retrieve_reference(base_iri + citing, base_iri + cited)
    g_add_be.add((ref_res, GraphEntity.has_content, Literal(r_text)))
    ref_res_text = rf.retrieve_reference_text(ref_res)
    g_remove_be.add((ref_res, GraphEntity.has_content, ref_res_text))

    if accept == "false":
        citing_res = URIRef(base_iri + citing)
        cited_res = URIRef(base_iri + cited)
        cur_time = datetime.fromtimestamp(int(timestamp)).strftime('%Y-%m-%dT%H:%M:%S')
        mod_date = str(rf.retrieve_modification_date(ref_res))
        if cur_time == mod_date:  # It didn't exist before
            cur_dir_path, cur_file_path = s.dir_and_file_paths(g_remove_br, base_dir, base_iri)
            cur_g = s.load(cur_file_path)
            # Use distinct loop variables so the Storer 's' is not shadowed,
            # and pass the triple to Graph.add() as a tuple
            for sub, pred, obj in cur_g.triples((cited_res, None, None)):
                if pred != RDF.type or obj != GraphEntity.expression:
                    g_remove_br.add((sub, pred, obj))
        else:  # It exists already
            new_cited = URIRef(str(cp.graph_set().add_br(cp.name, doi_curator, bcite_base_iri)))
            gen_prov_and_store_data(cp, rf, timestamp)
            g_remove_br.add((citing_res, GraphEntity.cites, cited_res))
            g_remove_be.add((ref_res, GraphEntity.references, cited_res))
            g_add_br.add((citing_res, GraphEntity.cites, new_cited))
            g_add_be.add((ref_res, GraphEntity.references, new_cited))

    s.update(g_add_be, g_remove_be, base_dir, base_iri, context_path, temp_dir_for_rdf_loading)
    s.update(g_add_br, g_remove_br, base_dir, base_iri, context_path, temp_dir_for_rdf_loading)
    s.update_all([g_add_br, g_add_be], [g_remove_br, g_remove_be], triplestore_url, base_dir)

    return timestamp, accept, citing, cited, quote(ref_res_text)
def gen_prov_and_store_data(cp, rf, timestamp):
    prov = ProvSet(cp.graph_set(), base_iri, context_path, default_dir,
                   full_info_dir, rf, dir_split_number, items_per_file, "")
    prov.generate_provenance(int(float(timestamp)))

    # Store all the data
    res_storer = Storer(cp.graph_set(),
                        context_map={context_path: context_file_path},
                        dir_split=dir_split_number, n_file_item=items_per_file,
                        default_dir=default_dir)
    prov_storer = Storer(prov,
                         context_map={context_path: context_file_path},
                         dir_split=dir_split_number, n_file_item=items_per_file)

    res_storer.upload_and_store(base_dir, triplestore_url, base_iri,
                                context_path, temp_dir_for_rdf_loading)
    prov_storer.upload_and_store(base_dir, triplestore_url, base_iri,
                                 context_path, temp_dir_for_rdf_loading)
def store_all(gs):
    prov = ProvSet(
        gs, base_iri, context_path, default_dir, full_info_dir,
        ResourceFinder(base_dir=base_dir, base_iri=base_iri,
                       tmp_dir=temp_dir_for_rdf_loading,
                       context_map={context_path: context_file_path},
                       dir_split=dir_split_number, n_file_item=items_per_file,
                       default_dir=default_dir),
        dir_split_number, items_per_file, "")  # Prefix set to "" so as to avoid it for prov data
    prov.generate_provenance()

    print("Store the data for %s entities." % str(entity_count))
    res_storer = Storer(gs, context_map={context_path: context_file_path},
                        dir_split=dir_split_number, n_file_item=items_per_file,
                        default_dir=default_dir)
    prov_storer = Storer(prov, context_map={context_path: context_file_path},
                         dir_split=dir_split_number, n_file_item=items_per_file,
                         default_dir=default_dir)

    res_storer.store_all(base_dir, base_iri, context_path, temp_dir_for_rdf_loading)
    prov_storer.store_all(base_dir, base_iri, context_path, temp_dir_for_rdf_loading)

    print("Update the dataset description.")
    dset_handler = DatasetHandler(triplestore_url_real, context_path,
                                  context_file_path, base_iri, base_dir,
                                  full_info_dir, dataset_home,
                                  temp_dir_for_rdf_loading)
    dset_handler.update_dataset_info(gs)
def update_all(g_set, remove_entity, full_info_dir):
    prov = ProvSet(
        g_set, base_iri, context_path, default_dir, full_info_dir,
        ResourceFinder(base_dir=base_dir, base_iri=base_iri,
                       tmp_dir=temp_dir_for_rdf_loading,
                       context_map={context_path: context_file_path},
                       dir_split=dir_split_number, n_file_item=items_per_file,
                       default_dir=default_dir),
        dir_split_number, items_per_file, "")
    prov.generate_provenance(do_insert=False, remove_entity=remove_entity)

    res_storer = Storer(g_set, context_map={context_path: context_file_path},
                        dir_split=dir_split_number, n_file_item=items_per_file,
                        default_dir=default_dir)
    prov_storer = Storer(prov, context_map={context_path: context_file_path},
                         dir_split=dir_split_number, n_file_item=items_per_file,
                         default_dir=default_dir)

    res_storer.store_all(base_dir, base_iri, context_path,
                         temp_dir_for_rdf_loading, remove_data=True)
    prov_storer.store_all(base_dir, base_iri, context_path, temp_dir_for_rdf_loading)

    dset_handler = DatasetHandler(triplestore_url_real, context_path,
                                  context_file_path, base_iri, base_dir, "",
                                  dataset_home, temp_dir_for_rdf_loading)
    dset_handler.update_dataset_info(g_set)
class StorerTest(unittest.TestCase):
    def setUp(self):
        cur_g = Graph(identifier=base_iri + "br/")
        cur_g.add((URIRef(base_iri + "br/022201"),
                   FOAF.maker,
                   URIRef(base_iri + "ra/011101")))
        self.g = cur_g
        self.s = Storer(None,
                        context_map={context_path: context_file_path},
                        dir_split=dir_split_number, n_file_item=items_per_file,
                        default_dir=default_dir)

    def test_store(self):
        result = self.s.store(self.g, base_dir, base_iri, context_path,
                              temp_dir_for_rdf_loading, store_now=False)
        print(list(result.keys())[0],
              list(result.values())[0].serialize(format="nquads"))
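# Hypothetical entry point (not in the original file): a minimal sketch of how the
# test case above could be executed directly with the standard library runner,
# assuming the configuration constants (base_iri, base_dir, etc.) are defined.
if __name__ == "__main__":
    unittest.main()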
class ResourceFinder(object):

    def __init__(self, g_set=None, ts_url=None, base_dir=None, base_iri=None,
                 default_dir="_", tmp_dir=None, context_map={},
                 dir_split=0, n_file_item=1):
        self.g = Graph()
        self.base_dir = base_dir
        self.base_iri = base_iri
        self.storer = Storer(context_map=context_map)
        self.tmp_dir = tmp_dir
        self.dir_split = dir_split
        self.n_file_item = n_file_item
        self.name = "SPACIN " + self.__class__.__name__
        self.loaded = set()
        self.default_dir = default_dir
        if g_set is not None:
            self.update_graph_set(g_set)
        if ts_url is None:
            self.ts = None
        else:
            self.ts = ConjunctiveGraph('SPARQLUpdateStore')
            self.ts.open((ts_url, ts_url))

    def add_prov_triples_in_filesystem(self, res_iri, prov_entity_type=None):
        if self.base_dir is not None and self.base_iri is not None:
            cur_file_path = find_paths(res_iri, self.base_dir, self.base_iri,
                                       self.default_dir, self.dir_split,
                                       self.n_file_item)[1]
            if cur_file_path.endswith("index.json"):
                cur_path = cur_file_path.replace("index.json", "") + "prov"
            else:
                cur_path = cur_file_path[:-5] + os.sep + "prov"

            file_list = []
            if os.path.isdir(cur_path):
                for cur_dir, cur_subdir, cur_files in os.walk(cur_path):
                    for cur_file in cur_files:
                        if cur_file.endswith(".json") and \
                                (prov_entity_type is None or cur_file.startswith(prov_entity_type)):
                            file_list += [cur_dir + os.sep + cur_file]

            for file_path in file_list:
                if file_path not in self.loaded:
                    self.loaded.add(file_path)
                    cur_g = self.storer.load(file_path, tmp_dir=self.tmp_dir)
                    self.add_triples_in_graph(cur_g)

    def add_triples_in_graph(self, g):
        if g is not None:
            for s, p, o in g.triples((None, None, None)):
                self.g.add((s, p, o))

    def update_graph_set(self, g_set):
        for g in g_set.graphs():
            self.add_triples_in_graph(g)

    def retrieve(self, id_dict):
        for id_type in id_dict:
            for id_string in id_dict[id_type]:
                res = self.__id_with_type(id_string, id_type)
                if res is not None:
                    return res

    def retrieve_provenance_agent_from_name(self, string):
        query = """SELECT DISTINCT ?pa WHERE {
            ?pa a <%s> ;
                <%s> "%s"
        } LIMIT 1""" % (ProvEntity.prov_agent, GraphEntity.name, string)
        return self.__query(query)

    def retrieve_reference(self, citing_res, cited_res):
        query = """SELECT DISTINCT ?res WHERE {
            <%s> <%s> ?res .
            ?res <%s> <%s>
        }""" % (citing_res, GraphEntity.contains_reference,
                GraphEntity.references, cited_res)
        return self.__query(query)

    def retrieve_reference_text(self, ref_res):
        query = """SELECT DISTINCT ?res WHERE {
            <%s> <%s> ?res
        }""" % (ref_res, GraphEntity.has_content)
        return self.__query(query)

    def retrieve_from_orcid(self, string):
        return self.__id_with_type(string, GraphEntity.orcid)

    def retrieve_modification_date(self, res_iri):
        # Use the same variable name throughout the query (the source mixed
        # '?snapshot' and '?snapshop', which left the patterns unconnected)
        query = """SELECT DISTINCT ?res WHERE {
            <%s> ^<%s> ?snapshot .
            FILTER NOT EXISTS { ?snapshot <%s> ?inv_date }
            ?snapshot <%s> ?res
        }""" % (res_iri, ProvEntity.specialization_of,
                ProvEntity.invalidated_at_time, ProvEntity.generated_at_time)
        return self.__query(query)

    def retrieve_entity(self, string, type):
        query = """SELECT DISTINCT ?res WHERE {
            BIND(iri("%s") as ?res) .
            ?res a <%s>
        }""" % (string, str(type))
        return self.__query(query)

    def retrieve_citing_from_doi(self, string):
        return self.__id_with_type(string.lower(), GraphEntity.doi,
                                   "?res <%s> ?cited" % GraphEntity.cites)

    def retrieve_citing_from_pmid(self, string):
        return self.__id_with_type(string, GraphEntity.pmid,
                                   "?res <%s> ?cited" % GraphEntity.cites)

    def retrieve_citing_from_pmcid(self, string):
        return self.__id_with_type(string, GraphEntity.pmcid,
                                   "?res <%s> ?cited" % GraphEntity.cites)

    def retrieve_citing_from_url(self, string):
        return self.__id_with_type(string.lower(), GraphEntity.url,
                                   "?res <%s> ?cited" % GraphEntity.cites)

    def retrieve_from_doi(self, string):
        return self.__id_with_type(string.lower(), GraphEntity.doi)

    def retrieve_from_pmid(self, string):
        return self.__id_with_type(string, GraphEntity.pmid)

    def retrieve_from_pmcid(self, string):
        return self.__id_with_type(string, GraphEntity.pmcid)

    def retrieve_from_url(self, string):
        return self.__id_with_type(string.lower(), GraphEntity.url)

    def retrieve_from_issn(self, string):
        return self.__id_with_type(string, GraphEntity.issn)

    def retrieve_from_isbn(self, string):
        return self.__id_with_type(string, GraphEntity.isbn)

    def retrieve_issue_from_journal(self, id_dict, issue_id, volume_id):
        if volume_id is None:
            return self.__retrieve_from_journal(id_dict, GraphEntity.journal_issue, issue_id)
        else:
            retrieved_volume = self.retrieve_volume_from_journal(id_dict, volume_id)
            if retrieved_volume is not None:
                query = """SELECT DISTINCT ?br WHERE {
                    ?br a <%s> ;
                        <%s> <%s> ;
                        <%s> "%s"
                } LIMIT 1""" % (GraphEntity.journal_issue, GraphEntity.part_of,
                                str(retrieved_volume),
                                GraphEntity.has_sequence_identifier, issue_id)
                return self.__query(query)

    def retrieve_volume_from_journal(self, id_dict, volume_id):
        return self.__retrieve_from_journal(id_dict, GraphEntity.journal_volume, volume_id)

    def retrieve_url_string(self, res):
        return self.__retrieve_res_id_string(res, GraphEntity.url)

    def retrieve_doi_string(self, res):
        return self.__retrieve_res_id_string(res, GraphEntity.doi)

    def retrieve_pmid_string(self, res):
        return self.__retrieve_res_id_string(res, GraphEntity.pmid)

    def retrieve_pmcid_string(self, res):
        return self.__retrieve_res_id_string(res, GraphEntity.pmcid)

    def retrieve_br_url(self, res, string):
        return self.__retrieve_res_id_by_type(res, string.lower(), GraphEntity.url)

    def retrieve_br_doi(self, res, string):
        return self.__retrieve_res_id_by_type(res, string.lower(), GraphEntity.doi)

    def retrieve_br_pmid(self, res, string):
        return self.__retrieve_res_id_by_type(res, string, GraphEntity.pmid)

    def retrieve_br_pmcid(self, res, string):
        return self.__retrieve_res_id_by_type(res, string, GraphEntity.pmcid)

    def retrieve_last_snapshot(self, prov_subj):
        query = """SELECT DISTINCT ?se WHERE {
            ?se <%s> <%s> .
            FILTER NOT EXISTS { ?se <%s> ?ca }
        } LIMIT 1""" % (ProvEntity.specialization_of, str(prov_subj),
                        ProvEntity.was_invalidated_by)
        return self.__query(query)

    def __retrieve_res_id_string(self, res, id_type):
        query = """SELECT DISTINCT ?id WHERE {
            <%s> <%s> [
                <%s> <%s> ;
                <%s> ?id
            ]
        }""" % (res, GraphEntity.has_identifier,
                GraphEntity.uses_identifier_scheme, id_type,
                GraphEntity.has_literal_value)
        return self.__query(query)

    def __retrieve_res_id_by_type(self, res, id_string, id_type):
        if id_string is not None:
            query = """SELECT DISTINCT ?id WHERE {
                <%s> <%s> ?id .
                ?id <%s> <%s> ;
                    <%s> "%s"
            }""" % (res, GraphEntity.has_identifier,
                    GraphEntity.uses_identifier_scheme, id_type,
                    GraphEntity.has_literal_value, id_string)
            return self.__query(query)

    def __retrieve_from_journal(self, id_dict, part_type, part_seq_id):
        for id_type in id_dict:
            for id_string in id_dict[id_type]:
                query = """SELECT DISTINCT ?res WHERE {
                    ?j <%s> ?id .
                    ?id <%s> <%s> ;
                        <%s> "%s" .
                    ?res a <%s> ;
                        <%s>+ ?j ;
                        <%s> "%s"
                }""" % (GraphEntity.has_identifier,
                        GraphEntity.uses_identifier_scheme, id_type,
                        GraphEntity.has_literal_value, id_string,
                        part_type, GraphEntity.part_of,
                        GraphEntity.has_sequence_identifier, part_seq_id)
                return self.__query(query)

    def __id_with_type(self, id_string, id_type, extras=""):
        query = """SELECT DISTINCT ?res WHERE {
            ?res <%s> ?id .
            ?id <%s> <%s> ;
                <%s> "%s" .
            %s
        }""" % (GraphEntity.has_identifier,
                GraphEntity.uses_identifier_scheme, id_type,
                GraphEntity.has_literal_value, id_string, extras)
        return self.__query(query)

    def __query(self, query):
        if self.ts is not None:
            result = self.ts.query(query)
            for res, in result:
                return res

        # If nothing has been returned, check if there is something
        # in the current graph set
        result = self.g.query(query)
        for res, in result:
            return res
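# A minimal usage sketch (not part of the original sources) of how SPACIN-style code
# typically drives this finder. The endpoint URL, directories and split values are
# hypothetical placeholders for the configuration constants used elsewhere.
rf = ResourceFinder(ts_url="http://localhost:9999/blazegraph/sparql",
                    base_dir="corpus/", base_iri="https://w3id.org/oc/corpus/",
                    dir_split=10000, n_file_item=1000)
br = rf.retrieve_from_doi("10.1000/example")   # resolve a bibliographic resource by DOI
if br is not None:
    doi_literal = rf.retrieve_doi_string(br)   # read back the DOI literal attached to it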
class ResourceFinder(object):

    def __init__(self, g_set=None, ts_url=None, base_dir=None, base_iri=None,
                 default_dir="_", tmp_dir=None, context_map={},
                 dir_split=0, n_file_item=1):
        self.g = Graph()
        self.base_dir = base_dir
        self.base_iri = base_iri
        self.storer = Storer(context_map=context_map)
        self.tmp_dir = tmp_dir
        self.dir_split = dir_split
        self.n_file_item = n_file_item
        self.name = "SPACIN " + self.__class__.__name__
        self.loaded = set()
        self.default_dir = default_dir
        self.index_for_graph_set = 0
        # self.check = False
        if g_set is not None:
            self.update_graph_set(g_set)
        if ts_url is None:
            self.ts = None
        else:
            self.ts = ConjunctiveGraph('SPARQLUpdateStore')
            self.ts.open((ts_url, ts_url))
            self.ts.namespace_manager.store.nsBindings = {}

        # This is to search e.g. for a doi and get the res
        self.doi_store = {}
        self.orcid_store = {}
        self.pmid_store = {}
        self.pmcid_store = {}
        self.url_store = {}
        self.issn_store = {}
        self.isbn_store = {}
        self.crossref_store = {}

        # Used in __retrieve_res_id_string() when you query for the {res}
        # and want to get the ids' literal values
        self.doi_store_type = {}
        self.orcid_store_type = {}
        self.pmid_store_type = {}
        self.pmcid_store_type = {}
        self.url_store_type = {}
        self.issn_store_type = {}
        self.isbn_store_type = {}
        self.crossref_store_type = {}

        # Used in __retrieve_res_id_by_type() when you query for the
        # {res}_{id_literal} and want to get the id's URI, e.g. calling
        # cur_id = self.rf.retrieve_br_url(cur_res.res, extracted_url)
        # in crossrefproc.py
        self.doi_store_type_id = {}
        self.orcid_store_type_id = {}
        self.pmid_store_type_id = {}
        self.pmcid_store_type_id = {}
        self.url_store_type_id = {}
        self.issn_store_type_id = {}
        self.isbn_store_type_id = {}
        self.crossref_store_type_id = {}

        # Used in __retrieve_from_journal() where you query for
        # {id_type}_{id_string}_{part_seq_id} and get the res, e.g.
        # http://purl.org/spar/datacite/issn_1388-0209_58
        # ISSN_1388-0209_volume_58
        self.from_journal_volume = {}
        self.from_issue_partof_journal = {}

        # Caching blazegraph queries
        self.cache = {}
        self.cache_local = {}

    def add_prov_triples_in_filesystem(self, res_iri, prov_entity_type=None):
        if self.base_dir is not None and self.base_iri is not None:
            cur_file_path = find_paths(res_iri, self.base_dir, self.base_iri,
                                       self.default_dir, self.dir_split,
                                       self.n_file_item)[1]
            if cur_file_path.endswith("index.json"):
                cur_path = cur_file_path.replace("index.json", "") + "prov"
            else:
                cur_path = cur_file_path[:-5] + os.sep + "prov"

            file_list = []
            if os.path.isdir(cur_path):
                for cur_dir, cur_subdir, cur_files in os.walk(cur_path):
                    for cur_file in cur_files:
                        if (cur_file.endswith(".json") or cur_file.endswith(".ttl")) and \
                                (prov_entity_type is None or cur_file.startswith(prov_entity_type)):
                            file_list += [cur_dir + os.sep + cur_file]

            for file_path in file_list:
                if file_path not in self.loaded:
                    self.loaded.add(file_path)
                    cur_g = self.storer.load(file_path, tmp_dir=self.tmp_dir)
                    # self.add_triples_in_graph(cur_g)

    def add_triples_in_graph(self, g):
        return  # This is deprecated
        if g is not None:
            for s, p, o in g.triples((None, None, None)):
                self.g.add((s, p, o))

    def update_graph_set(self, g_set):
        return  # This is deprecated
        for g in g_set.graphs()[self.index_for_graph_set:]:
            self.add_triples_in_graph(g)
            self.index_for_graph_set += 1

    def retrieve(self, id_dict, typ='both'):
        for id_type in id_dict:
            for id_string in id_dict[id_type]:
                res = self.__id_with_type(id_string, id_type, typ=typ)
                if res is not None:
                    return res

    def retrieve_from_orcid(self, string, typ='both'):
        return self.__id_with_type(string, GraphEntity.orcid, typ=typ)

    def retrieve_entity(self, string, type, typ='both'):
        # 'type' (the entity class to match) was missing from the signature,
        # which left the format() call pointing at the builtin
        query = """SELECT DISTINCT ?res WHERE {{
            BIND(iri("{}") as ?res) .
            ?res a <{}>
        }}""".format(string, str(type))
        return self.__query(query, typ=typ)

    def retrieve_citing_from_doi(self, string, typ='only_blazegraph'):
        return self.__id_with_type(string.lower(), GraphEntity.doi,
                                   "?res <%s> ?cited" % GraphEntity.cites, typ)

    def retrieve_citing_from_pmid(self, string, typ='only_blazegraph'):
        return self.__id_with_type(string, GraphEntity.pmid,
                                   "?res <%s> ?cited" % GraphEntity.cites, typ)

    def retrieve_citing_from_pmcid(self, string, typ='only_blazegraph'):
        return self.__id_with_type(string, GraphEntity.pmcid,
                                   "?res <%s> ?cited" % GraphEntity.cites, typ)

    def retrieve_citing_from_url(self, string, typ='only_blazegraph'):
        return self.__id_with_type(string.lower(), GraphEntity.url,
                                   "?res <%s> ?cited" % GraphEntity.cites, typ)

    def retrieve_from_doi(self, string, typ='both'):
        return self.__id_with_type(string.lower(), GraphEntity.doi, typ=typ)

    def retrieve_from_pmid(self, string, typ='both'):
        return self.__id_with_type(string, GraphEntity.pmid, typ=typ)

    def retrieve_from_pmcid(self, string, typ='both'):
        return self.__id_with_type(string, GraphEntity.pmcid, typ=typ)

    def retrieve_from_url(self, string, typ='both'):
        return self.__id_with_type(string.lower(), GraphEntity.url, typ=typ)

    def retrieve_from_crossref(self, string, typ='both'):
        return self.__id_with_type(string, GraphEntity.crossref, typ=typ)

    def retrieve_from_issn(self, string, typ='both'):
        return self.__id_with_type(string, GraphEntity.issn, typ=typ)

    def retrieve_from_isbn(self, string, typ='both'):
        return self.__id_with_type(string, GraphEntity.isbn, typ=typ)

    def retrieve_issue_from_journal(self, id_dict, issue_id, volume_id):
        retrieved_journal = self.retrieve(id_dict, 'both')
        if retrieved_journal is not None:
            cur_issue = self.from_issue_partof_journal.get(
                (retrieved_journal, volume_id, issue_id))
            if cur_issue is None:
                if volume_id is None:
                    query = """SELECT DISTINCT ?br WHERE {{
                        ?br a <{}> ;
                            <{}> <{}> ;
                            <{}> "{}"
                    }} LIMIT 1""".format(GraphEntity.journal_issue,
                                         GraphEntity.part_of, retrieved_journal,
                                         GraphEntity.has_sequence_identifier, issue_id)
                else:
                    query = """SELECT DISTINCT ?br WHERE {{
                        ?br a <{}> ;
                            <{}> [
                                a <{}> ;
                                <{}> <{}> ;
                                <{}> "{}"
                            ] ;
                            <{}> "{}" .
                    }} LIMIT 1""".format(GraphEntity.journal_issue,
                                         GraphEntity.part_of,
                                         GraphEntity.journal_volume,
                                         GraphEntity.part_of, retrieved_journal,
                                         GraphEntity.has_sequence_identifier, volume_id,
                                         GraphEntity.has_sequence_identifier, issue_id)
                return self.__query(query)
            else:
                return cur_issue

    def retrieve_volume_from_journal(self, id_dict, volume_id):
        retrieved_journal = self.retrieve(id_dict, 'both')
        if retrieved_journal is not None:
            cur_volume = self.from_journal_volume.get((retrieved_journal, volume_id))
            if cur_volume is None:
                query = """SELECT DISTINCT ?br WHERE {{
                    ?br a <{}> ;
                        <{}> <{}> ;
                        <{}> "{}"
                }} LIMIT 1""".format(GraphEntity.journal_volume,
                                     GraphEntity.part_of, retrieved_journal,
                                     GraphEntity.has_sequence_identifier, volume_id)
                return self.__query(query)
            else:
                return cur_volume

    def retrieve_url_string(self, res, typ):
        return self.__retrieve_res_id_string(res, GraphEntity.url, typ)

    def retrieve_doi_string(self, res, typ):
        return self.__retrieve_res_id_string(res, GraphEntity.doi, typ)

    def retrieve_pmid_string(self, res, typ):
        return self.__retrieve_res_id_string(res, GraphEntity.pmid, typ)

    def retrieve_pmcid_string(self, res, typ):
        return self.__retrieve_res_id_string(res, GraphEntity.pmcid, typ)

    def retrieve_br_url(self, res, string, typ):
        return self.__retrieve_res_id_by_type(res, string.lower(), GraphEntity.url, typ)

    def retrieve_br_doi(self, res, string, typ):
        return self.__retrieve_res_id_by_type(res, string.lower(), GraphEntity.doi, typ)

    def retrieve_br_pmid(self, res, string, typ):
        return self.__retrieve_res_id_by_type(res, string, GraphEntity.pmid, typ)

    def retrieve_br_pmcid(self, res, string, typ):
        return self.__retrieve_res_id_by_type(res, string, GraphEntity.pmcid, typ)

    def retrieve_last_snapshot(self, prov_subj):
        query = '''SELECT DISTINCT ?se WHERE {{
            ?se <{}> <{}> .
            FILTER NOT EXISTS {{ ?se <{}> ?ca }}
        }} LIMIT 1'''.format(ProvEntity.specialization_of, str(prov_subj),
                             ProvEntity.was_invalidated_by)
        return self.__query(query)

    def __retrieve_res_id_string(self, input_res, id_type, typ):
        if id_type is not None and input_res is not None:
            if type(input_res) is GraphEntity:
                res = input_res.res
            else:
                res = URIRef(input_res)

            # First check if there is something locally
            if str(id_type) == 'http://purl.org/spar/datacite/url':
                store = self.url_store_type
            elif str(id_type) == 'http://purl.org/spar/datacite/doi':
                store = self.doi_store_type
            elif str(id_type) == 'http://purl.org/spar/datacite/orcid':
                store = self.orcid_store_type
            elif str(id_type) == 'http://purl.org/spar/datacite/pmid':
                store = self.pmid_store_type
            elif str(id_type) == 'http://purl.org/spar/datacite/pmcid':
                store = self.pmcid_store_type
            elif str(id_type) == 'http://purl.org/spar/datacite/issn':
                store = self.issn_store_type
            elif str(id_type) == 'http://purl.org/spar/datacite/isbn':
                store = self.isbn_store_type
            elif str(id_type) == 'http://purl.org/spar/datacite/crossref':
                store = self.crossref_store_type

            if str(id_type) == 'http://purl.org/spar/datacite/issn' or \
                    str(id_type) == 'http://purl.org/spar/datacite/isbn':
                if res in store:
                    return store[res][0]
            elif res in store:
                return store[res]

            if typ != 'only_local':
                query = '''SELECT DISTINCT ?id WHERE {{
                    <{}> <{}> [
                        <{}> <{}> ;
                        <{}> ?id
                    ]
                }}'''.format(res, GraphEntity.has_identifier,
                             GraphEntity.uses_identifier_scheme, id_type,
                             GraphEntity.has_literal_value)
                return self.__query_blazegraph(query, typ)

    def __retrieve_res_id_by_type(self, input_res, id_string, id_type, typ):
        if type(input_res) is GraphEntity:
            res = input_res.res
        else:
            res = URIRef(input_res)

        # First check if there is something locally
        # (check id_string rather than the builtin 'id')
        if id_type is not None and id_string is not None:
            if str(id_type) == 'http://purl.org/spar/datacite/url':
                store = self.url_store_type_id
            elif str(id_type) == 'http://purl.org/spar/datacite/doi':
                store = self.doi_store_type_id
            elif str(id_type) == 'http://purl.org/spar/datacite/orcid':
                store = self.orcid_store_type_id
            elif str(id_type) == 'http://purl.org/spar/datacite/pmid':
                store = self.pmid_store_type_id
            elif str(id_type) == 'http://purl.org/spar/datacite/pmcid':
                store = self.pmcid_store_type_id
            elif str(id_type) == 'http://purl.org/spar/datacite/issn':
                store = self.issn_store_type_id
            elif str(id_type) == 'http://purl.org/spar/datacite/isbn':
                store = self.isbn_store_type_id
            elif str(id_type) == 'http://purl.org/spar/datacite/crossref':
                store = self.crossref_store_type_id

            if (res, id_string) in store:
                return store[(res, id_string)]

        if id_string is not None and typ != 'only_local':
            query = '''SELECT DISTINCT ?id WHERE {{
                <{}> <{}> ?id .
                ?id <{}> <{}> ;
                    <{}> "{}"
            }}'''.format(res, GraphEntity.has_identifier,
                         GraphEntity.uses_identifier_scheme, id_type,
                         GraphEntity.has_literal_value, id_string)
            return self.__query_blazegraph(query)

    # TODO REMOVE: public duplicate of __retrieve_res_id_by_type()
    def retrieve_res_id_by_type(self, input_res, id_string, id_type, typ):
        if type(input_res) is GraphEntity:
            res = input_res.res
        else:
            res = URIRef(input_res)

        # First check if there is something locally
        if id_type is not None and id_string is not None:
            if str(id_type) == 'http://purl.org/spar/datacite/url':
                store = self.url_store_type_id
            elif str(id_type) == 'http://purl.org/spar/datacite/doi':
                store = self.doi_store_type_id
            elif str(id_type) == 'http://purl.org/spar/datacite/orcid':
                store = self.orcid_store_type_id
            elif str(id_type) == 'http://purl.org/spar/datacite/pmid':
                store = self.pmid_store_type_id
            elif str(id_type) == 'http://purl.org/spar/datacite/pmcid':
                store = self.pmcid_store_type_id
            elif str(id_type) == 'http://purl.org/spar/datacite/issn':
                store = self.issn_store_type_id
            elif str(id_type) == 'http://purl.org/spar/datacite/isbn':
                store = self.isbn_store_type_id
            elif str(id_type) == 'http://purl.org/spar/datacite/crossref':
                store = self.crossref_store_type_id

            if (res, id_string) in store:
                return store[(res, id_string)]

        if id_string is not None and typ != 'only_local':
            query = '''SELECT DISTINCT ?id WHERE {{
                <{}> <{}> ?id .
                ?id <{}> <{}> ;
                    <{}> "{}"
            }}'''.format(res, GraphEntity.has_identifier,
                         GraphEntity.uses_identifier_scheme, id_type,
                         GraphEntity.has_literal_value, id_string)
            return self.__query_blazegraph(query)

    def add_id_to_store(self, input_res, input_id, extracted_id,
                        store_type_id, store_type, store, is_list=False):
        if type(input_res) is GraphEntity:
            cur_res = input_res.res
        else:
            cur_res = URIRef(input_res)
        if type(input_id) is GraphEntity:
            cur_id = input_id.res
        else:
            cur_id = URIRef(input_id)

        if cur_res is not None and cur_id is not None and extracted_id is not None:
            # Check that the local store doesn't already contain the elements
            # (the second line of this 'if' should be removed)
            if (cur_res, extracted_id) not in store_type_id \
                    and ((cur_res not in store_type and not is_list) or is_list) \
                    and extracted_id not in store:
                # Add it
                store_type_id[(cur_res, extracted_id)] = cur_id
                if is_list:
                    cur_list = store_type.get(cur_res)
                    if cur_list is None:
                        cur_list = [extracted_id]
                        store_type[cur_res] = cur_list
                    if extracted_id not in cur_list:
                        cur_list.append(extracted_id)
                else:
                    store_type[cur_res] = extracted_id
                store[extracted_id] = cur_res

    def add_doi_to_store(self, input_res, input_id, extracted_id):
        return self.add_id_to_store(input_res, input_id, extracted_id,
                                    self.doi_store_type_id, self.doi_store_type,
                                    self.doi_store)

    def add_url_to_store(self, input_res, input_id, extracted_id):
        return self.add_id_to_store(input_res, input_id, extracted_id,
                                    self.url_store_type_id, self.url_store_type,
                                    self.url_store)

    def add_pmid_to_store(self, input_res, input_id, extracted_id):
        return self.add_id_to_store(input_res, input_id, extracted_id,
                                    self.pmid_store_type_id, self.pmid_store_type,
                                    self.pmid_store)

    def add_pmcid_to_store(self, input_res, input_id, extracted_id):
        return self.add_id_to_store(input_res, input_id, extracted_id,
                                    self.pmcid_store_type_id, self.pmcid_store_type,
                                    self.pmcid_store)

    def add_crossref_to_store(self, input_res, input_id, extracted_id):
        return self.add_id_to_store(input_res, input_id, extracted_id,
                                    self.crossref_store_type_id, self.crossref_store_type,
                                    self.crossref_store)

    def add_orcid_to_store(self, input_res, input_id, extracted_id):
        return self.add_id_to_store(input_res, input_id, extracted_id,
                                    self.orcid_store_type_id, self.orcid_store_type,
                                    self.orcid_store)

    def add_isbn_to_store(self, input_res, input_id, extracted_id):
        return self.add_id_to_store(input_res, input_id, extracted_id,
                                    self.isbn_store_type_id, self.isbn_store_type,
                                    self.isbn_store, True)

    def add_issn_to_store(self, input_res, input_id, extracted_id):
        return self.add_id_to_store(input_res, input_id, extracted_id,
                                    self.issn_store_type_id, self.issn_store_type,
                                    self.issn_store, True)

    def add_issue_to_store(self, input_jou, volume, issue, input_id):
        if input_jou is not None and issue is not None and input_id is not None:
            if type(input_jou) is GraphEntity:
                jou_br = input_jou.res
            else:
                jou_br = URIRef(input_jou)
            if type(input_id) is GraphEntity:
                cur_id = input_id.res
            else:
                cur_id = URIRef(input_id)
            if (jou_br, volume, issue) not in self.from_issue_partof_journal:
                self.from_issue_partof_journal[(jou_br, volume, issue)] = cur_id

    def add_volume_to_store(self, input_jou, input_id, volume):
        if input_jou is not None and volume is not None and input_id is not None:
            if type(input_jou) is GraphEntity:
                jou_br = input_jou.res
            else:
                jou_br = URIRef(input_jou)
            if type(input_id) is GraphEntity:
                cur_id = input_id.res
            else:
                cur_id = URIRef(input_id)
            # Check that the local store doesn't already contain the elements
            if (jou_br, volume) not in self.from_journal_volume:
                # Add it
                self.from_journal_volume[(jou_br, volume)] = cur_id

    def __id_with_type(self, id_string, id_type, extras="", typ='both'):
        """This method is called when we need to get the resource having a
        certain identifier. It first checks locally whether something has
        already been stored and then checks on the Blazegraph instance."""
        # First check if there is something locally
        if typ != 'only_blazegraph' and id_type is not None and id_string is not None:
            if str(id_type) == 'http://purl.org/spar/datacite/url':
                store = self.url_store
            elif str(id_type) == 'http://purl.org/spar/datacite/doi':
                store = self.doi_store
            elif str(id_type) == 'http://purl.org/spar/datacite/orcid':
                store = self.orcid_store
            elif str(id_type) == 'http://purl.org/spar/datacite/pmid':
                store = self.pmid_store
            elif str(id_type) == 'http://purl.org/spar/datacite/pmcid':
                store = self.pmcid_store
            elif str(id_type) == 'http://purl.org/spar/datacite/issn':
                store = self.issn_store
            elif str(id_type) == 'http://purl.org/spar/datacite/isbn':
                store = self.isbn_store
            elif str(id_type) == 'http://purl.org/spar/datacite/crossref':
                store = self.crossref_store
            if id_string in store:
                return store[id_string]

        # If nothing is found, query Blazegraph
        if typ != 'only_local':
            query = '''SELECT DISTINCT ?res WHERE {{
                ?res <{}> ?id .
                ?id <{}> <{}> ;
                    <{}> "{}" .
                {}
            }}'''.format(GraphEntity.has_identifier,
                         GraphEntity.uses_identifier_scheme, id_type,
                         GraphEntity.has_literal_value, id_string, extras)
            return self.__query(query, typ=typ)

    def __query(self, query, typ='only_blazegraph'):
        if self.ts is not None and (typ == 'both' or typ == 'only_blazegraph'):
            res = self.__query_blazegraph(query)
            if res is not None:
                return res

    def __query_blazegraph(self, query, typ=None):
        if self.ts is not None:
            if query in self.cache:
                return self.cache[query]
            else:
                result = self.ts.query(query)
                for res, in result:
                    self.cache[query] = res
                    return res

    def __query_local(self, query):
        # Deprecated
        if query in self.cache_local:
            result = self.cache_local[query]
        else:
            result = self.g.query(query)
            if result is not None and len(result):
                self.cache_local[query] = result
        for res, in result:
            return res
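# A minimal sketch (not part of the original sources) of how the in-memory id
# stores short-circuit triplestore lookups. The IRIs and the DOI below are
# hypothetical examples; no SPARQL endpoint is needed because typ='only_local'.
rf = ResourceFinder(ts_url=None, base_iri="https://w3id.org/oc/corpus/")
rf.add_doi_to_store("https://w3id.org/oc/corpus/br/1",   # bibliographic resource
                    "https://w3id.org/oc/corpus/id/1",   # its identifier entity
                    "10.1000/example")                   # the DOI literal
# Resolved from the local doi_store, without issuing any SPARQL query:
print(rf.retrieve_from_doi("10.1000/example", typ='only_local'))
# -> https://w3id.org/oc/corpus/br/1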
            if found:
                break
            elif tentative == 5:
                print("Process stopped at DOI '%s' due to exceptions" % cur_doi)
                exit(0)

        if found:
            to_remove[URIRef(base_iri + sub("^g(..):(.+)$", "\\1/\\2", br))] = \
                [URIRef(iden) for iden in
                 [base_iri + sub("^g(..):(.+)$", "\\1/\\2", r_id["r"])
                  for r_id in id_list]]

s = Storer(context_map={context_path: context_file_path},
           dir_split=dir_split_number, n_file_item=items_per_file,
           default_dir=default_dir)

for full_info_dir in info_dirs:
    br_iri = []
    br_files = {}
    id_files = {}
    update_br = GraphSet(base_iri, context_path)
    remove_id = GraphSet(base_iri, context_path)
    print("\n\nSupplier directory '%s'" % full_info_dir)
    to_remove = info_dirs[full_info_dir]

    br_counter = 0
    for br in to_remove:
        if br_counter == 10:  # Write everything on disk
class DatasetHandler(object):
    DCTERMS = Namespace("http://purl.org/dc/terms/")
    DCAT = Namespace("http://www.w3.org/ns/dcat#")
    VOID = Namespace("http://rdfs.org/ns/void#")
    MTT = Namespace("https://w3id.org/spar/mediatype/text/")
    DBR = Namespace("http://dbpedia.org/resource/")

    dataset = DCAT.Dataset
    datafile = DCAT.Distribution

    title = DCTERMS.title
    description = DCTERMS.description
    issued = DCTERMS.issued
    modified = DCTERMS.modified
    keyword = DCAT.keyword
    subject = DCAT.theme
    landing_page = DCAT.landingPage
    subset = VOID.subset
    sparql_endpoint = VOID.sparqlEndpoint
    distribution = DCAT.distribution
    license = DCTERMS.license
    download_url = DCAT.downloadURL
    media_type = DCAT.mediaType
    byte_size = DCAT.byteSize
    label = RDFS.label
    a = RDF.type

    turtle = MTT.turtle

    bibliographic_database = DBR.Bibliographic_database
    open_access = DBR.Open_access
    scholarly_communication = DBR.Scholarly_communication
    citations = DBR.Citation

    def __init__(self, tp_url_real, context_path, context_file_path, base_iri,
                 base_dir, info_dir, dataset_home, tmp_dir, triplestore_url=None):
        self.tp_url = triplestore_url
        self.base_iri = base_iri
        self.base_dir = base_dir
        self.info_dir = info_dir
        self.context_path = context_path
        self.dataset_home = URIRef(dataset_home)
        self.tmp_dir = tmp_dir
        self.tp_res = URIRef(tp_url_real)
        self.repok = Reporter(prefix="[DatasetHandler: INFO] ")
        self.reperr = Reporter(prefix="[DatasetHandler: ERROR] ")
        self.st = Storer(context_map={context_path: context_file_path},
                         repok=self.repok, reperr=self.reperr)
        self.st.set_preface_query(
            u"DELETE { ?res <%s> ?date } WHERE { ?res a <%s> ; <%s> ?date }" %
            (str(DatasetHandler.modified), str(DatasetHandler.dataset),
             str(DatasetHandler.modified)))

    # /START Create Literal
    def create_label(self, g, res, string):
        return create_literal(g, res, RDFS.label, string)

    def create_publication_date(self, g, res, string):
        return create_literal(g, res, self.issued, string, XSD.dateTime)

    def update_modification_date(self, g, res, string):
        g.remove((res, self.modified, None))
        return create_literal(g, res, self.modified, string, XSD.dateTime)

    def create_title(self, g, res, string):
        return create_literal(g, res, self.title, string)

    def create_description(self, g, res, string):
        return create_literal(g, res, self.description, string)

    def create_keyword(self, g, res, string):
        return create_literal(g, res, self.keyword, string)

    def create_byte_size(self, g, res, string):
        return create_literal(g, res, self.byte_size, string, XSD.decimal)
    # /END Create Literal

    # /START Create Complex Attributes
    def has_subject(self, g, res, obj):
        g.add((res, self.subject, obj))

    def has_landing_page(self, g, res, obj):
        g.add((res, self.landing_page, obj))

    def has_subset(self, g, res, obj):
        g.add((res, self.subset, obj))

    def has_sparql_endpoint(self, g, res, obj):
        g.add((res, self.sparql_endpoint, obj))

    def has_distribution(self, g, res, obj):
        g.add((res, self.distribution, obj))

    def has_license(self, g, res, obj):
        g.add((res, self.license, obj))

    def has_download_url(self, g, res, obj):
        g.add((res, self.download_url, obj))

    def has_media_type(self, g, res, obj):
        g.add((res, self.media_type, obj))
    # /END Create Complex Attributes

    # /START Types
    def dataset_type(self, g, res):
        create_type(g, res, self.dataset)

    def distribution_type(self, g, res):
        create_type(g, res, self.datafile)
    # /END Types

    def update_dataset_info(self, graph_set):
        cur_time = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
        subgraphs_to_update = set()
        all_graphs = []

        for g in graph_set.graphs():
            cur_id = g.identifier
            if cur_id not in subgraphs_to_update:
                subgraphs_to_update.add(cur_id)
                cur_dataset_res = URIRef(cur_id)
                cur_dataset = self.get_dataset_graph(cur_dataset_res, cur_time)
                self.update_modification_date(cur_dataset, cur_dataset_res, cur_time)
                all_graphs += [cur_dataset]

        if subgraphs_to_update:
            cur_occ_res = URIRef(self.base_iri)
            cur_occ = self.get_dataset_graph(cur_occ_res, cur_time)
            self.update_modification_date(cur_occ, cur_occ_res, cur_time)
            for subgraph_id in subgraphs_to_update:
                self.has_subset(cur_occ, cur_occ_res, URIRef(subgraph_id))
            all_graphs += [cur_occ]

        if all_graphs:  # Store everything and upload to the triplestore
            if self.tp_url is None:
                self.st.store_all(self.base_dir, self.base_iri, self.context_path,
                                  self.tmp_dir, all_graphs, True)
            else:
                self.st.upload_and_store(self.base_dir, self.tp_url, self.base_iri,
                                         self.context_path, self.tmp_dir,
                                         all_graphs, True)

    def get_dataset_graph(self, res, cur_time):
        dataset_path = self.get_metadata_path_from_resource(res)
        if os.path.exists(dataset_path):
            return list(self.st.load(dataset_path, tmp_dir=self.tmp_dir).contexts())[0]
        else:
            dataset_label = "ccc"
            dataset_title = "The Citations in Context Corpus"
            dataset_description = "The Citations in Context Corpus is an open repository of scholarly " \
                                  "citation data made available under a Creative Commons public " \
                                  "domain dedication, which provides in RDF accurate citation " \
                                  "information (bibliographic references) harvested from the " \
                                  "scholarly literature (described using the SPAR Ontologies) " \
                                  "that others may freely build upon, enhance and reuse for any " \
                                  "purpose, without restriction under copyright or database law."
            if re.search("/../$", str(res)) is not None:
                g = Graph(identifier=str(res))
                dataset_short_name = str(res)[-3:-1]
                dataset_name = GraphSet.labels[dataset_short_name]
                dataset_title += ": %s dataset" % dataset_name.title()
                dataset_description += " This sub-dataset contains all the '%s' resources." % \
                                       dataset_name
                dataset_label += " / %s" % dataset_short_name
                self.create_keyword(g, res, dataset_name)
            else:
                g = Graph()
                self.has_landing_page(g, res, self.dataset_home)
                self.has_sparql_endpoint(g, res, self.tp_res)

            self.dataset_type(g, res)
            self.create_label(g, res, dataset_label)
            self.create_title(g, res, dataset_title)
            self.create_description(g, res, dataset_description)
            self.create_publication_date(g, res, cur_time)
            self.create_keyword(g, res, "OCC")
            self.create_keyword(g, res, "ccc")
            self.create_keyword(g, res, "OpenCitations")
            self.create_keyword(g, res, "Citations in Context Corpus")
            self.create_keyword(g, res, "SPAR Ontologies")
            self.create_keyword(g, res, "bibliographic references")
            self.create_keyword(g, res, "citations")
            self.has_subject(g, res, self.bibliographic_database)
            self.has_subject(g, res, self.scholarly_communication)
            self.has_subject(g, res, self.open_access)
            self.has_subject(g, res, self.citations)

            return g

    def get_metadata_path_from_resource(self, dataset_res):
        return self.get_metadata_path_from_iri(str(dataset_res))

    def get_metadata_path_from_iri(self, dataset_iri):
        return re.sub("^%s" % self.base_iri, self.base_dir, dataset_iri) + "index.json"
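# A minimal sketch (not from the original sources) of refreshing the dataset
# description after a storing run. The arguments reuse the configuration values
# referenced by the scripts above; the SPARQL endpoint URL and cur_graph_set
# (a populated GraphSet) are hypothetical placeholders.
dset_handler = DatasetHandler("http://localhost:9999/blazegraph/sparql",
                              context_path, context_file_path, base_iri,
                              base_dir, full_info_dir, dataset_home,
                              temp_dir_for_rdf_loading)
dset_handler.update_dataset_info(cur_graph_set)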
        base_iri=base_iri, tmp_dir=temp_dir_for_rdf_loading,
        context_map={context_path: context_file_path},
        dir_split=dir_split_number, n_file_item=items_per_file,
        default_dir=default_dir),
    dir_split_number, items_per_file, supplier_prefix)
prov.generate_provenance()

res_storer = Storer(result,
                    context_map={context_path: context_file_path},
                    dir_split=dir_split_number, n_file_item=items_per_file,
                    default_dir=default_dir)
prov_storer = Storer(prov,
                     context_map={context_path: context_file_path},
                     dir_split=dir_split_number, n_file_item=items_per_file)

if do_parallel:
    base_share_dir = sharing_dir + sep + real_dir + \
        datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + sep