def __init__(self, graph_set=None, repok=None, reperr=None,
             context_map={}, default_dir="_", dir_split=0, n_file_item=1):
    self.dir_split = dir_split
    self.n_file_item = n_file_item
    self.context_map = context_map
    self.default_dir = default_dir
    for context_url in context_map:
        context_file_path = context_map[context_url]
        with open(context_file_path) as f:
            context_json = json.load(f)
            self.context_map[context_url] = context_json

    if graph_set is None:
        self.g = []
    else:
        self.g = graph_set.graphs()

    if repok is None:
        self.repok = Reporter(prefix="[Storer: INFO] ")
    else:
        self.repok = repok

    if reperr is None:
        self.reperr = Reporter(prefix="[Storer: ERROR] ")
    else:
        self.reperr = reperr

    self.preface_query = ""
class ReferenceProcessor(object):
    def __init__(self, stored_file, reference_dir, error_dir, stopper,
                 headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; "
                                        "rv:33.0) Gecko/20100101 Firefox/33.0"},
                 sec_to_wait=10, max_iteration=6, timeout=30,
                 debug=False, supplier_idx=()):
        self.headers = headers
        self.sec_to_wait = sec_to_wait
        self.max_iteration = max_iteration
        self.timeout = timeout
        self.stopper = stopper
        self.name = "BEE " + self.__class__.__name__
        self.repok = Reporter(print_sentences=debug, prefix="[%s - INFO] " % self.name)
        self.repok.new_article()
        self.reper = Reporter(print_sentences=debug, prefix="[%s - ERROR] " % self.name)
        self.reper.new_article()
        self.rs = BibliographicReferenceStorer(stored_file, reference_dir, error_dir, supplier_idx)

    def process(self):
        pass  # To implement in subclasses
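
# --- Illustrative sketch (not part of the original module) ----------------------------
# ReferenceProcessor is an abstract driver: concrete BEE processors subclass it and
# override process(). The subclass below is hypothetical and minimal, only touching
# members initialised above (self.repok, self.rs); it is a sketch, not one of the
# actual processors shipped with the codebase.
class DummyReferenceProcessor(ReferenceProcessor):
    def process(self):
        # A real processor would retrieve references and persist them through self.rs
        # (a BibliographicReferenceStorer); here we only log that the method ran.
        self.repok.add_sentence("Dummy processor: nothing to process.")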
def __init__(self, conf_file, sec_to_wait=10, max_iteration=6, timeout=30,
             query_interface='remote'):
    with open(conf_file) as f:
        conf_json = json.load(f)
    self.headers = {
        "Authorization": "Bearer %s" % conf_json["access_token"],
        "Content-Type": "application/json"
    }
    self.id = "ORCID"
    self.name = "SPACIN " + self.__class__.__name__
    self.repok = Reporter(prefix="[%s - INFO] " % self.name)
    self.reper = Reporter(prefix="[%s - ERROR] " % self.name)
    self.__last_query_done = None
    self.sec_to_wait = sec_to_wait
    self.max_iteration = max_iteration
    self.timeout = timeout
    if query_interface == 'local':
        self.query_interface = LocalQuery(reperr=self.reper, repok=self.repok)
    elif query_interface == 'remote':
        self.query_interface = RemoteQuery(max_iteration=max_iteration, sec_to_wait=sec_to_wait,
                                           timeout=timeout, headers=self.headers,
                                           reperr=self.reper, repok=self.repok, is_json=True)
    else:
        raise ValueError("query_interface param must be `local` or `remote`")
def __init__(self, base_iri, context_path, info_dir="", n_file_item=1,
             supplier_prefix="", forced_type=False, wanted_label=True):
    self.r_count = 0
    # A list of rdflib graphs, one per subject entity
    self.g = []
    # The following variable maps a URIRef with the graph in the graph list related to it
    self.entity_g = {}
    # The following variable maps a URIRef with the related graph entity
    self.res_to_entity = {}
    self.base_iri = base_iri
    self.context_path = context_path
    self.cur_name = "OCDM " + self.__class__.__name__
    self.n_file_item = n_file_item
    self.supplier_prefix = supplier_prefix
    self.wanted_label = wanted_label  # new
    self.forced_type = forced_type  # new

    # Graphs
    # The following structure of URL is quite important for the other classes
    # developed and should not be changed. The only part that can change is the
    # value of the base_iri.
    self.g_an = base_iri + "an/"  # new
    self.g_ar = base_iri + "ar/"
    self.g_be = base_iri + "be/"
    self.g_br = base_iri + "br/"
    self.g_ci = base_iri + "ci/"  # new
    self.g_de = base_iri + "de/"  # new
    self.g_id = base_iri + "id/"
    self.g_pl = base_iri + "pl/"  # new
    self.g_ra = base_iri + "ra/"
    self.g_re = base_iri + "re/"
    self.g_rp = base_iri + "rp/"  # new

    # Local paths
    self.info_dir = info_dir
    self.an_info_path = info_dir + "an.txt"  # new
    self.ar_info_path = info_dir + "ar.txt"
    self.be_info_path = info_dir + "be.txt"
    self.br_info_path = info_dir + "br.txt"
    self.ci_info_path = info_dir + "ci.txt"  # new, not really used
    self.de_info_path = info_dir + "de.txt"  # new
    self.id_info_path = info_dir + "id.txt"
    self.pl_info_path = info_dir + "pl.txt"  # new
    self.ra_info_path = info_dir + "ra.txt"
    self.re_info_path = info_dir + "re.txt"
    self.rp_info_path = info_dir + "rp.txt"  # new

    self.reperr = Reporter(True)
    self.reperr.new_article()
    self.repok = Reporter(True)
    self.repok.new_article()
def __init__(self, base_iri, context_base, info_dir, entries,
             n_file_item, supplier_prefix, agent_id=None):
    self.occ = None
    self.doi = None
    self.pmid = None
    self.pmcid = None
    self.url = None
    self.curator = None
    self.source = None
    self.source_provider = None
    self.entries = None
    self.reference_pointers = None

    if entries is not None:
        if "occ" in entries:
            self.occ = entries["occ"]
        if "doi" in entries:
            self.doi = entries["doi"].lower()
        if "pmid" in entries:
            self.pmid = entries["pmid"]
        if "pmcid" in entries:
            self.pmcid = entries["pmcid"]
        if "url" in entries:
            self.url = entries["url"].lower()
        if "curator" in entries:
            self.curator = entries["curator"]
        if "source" in entries:
            self.source = entries["source"]
        if "source_provider" in entries:
            self.source_provider = entries["source_provider"]
        if "references" in entries:
            self.entries = entries["references"]
        if "reference_pointers" in entries:
            self.reference_pointers = entries["reference_pointers"]

    self.name = "SPACIN " + self.__class__.__name__
    self.g_set = GraphSet(base_iri, context_base, info_dir, n_file_item, supplier_prefix,
                          wanted_label=False)  # added no-label param
    self.id = agent_id
    self.repok = Reporter(prefix="[%s - INFO] " % self.name)
    self.repok.new_article()
    self.reperr = Reporter(prefix="[%s - ERROR] " % self.name)
    self.reperr.new_article()
def __init__(self, conf_file, sec_to_wait=10, max_iteration=6, timeout=30):
    with open(conf_file) as f:
        conf_json = json.load(f)
    self.headers = {
        "Authorization": "Bearer %s" % conf_json["access_token"],
        "Content-Type": "application/json"
    }
    self.id = "ORCID"
    self.name = "SPACIN " + self.__class__.__name__
    self.repok = Reporter(prefix="[%s - INFO] " % self.name)
    self.reper = Reporter(prefix="[%s - ERROR] " % self.name)
    self.__last_query_done = None
    self.sec_to_wait = sec_to_wait
    self.max_iteration = max_iteration
    self.timeout = timeout
def __init__(self, tp_url_real, context_path, context_file_path, base_iri,
             base_dir, info_dir, dataset_home, tmp_dir, triplestore_url=None):
    self.tp_url = triplestore_url
    self.base_iri = base_iri
    self.base_dir = base_dir
    self.info_dir = info_dir
    self.context_path = context_path
    self.dataset_home = URIRef(dataset_home)
    self.tmp_dir = tmp_dir
    self.tp_res = URIRef(tp_url_real)
    self.repok = Reporter(prefix="[DatasetHandler: INFO] ")
    self.reperr = Reporter(prefix="[DatasetHandler: ERROR] ")
    self.st = Storer(context_map={context_path: context_file_path},
                     repok=self.repok, reperr=self.reperr)
    self.st.set_preface_query(
        u"DELETE { ?res <%s> ?date } WHERE { ?res a <%s> ; <%s> ?date }" %
        (str(DatasetHandler.modified), str(DatasetHandler.dataset), str(DatasetHandler.modified)))
def __init__(self, base_iri, context_path, info_dir="", n_file_item=1, supplier_prefix=""):
    self.r_count = 0
    # A list of rdflib graphs, one per subject entity
    self.g = []
    # The following variable maps a URIRef with the graph in the graph list related to it
    self.entity_g = {}
    # The following variable maps a URIRef with the related graph entity
    self.res_to_entity = {}
    self.base_iri = base_iri
    self.context_path = context_path
    self.cur_name = "OCDM " + self.__class__.__name__
    self.n_file_item = n_file_item
    self.supplier_prefix = supplier_prefix

    # Graphs
    # The following structure of URL is quite important for the other classes
    # developed and should not be changed. The only part that can change is the
    # value of the base_iri.
    self.g_ar = base_iri + "ar/"
    self.g_be = base_iri + "be/"
    self.g_br = base_iri + "br/"
    self.g_id = base_iri + "id/"
    self.g_ra = base_iri + "ra/"
    self.g_re = base_iri + "re/"

    # Local paths
    self.info_dir = info_dir
    self.ar_info_path = info_dir + "ar.txt"
    self.be_info_path = info_dir + "be.txt"
    self.br_info_path = info_dir + "br.txt"
    self.id_info_path = info_dir + "id.txt"
    self.ra_info_path = info_dir + "ra.txt"
    self.re_info_path = info_dir + "re.txt"

    self.reperr = Reporter(True)
    self.reperr.new_article()
    self.repok = Reporter(True)
    self.repok.new_article()
class GraphSet(object):
    # Labels
    labels = {
        "an": "annotation",  # new
        "ar": "agent role",
        "be": "bibliographic entry",
        "br": "bibliographic resource",
        "ci": "citation",  # new
        "de": "discourse element",  # new
        "id": "identifier",
        "pl": "single location pointer list",  # new
        "ra": "responsible agent",
        "re": "resource embodiment",
        "rp": "in-text reference pointer"  # new
    }

    def __init__(self, base_iri, context_path, info_dir="", n_file_item=1,
                 supplier_prefix="", forced_type=False, wanted_label=True):
        self.r_count = 0
        # A list of rdflib graphs, one per subject entity
        self.g = []
        # The following variable maps a URIRef with the graph in the graph list related to it
        self.entity_g = {}
        # The following variable maps a URIRef with the related graph entity
        self.res_to_entity = {}
        self.base_iri = base_iri
        self.context_path = context_path
        self.cur_name = "OCDM " + self.__class__.__name__
        self.n_file_item = n_file_item
        self.supplier_prefix = supplier_prefix
        self.wanted_label = wanted_label  # new
        self.forced_type = forced_type  # new

        # Graphs
        # The following structure of URL is quite important for the other classes
        # developed and should not be changed. The only part that can change is the
        # value of the base_iri.
        self.g_an = base_iri + "an/"  # new
        self.g_ar = base_iri + "ar/"
        self.g_be = base_iri + "be/"
        self.g_br = base_iri + "br/"
        self.g_ci = base_iri + "ci/"  # new
        self.g_de = base_iri + "de/"  # new
        self.g_id = base_iri + "id/"
        self.g_pl = base_iri + "pl/"  # new
        self.g_ra = base_iri + "ra/"
        self.g_re = base_iri + "re/"
        self.g_rp = base_iri + "rp/"  # new

        # Local paths
        self.info_dir = info_dir
        self.an_info_path = info_dir + "an.txt"  # new
        self.ar_info_path = info_dir + "ar.txt"
        self.be_info_path = info_dir + "be.txt"
        self.br_info_path = info_dir + "br.txt"
        self.ci_info_path = info_dir + "ci.txt"  # new, not really used
        self.de_info_path = info_dir + "de.txt"  # new
        self.id_info_path = info_dir + "id.txt"
        self.pl_info_path = info_dir + "pl.txt"  # new
        self.ra_info_path = info_dir + "ra.txt"
        self.re_info_path = info_dir + "re.txt"
        self.rp_info_path = info_dir + "rp.txt"  # new

        self.reperr = Reporter(True)
        self.reperr.new_article()
        self.repok = Reporter(True)
        self.repok.new_article()

    def res_count(self):  # useless?
        return self.r_count

    def get_entity(self, res):
        if res in self.res_to_entity:
            return self.res_to_entity[res]

    # Add resources related to bibliographic entities
    def add_an(self, resp_agent, source_agent=None, source=None, res=None):  # new
        return self._add(self.g_an, GraphEntity.note, res, resp_agent,
                         source_agent, source, self.an_info_path, "an")

    def add_ar(self, resp_agent, source_agent=None, source=None, res=None):
        return self._add(self.g_ar, GraphEntity.role_in_time, res, resp_agent,
                         source_agent, source, self.ar_info_path, "ar")

    def add_be(self, resp_agent, source_agent=None, source=None, res=None):
        return self._add(self.g_be, GraphEntity.bibliographic_reference, res, resp_agent,
                         source_agent, source, self.be_info_path, "be")

    def add_br(self, resp_agent, source_agent=None, source=None, res=None):
        return self._add(self.g_br, GraphEntity.expression, res, resp_agent,
                         source_agent, source, self.br_info_path, "br")

    # def add_ci(self, resp_agent, citing_res, cited_res, rp_num=None, source_agent=None, source=None, res=None):  # new
    #     return self._add_ci(self.g_ci, GraphEntity.citation, citing_res, cited_res, rp_num, res, resp_agent,
    #                         source_agent, source, self.ci_info_path, "ci")

    def add_ci(self, resp_agent, source_agent=None, source=None, res=None):  # new
        return self._add(self.g_ci, GraphEntity.citation, res, resp_agent,
                         source_agent, source, self.ci_info_path, "ci")

    def add_de(self, resp_agent, source_agent=None, source=None, res=None):  # new
        return self._add(self.g_de, GraphEntity.discourse_element, res, resp_agent,
                         source_agent, source, self.de_info_path, "de")

    def add_id(self, resp_agent, source_agent=None, source=None, res=None):
        return self._add(self.g_id, GraphEntity.identifier, res, resp_agent,
                         source_agent, source, self.id_info_path, "id")

    def add_pl(self, resp_agent, source_agent=None, source=None, res=None):  # new
        return self._add(self.g_pl, GraphEntity.singleloc_pointer_list, res, resp_agent,
                         source_agent, source, self.pl_info_path, "pl")

    def add_rp(self, resp_agent, source_agent=None, source=None, res=None):  # new
        return self._add(self.g_rp, GraphEntity.intextref_pointer, res, resp_agent,
                         source_agent, source, self.rp_info_path, "rp")

    def add_ra(self, resp_agent, source_agent=None, source=None, res=None):
        return self._add(self.g_ra, GraphEntity.agent, res, resp_agent,
                         source_agent, source, self.ra_info_path, "ra")

    def add_re(self, resp_agent, source_agent=None, source=None, res=None):
        return self._add(self.g_re, GraphEntity.manifestation, res, resp_agent,
                         source_agent, source, self.re_info_path, "re")

    # new
    def _add_ci(self, graph_url, main_type, citing_res, cited_res, rp_num, res,
                resp_agent, source_agent, source, info_file_path, short_name, list_of_entities=[]):
        cur_g = Graph(identifier=graph_url)
        self._set_ns(cur_g)
        self.g += [cur_g]

        if res is not None:
            return self._generate_entity(cur_g, res=res, resp_agent=resp_agent,
                                         source_agent=source_agent, source=source,
                                         list_of_entities=list_of_entities)
        else:
            citing_res, cited_res = str(citing_res), str(cited_res)
            citing_count = citing_res.rsplit('/', 1)[-1]
            cited_count = cited_res.rsplit('/', 1)[-1]

            if rp_num is not None:
                count = citing_count + '-' + cited_count + '/' + rp_num
            else:
                count = citing_count + '-' + cited_count

            return self._generate_entity(cur_g, res_type=main_type, resp_agent=resp_agent,
                                         source_agent=source_agent, source=source, count=count,
                                         label=None, short_name=short_name,
                                         list_of_entities=list_of_entities)

    def _add(self, graph_url, main_type, res, resp_agent, source_agent, source,
             info_file_path, short_name, list_of_entities=[]):
        cur_g = Graph(identifier=graph_url)
        self._set_ns(cur_g)
        self.g += [cur_g]

        # This is the case when 'res_or_resp_agent' is a resource. It allows one to create
        # the graph entity starting from an existing URIRef, without incrementing anything
        # at the graph set level. However, a new graph is created and reserved for such a
        # resource, and it is added to the graph set.
        if res is not None:
            return self._generate_entity(cur_g, res=res, res_type=main_type, resp_agent=resp_agent,
                                         source_agent=source_agent, source=source,
                                         list_of_entities=list_of_entities,
                                         forced_type=self.forced_type)
        # This is the case when 'res_or_resp_agent' is actually a string representing the name
        # of the responsible agent. In this case, a new individual will be created.
        else:
            self._increment()
            related_to_label = ""
            related_to_short_label = ""

            # Note: even if list_of_entities is actually a list, it seems
            # that it would be composed of at most one item (e.g. for provenance)
            if list_of_entities:
                count = str(GraphSet._add_number(
                    info_file_path, find_local_line_id(list_of_entities[0], self.n_file_item)))
                related_to_label += " related to"
                related_to_short_label += " ->"
                for idx, cur_entity in enumerate(list_of_entities):
                    if idx > 0:
                        related_to_label += ","
                        related_to_short_label += ","
                    cur_short_name = get_short_name(cur_entity)
                    cur_entity_count = get_count(cur_entity)
                    cur_entity_prefix = get_prefix(cur_entity)
                    if cur_short_name == 'ci':
                        related_to_label += " %s %s" % (self.labels[cur_short_name], cur_entity_count)
                        related_to_short_label += " %s/%s" % (cur_short_name, cur_entity_count)
                    else:
                        related_to_label += " %s %s%s" % (
                            self.labels[cur_short_name], cur_entity_prefix, cur_entity_count)
                        related_to_short_label += " %s/%s%s" % (
                            cur_short_name, cur_entity_prefix, cur_entity_count)
            else:
                count = self.supplier_prefix + str(GraphSet._add_number(info_file_path))

            if self.wanted_label:  # new
                label = "%s %s%s [%s/%s%s]" % (
                    GraphSet.labels[short_name], count, related_to_label,
                    short_name, count, related_to_short_label)
            else:
                label = None

            return self._generate_entity(cur_g, res_type=main_type, resp_agent=resp_agent,
                                         source_agent=source_agent, source=source, count=count,
                                         label=label, short_name=short_name,
                                         list_of_entities=list_of_entities,
                                         forced_type=self.forced_type)

    def _generate_entity(self, g, res=None, res_type=None, resp_agent=None, source_agent=None,
                         source=None, count=None, label=None, short_name="",
                         list_of_entities=[], forced_type=False):
        return GraphEntity(g, res=res, res_type=res_type, resp_agent=resp_agent,
                           source_agent=source_agent, source=source, count=count,
                           label=label, g_set=self, forced_type=forced_type)

    def graphs(self):
        result = []
        for cur_g in self.g:
            if len(cur_g) > 0:
                result += [cur_g]
        return result

    def _increment(self):
        self.r_count += 1

    def _set_ns(self, g):
        g.namespace_manager.bind("an", Namespace(self.g_an))  # new
        g.namespace_manager.bind("ar", Namespace(self.g_ar))
        g.namespace_manager.bind("be", Namespace(self.g_be))
        g.namespace_manager.bind("ci", Namespace(self.g_ci))  # new
        g.namespace_manager.bind("de", Namespace(self.g_de))  # new
        g.namespace_manager.bind("br", Namespace(self.g_br))
        g.namespace_manager.bind("id", Namespace(self.g_id))
        g.namespace_manager.bind("pl", Namespace(self.g_pl))  # new
        g.namespace_manager.bind("ra", Namespace(self.g_ra))
        g.namespace_manager.bind("re", Namespace(self.g_re))
        g.namespace_manager.bind("rp", Namespace(self.g_rp))  # new
        g.namespace_manager.bind("biro", GraphEntity.BIRO)
        g.namespace_manager.bind("co", GraphEntity.CO)  # new
        g.namespace_manager.bind("c4o", GraphEntity.C4O)
        g.namespace_manager.bind("cito", GraphEntity.CITO)
        g.namespace_manager.bind("datacite", GraphEntity.DATACITE)
        g.namespace_manager.bind("dcterms", GraphEntity.DCTERMS)
        g.namespace_manager.bind("deo", GraphEntity.DEO)  # new
        g.namespace_manager.bind("doco", GraphEntity.DOCO)
        g.namespace_manager.bind("fabio", GraphEntity.FABIO)
        g.namespace_manager.bind("foaf", GraphEntity.FOAF)
        g.namespace_manager.bind("frbr", GraphEntity.FRBR)
        g.namespace_manager.bind("literal", GraphEntity.LITERAL)
        g.namespace_manager.bind("oa", GraphEntity.OA)
        g.namespace_manager.bind("oco", GraphEntity.OCO)
        g.namespace_manager.bind("prism", GraphEntity.PRISM)
        g.namespace_manager.bind("pro", GraphEntity.PRO)

    @staticmethod
    def get_graph_iri(g):
        return str(g.identifier)

    @staticmethod
    def _read_number(file_path, line_number=1):
        cur_number = 0
        try:
            with open(file_path) as f:
                cur_number = int(f.readlines()[line_number - 1])
        except Exception as e:
            pass  # Do nothing
        return cur_number

    @staticmethod
    def _add_number(file_path, line_number=1):
        cur_number = GraphSet._read_number(file_path, line_number) + 1

        if not os.path.exists(os.path.dirname(file_path)):
            os.makedirs(os.path.dirname(file_path))

        if os.path.exists(file_path):
            with open(file_path) as f:
                all_lines = f.readlines()
        else:
            all_lines = []

        line_len = len(all_lines)
        zero_line_number = line_number - 1
        for i in range(line_number):
            if i >= line_len:
                all_lines += ["\n"]
            if i == zero_line_number:
                all_lines[i] = str(cur_number) + "\n"

        with open(file_path, "w") as f:
            f.writelines(all_lines)

        return cur_number
class Storer(object):
    def __init__(self, graph_set=None, repok=None, reperr=None,
                 context_map={}, default_dir="_", dir_split=0, n_file_item=1,
                 nt=False, nq=False):
        self.nt = nt
        self.nq = nq
        self.dir_split = dir_split
        self.n_file_item = n_file_item
        self.default_dir = default_dir
        if not nt and not nq:
            self.context_map = context_map
            for context_url in context_map:
                context_file_path = context_map[context_url]
                with open(context_file_path) as f:
                    context_json = json.load(f)
                    self.context_map[context_url] = context_json

        if graph_set is None:
            self.g = []
        else:
            self.g = graph_set.graphs()

        if repok is None:
            self.repok = Reporter(prefix="[Storer: INFO] ")
        else:
            self.repok = repok

        if reperr is None:
            self.reperr = Reporter(prefix="[Storer: ERROR] ")
        else:
            self.reperr = reperr

        self.preface_query = ""

    @staticmethod
    def hack_dates():
        if XSD.gYear in _toPythonMapping:
            _toPythonMapping.pop(XSD.gYear)
        if XSD.gYearMonth in _toPythonMapping:
            _toPythonMapping.pop(XSD.gYearMonth)

    def store_graphs_in_file(self, file_path, context_path):
        self.repok.new_article()
        self.reperr.new_article()
        self.repok.add_sentence("Store the graphs into a file: starting process")

        cg = ConjunctiveGraph()
        for g in self.g:
            cg.addN([item + (g.identifier, ) for item in list(g)])

        self.__store_in_file(cg, file_path, context_path)

    def store_all(self, base_dir, base_iri, context_path, tmp_dir=None,
                  g_set=[], override=False, remove_data=False):
        for g in g_set:
            self.g += [g]

        self.repok.new_article()
        self.reperr.new_article()

        self.repok.add_sentence("Starting the process")

        processed_graphs = {}
        for cur_g in self.g:
            processed_graphs = self.store(cur_g, base_dir, base_iri, context_path, tmp_dir,
                                          override, processed_graphs, False, remove_data)

        stored_graph_path = []
        for cur_file_path in processed_graphs:
            stored_graph_path += [cur_file_path]
            self.__store_in_file(processed_graphs[cur_file_path], cur_file_path, context_path)

        return stored_graph_path

    def upload_and_store(self, base_dir, triplestore_url, base_iri, context_path,
                         tmp_dir=None, g_set=[], override=False):
        stored_graph_path = self.store_all(base_dir, base_iri, context_path, tmp_dir, g_set, override)

        # If some graphs were not stored properly, then none of them will be uploaded to the
        # triplestore, but we highlight those that could have been added in principle by
        # marking them with a ".notuploaded" file.
        if None in stored_graph_path:
            for file_path in stored_graph_path:
                # Create a marker for the file not uploaded to the triplestore
                open("%s.notuploaded" % file_path, "w").close()
                self.reperr.add_sentence(
                    "[6] "
                    "The statements in the JSON-LD file '%s' were not "
                    "uploaded into the triplestore." % file_path)
        else:  # All the files have been stored
            self.upload_all(self.g, triplestore_url, base_dir)

    def query(self, query_string, triplestore_url, n_statements=None, base_dir=None):
        if query_string != "":
            try:
                tp = SPARQLWrapper(triplestore_url)
                tp.setMethod('POST')
                tp.setQuery(query_string)
                tp.query()

                if n_statements is None:
                    self.repok.add_sentence(
                        "Triplestore updated by means of a SPARQL Update query.")
                else:
                    self.repok.add_sentence(
                        "Triplestore updated with %s more RDF statements." % n_statements)

                return True
            except Exception as e:
                self.reperr.add_sentence(
                    "[1] "
                    "Graph was not loaded into the "
                    "triplestore due to communication problems: %s" % str(e))
                if base_dir is not None:
                    tp_err_dir = base_dir + os.sep + "tp_err"
                    if not os.path.exists(tp_err_dir):
                        os.makedirs(tp_err_dir)
                    cur_file_err = tp_err_dir + os.sep + \
                        datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f_not_uploaded.txt')
                    with io.open(cur_file_err, "w", encoding="utf-8") as f:
                        f.write(query_string)

        return False

    def do_action_all(self, all_g, triplestore_url, base_dir, query_f):
        result = True

        self.repok.new_article()
        self.reperr.new_article()

        query_string = None
        total_new_statements = None

        for idx, cur_g in enumerate(all_g):
            cur_idx = idx % 10
            if cur_idx == 0:
                if query_string is not None:
                    result &= self.query(query_string, triplestore_url, total_new_statements, base_dir)
                query_string = ""
                total_new_statements = 0
            else:
                query_string += " ; "

            total_new_statements += len(cur_g)
            query_string += self.get_preface_query(cur_g) + query_f(cur_g)

        if query_string is not None and query_string != "":
            result &= self.query(query_string, triplestore_url, total_new_statements, base_dir)

        return result

    def update_all(self, all_add_g, all_remove_g, triplestore_url, base_dir):
        return self.do_action_all(all_remove_g, triplestore_url, base_dir, Storer._make_delete_query) and \
               self.upload_all(all_add_g, triplestore_url, base_dir)

    def upload_all(self, all_g, triplestore_url, base_dir):
        return self.do_action_all(all_g, triplestore_url, base_dir, Storer._make_insert_query)

    def execute_upload_query(self, query_string, triplestore_url):
        self.repok.new_article()
        self.reperr.new_article()

        return self.query(query_string, triplestore_url)

    def upload(self, cur_g, triplestore_url):
        self.repok.new_article()
        self.reperr.new_article()

        query_string = Storer._make_insert_query(cur_g)
        return self.query(query_string, triplestore_url, len(cur_g))

    def set_preface_query(self, query_string):
        self.preface_query = query_string

    def get_preface_query(self, cur_g):
        if self.preface_query != "":
            if type(cur_g.identifier) is BNode:
                return "CLEAR DEFAULT ; "
            else:
                return "WITH <%s> " % str(cur_g.identifier) + self.preface_query + " ; "
        else:
            return ""

    @staticmethod
    def _make_insert_query(cur_g):
        return Storer.__make_query(cur_g, "INSERT")

    @staticmethod
    def _make_delete_query(cur_g):
        return Storer.__make_query(cur_g, "DELETE")

    @staticmethod
    def __make_query(cur_g, query_type="INSERT"):
        if type(cur_g.identifier) is BNode:
            return "%s DATA { %s }" % (
                query_type, cur_g.serialize(format="nt11", encoding="utf-8").decode("utf-8"))
        else:
            return "%s DATA { GRAPH <%s> { %s } }" % \
                   (query_type, str(cur_g.identifier),
                    cur_g.serialize(format="nt11", encoding="utf-8").decode("utf-8"))

    def __store_in_file(self, cur_g, cur_file_path, context_path):
        # Note: the following lines, from here until 'cur_json_ld', are a sort of hack to include
        # all the triples of the input graph in the final stored file. Somehow, some of them are
        # not written to that file otherwise - in particular the provenance ones.
        new_g = ConjunctiveGraph()
        for s, p, o in cur_g.triples((None, None, None)):
            g_iri = None
            for g_context in cur_g.contexts((s, p, o)):
                g_iri = g_context.identifier
                break
            new_g.addN([(s, p, o, g_iri)])

        if not self.nt and not self.nq and context_path:
            cur_json_ld = json.loads(
                new_g.serialize(format="json-ld",
                                context=self.__get_context(context_path)).decode("utf-8"))

            if isinstance(cur_json_ld, dict):
                cur_json_ld["@context"] = context_path
            else:  # it is a list
                for item in cur_json_ld:
                    item["@context"] = context_path

            with open(cur_file_path, "w") as f:
                json.dump(cur_json_ld, f, indent=4, ensure_ascii=False)
        elif self.nt:
            new_g.serialize(cur_file_path, format="nt11", encoding="utf-8")
        elif self.nq:
            new_g.serialize(cur_file_path, format="nquads", encoding="utf-8")

        self.repok.add_sentence("File '%s' added." % cur_file_path)

    def dir_and_file_paths(self, cur_g, base_dir, base_iri):
        cur_subject = set(cur_g.subjects(None, None)).pop()
        if self.nt or self.nq:
            is_json = False
        else:
            is_json = True
        return find_paths(str(cur_subject), base_dir, base_iri, self.default_dir,
                          self.dir_split, self.n_file_item, is_json=is_json)

    def update(self, add_g, remove_g, base_dir, base_iri, context_path, tmp_dir=None,
               override=False, already_processed={}, store_now=True):
        self.repok.new_article()
        self.reperr.new_article()

        if len(remove_g) > 0:
            cur_dir_path, cur_file_path = self.dir_and_file_paths(remove_g, base_dir, base_iri)

            if cur_file_path in already_processed:
                final_g = already_processed[cur_file_path]
            elif os.path.exists(cur_file_path):
                # This is a conjunctive graph that contains all the triples (and graphs)
                # the file is actually defining - they could be more than those using
                # 'cur_subject' as subject.
                final_g = self.load(cur_file_path, tmp_dir=tmp_dir)
                already_processed[cur_file_path] = final_g

            for s, p, o, g in [item + (remove_g.identifier, ) for item in list(remove_g)]:
                final_g.remove((s, p, o, g))

        if len(add_g) > 0:
            self.store(add_g, base_dir, base_iri, context_path, tmp_dir,
                       override, already_processed, store_now)
        elif len(remove_g) > 0 and store_now:
            self.__store_in_file(final_g, cur_file_path, context_path)

        return already_processed

    def store(self, cur_g, base_dir, base_iri, context_path, tmp_dir=None,
              override=False, already_processed={}, store_now=True, remove_data=False):
        self.repok.new_article()
        self.reperr.new_article()

        if len(cur_g) > 0:
            cur_dir_path, cur_file_path = self.dir_and_file_paths(cur_g, base_dir, base_iri)

            try:
                if not os.path.exists(cur_dir_path):
                    os.makedirs(cur_dir_path)

                final_g = ConjunctiveGraph()
                final_g.addN([item + (cur_g.identifier, ) for item in list(cur_g)])

                # Remove the data
                if remove_data:
                    stored_g = None
                    if cur_file_path in already_processed:
                        stored_g = already_processed[cur_file_path]
                    elif os.path.exists(cur_file_path):
                        stored_g = self.load(cur_file_path, cur_g, tmp_dir)

                    for s, p, o, g in final_g.quads((None, None, None, None)):
                        stored_g.remove((s, p, o, g))

                    final_g = stored_g
                elif not override:  # Merging the data
                    if cur_file_path in already_processed:
                        stored_g = already_processed[cur_file_path]
                        stored_g.addN(final_g.quads((None, None, None, None)))
                        final_g = stored_g
                    elif os.path.exists(cur_file_path):
                        # This is a conjunctive graph that contains all the triples (and graphs)
                        # the file is actually defining - they could be more than those using
                        # 'cur_subject' as subject.
                        final_g = self.load(cur_file_path, cur_g, tmp_dir)

                already_processed[cur_file_path] = final_g

                if store_now:
                    self.__store_in_file(final_g, cur_file_path, context_path)

                return already_processed
            except Exception as e:
                self.reperr.add_sentence(
                    "[5] It was impossible to store the RDF statements in %s. %s" %
                    (cur_file_path, str(e)))

        return None

    def __get_context(self, context_url):
        if context_url in self.context_map:
            return self.context_map[context_url]
        else:
            return context_url

    def __get_first_context(self):
        for context_url in self.context_map:
            return self.context_map[context_url]

    def load(self, rdf_file_path, cur_graph=None, tmp_dir=None):
        self.repok.new_article()
        self.reperr.new_article()

        if os.path.isfile(rdf_file_path):
            Storer.hack_dates()
            # The line above has been added for handling gYear and gYearMonth correctly.
            # More info at https://github.com/RDFLib/rdflib/issues/806.
            try:
                cur_graph = self.__load_graph(rdf_file_path, cur_graph)
            except IOError:
                if tmp_dir is not None:
                    current_file_path = tmp_dir + os.sep + "tmp_rdf_file.rdf"
                    shutil.copyfile(rdf_file_path, current_file_path)
                    try:
                        cur_graph = self.__load_graph(current_file_path, cur_graph)
                    except IOError as e:
                        self.reperr.add_sentence(
                            "[2] "
                            "It was impossible to handle the format used for "
                            "storing the file (stored in the temporary path) '%s'. "
                            "Additional details: %s" % (current_file_path, str(e)))
                    os.remove(current_file_path)
                else:
                    self.reperr.add_sentence(
                        "[3] "
                        "It was impossible to try to load the file from the "
                        "temporary path '%s' since that has not been specified in "
                        "advance" % rdf_file_path)
        else:
            self.reperr.add_sentence(
                "[4] "
                "The file specified ('%s') doesn't exist." % rdf_file_path)

        return cur_graph

    def __load_graph(self, file_path, cur_graph=None):
        formats = ["json-ld", "rdfxml", "turtle", "trig", "nt11", "nquads"]

        current_graph = ConjunctiveGraph()

        if cur_graph is not None:
            current_graph.parse(data=cur_graph.serialize(format="trig"), format="trig")

        for cur_format in formats:
            try:
                if cur_format == "json-ld":
                    with open(file_path) as f:
                        json_ld_file = json.load(f)
                        if isinstance(json_ld_file, dict):
                            json_ld_file = [json_ld_file]

                        for json_ld_resource in json_ld_file:
                            # Trick to force the use of a pre-loaded context if the format
                            # specified is JSON-LD
                            context_json = None
                            if "@context" in json_ld_resource:
                                cur_context = json_ld_resource["@context"]
                                if cur_context in self.context_map:
                                    context_json = self.__get_context(cur_context)["@context"]
                                    json_ld_resource["@context"] = context_json

                            current_graph.parse(data=json.dumps(json_ld_resource, ensure_ascii=False),
                                                format=cur_format)
                else:
                    current_graph.parse(file_path, format=cur_format)

                return current_graph
            except Exception as e:
                errors = " | " + str(e)  # Try another format

        raise IOError(
            "1", "It was impossible to handle the format used for storing the file '%s'%s" %
            (file_path, errors))
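
# --- Usage sketch (not part of the original module) ------------------------------------
# A minimal illustration of how Storer is typically driven, assuming a populated GraphSet
# such as `my_graphs` from the sketch after the GraphSet class. All paths, the context
# mapping and the SPARQL Update endpoint URL are placeholders, not values required by
# the library.
storer = Storer(my_graphs, context_map={}, default_dir="_",
                dir_split=10000, n_file_item=1000)

# Serialise every non-empty graph as JSON-LD under base_dir, mirroring the base IRI layout...
storer.store_all("corpus/", "https://w3id.org/oc/corpus/", "context.json", tmp_dir="tmp/")

# ...and, if a triplestore is available, push the same statements via SPARQL Update.
storer.upload_all(my_graphs.graphs(), "http://localhost:3030/corpus/update", "corpus/")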
"-i", "--input", dest="input", required=True, help="The file containing the RDF to execute, the JSON-LD to upload, " "or a directory containing several files with both queries and RDF.") args = arg_parser.parse_args() if args.conf is not None: my_conf = __import__(args.conf) for attr in dir(my_conf): if not attr.startswith("__"): globals()[attr] = getattr(my_conf, attr) storer = Storer(repok=Reporter(True), reperr=Reporter(True), context_map={context_path: context_file_path}) all_files = [] if os.path.isdir(args.input): for cur_dir, cur_subdir, cur_files in os.walk(args.input): for cur_file in cur_files: full_path = cur_dir + os.sep + cur_file if re.search(os.sep + "prov" + os.sep, full_path) is None and \ not full_path.endswith("index.json"): all_files += [full_path] else: all_files += [args.input] for cur_file in all_files:
class FormatProcessor(object):
    """This class is the abstract one for any kind of processor."""

    # doi_pattern = "[^A-z0-9\.]([0-9]+\.[0-9]+(\.[0-9]+)*/[^%\"# \?<>{}\^\[\]`\|\\\+]+)"
    doi_pattern = "[^A-z0-9\.](10\.[0-9]+(\.[0-9]+)*/[^%\"# \?<>{}\^\[\]`\|\\\+]+)"
    http_pattern = "(https?://([A-z]|[0-9]|%|&|\?|/|\.|_|~|-|:)+)"

    def __init__(self, base_iri, context_base, info_dir, entries,
                 n_file_item, supplier_prefix, agent_id=None):
        self.occ = None
        self.doi = None
        self.pmid = None
        self.pmcid = None
        self.url = None
        self.curator = None
        self.source = None
        self.source_provider = None
        self.entries = None
        self.reference_pointers = None

        if entries is not None:
            if "occ" in entries:
                self.occ = entries["occ"]
            if "doi" in entries:
                self.doi = entries["doi"].lower()
            if "pmid" in entries:
                self.pmid = entries["pmid"]
            if "pmcid" in entries:
                self.pmcid = entries["pmcid"]
            if "url" in entries:
                self.url = entries["url"].lower()
            if "curator" in entries:
                self.curator = entries["curator"]
            if "source" in entries:
                self.source = entries["source"]
            if "source_provider" in entries:
                self.source_provider = entries["source_provider"]
            if "references" in entries:
                self.entries = entries["references"]
            if "reference_pointers" in entries:
                self.reference_pointers = entries["reference_pointers"]

        self.name = "SPACIN " + self.__class__.__name__
        self.g_set = GraphSet(base_iri, context_base, info_dir, n_file_item, supplier_prefix,
                              wanted_label=False)  # added no-label param
        self.id = agent_id
        self.repok = Reporter(prefix="[%s - INFO] " % self.name)
        self.repok.new_article()
        self.reperr = Reporter(prefix="[%s - ERROR] " % self.name)
        self.reperr.new_article()

    def process(self):
        pass  # Implemented in the subclasses

    def graph_set(self):
        return self.g_set

    def graphs(self):
        return self.g_set.graphs()

    def message(self, mess):
        return "%s" % mess

    @staticmethod
    def clean_entry(entry):
        return quote(sa(re.sub(":", ",", entry)))

    @staticmethod
    def extract_data(string, pattern):
        if string is not None:
            result = re.search(pattern, string)
            if result:
                return result.group(1)

    @staticmethod
    def extract_doi(string):
        if string is not None:
            result = FormatProcessor.extract_data(string, FormatProcessor.doi_pattern)
            if result:
                result = re.sub("(\.|,)?$", "", result)
            return result

    @staticmethod
    def extract_url(string):
        if string is not None:
            result = FormatProcessor.extract_data(string, FormatProcessor.http_pattern)
            if result:
                result = re.sub("\\\\", "", re.sub("/?\.?$", "", result))
            return result
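
# --- Usage sketch (not part of the original module) ------------------------------------
# The static helpers above can be exercised on their own. The reference string below is a
# made-up example input, and the commented values are what the patterns are expected to
# return for it.
raw_entry = "Some cited work, 2017. See the full study at https://doi.org/10.1000/xyz123."
print(FormatProcessor.extract_doi(raw_entry))  # expected: 10.1000/xyz123
print(FormatProcessor.extract_url(raw_entry))  # expected: https://doi.org/10.1000/xyz123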
class ORCIDFinder(object):
    __api_url = "https://pub.orcid.org/v2.1/search?q="
    __personal_url = "https://pub.orcid.org/v2.1/%s/personal-details"

    def __init__(self, conf_file, sec_to_wait=10, max_iteration=6, timeout=30):
        with open(conf_file) as f:
            conf_json = json.load(f)
        self.headers = {
            "Authorization": "Bearer %s" % conf_json["access_token"],
            "Content-Type": "application/json"
        }
        self.id = "ORCID"
        self.name = "SPACIN " + self.__class__.__name__
        self.repok = Reporter(prefix="[%s - INFO] " % self.name)
        self.reper = Reporter(prefix="[%s - ERROR] " % self.name)
        self.__last_query_done = None
        self.sec_to_wait = sec_to_wait
        self.max_iteration = max_iteration
        self.timeout = timeout

    def get_last_query(self):
        return self.__last_query_done

    def get_orcid_data(self, orcid_string):
        self.repok.new_article()
        self.reper.new_article()
        self.__last_query_done = ORCIDFinder.__personal_url % orcid_string
        return get_data(self.max_iteration, self.sec_to_wait, self.__last_query_done,
                        self.headers, self.timeout, self.repok, self.reper)

    def get_orcid_records(self, doi_string, family_names=[]):
        self.repok.new_article()
        self.reper.new_article()

        cur_query = "digital-object-ids:\"%s\"" % doi_string
        if family_names:
            cur_query += " AND ("
            first_name = True
            for idx, family_name in enumerate(family_names):
                if family_name is not None:
                    if first_name:
                        first_name = False
                    else:
                        cur_query += " OR "
                    cur_query += "family-name:\"%s\"" % na("" + family_name)
            cur_query += ")"

        self.__last_query_done = ORCIDFinder.__api_url + quote(cur_query)
        return get_data(self.max_iteration, self.sec_to_wait, self.__last_query_done,
                        self.headers, self.timeout, self.repok, self.reper)

    def get_orcid_ids(self, doi_string, family_names=[]):
        result = []

        records = self.get_orcid_records(doi_string, family_names)
        if records is not None:
            for orcid_id in dg(records, ["result", "orcid-identifier", "path"]):
                personal_details = self.get_orcid_data(orcid_id)
                if personal_details is not None:
                    given_name = dg(personal_details, ["name", "given-names", "value"])
                    family_name = dg(personal_details, ["name", "family-name", "value"])
                    credit_name = dg(personal_details, ["name", "credit-name", "value"])
                    other_names = dg(personal_details, ["other-names", "other-name", "content"])
                    result += [da({
                        "orcid": orcid_id,
                        "given": given_name,
                        "family": family_name,
                        "credit": credit_name,
                        "other": other_names
                    })]

        return result
description="This script create an nt file given a directory " "of the OCC containing data") arg_parser.add_argument("-i", "--input", dest="input", required=True, help="The directory containing the json-ld data.") arg_parser.add_argument("-o", "--output", dest="output", required=True, help="The output file.") args = arg_parser.parse_args() repok = Reporter(True, prefix="[creatent.py: INFO] ") reperr = Reporter(True, prefix="[creatent.py: ERROR] ") repok.new_article() reperr.new_article() s = Storer(context_map={context_path: context_file_path}, dir_split=dir_split_number, n_file_item=items_per_file, default_dir=default_dir) for cur_dir, cur_subdir, cur_files in os.walk(args.input): with open(args.output, 'a') as f: for cur_file in cur_files: if match("^[0-9]+\.json", cur_file) is not None: cur_g = s.load(cur_dir + os.sep + cur_file, tmp_dir=temp_dir_for_rdf_loading)
class ORCIDFinder(object):
    __api_url = "https://pub.orcid.org/v2.1/search?q="
    __personal_url = "https://pub.orcid.org/v2.1/%s/personal-details"

    def __init__(self, conf_file, sec_to_wait=10, max_iteration=6, timeout=30,
                 query_interface='remote'):
        with open(conf_file) as f:
            conf_json = json.load(f)
        self.headers = {
            "Authorization": "Bearer %s" % conf_json["access_token"],
            "Content-Type": "application/json"
        }
        self.id = "ORCID"
        self.name = "SPACIN " + self.__class__.__name__
        self.repok = Reporter(prefix="[%s - INFO] " % self.name)
        self.reper = Reporter(prefix="[%s - ERROR] " % self.name)
        self.__last_query_done = None
        self.sec_to_wait = sec_to_wait
        self.max_iteration = max_iteration
        self.timeout = timeout
        if query_interface == 'local':
            self.query_interface = LocalQuery(reperr=self.reper, repok=self.repok)
        elif query_interface == 'remote':
            self.query_interface = RemoteQuery(max_iteration=max_iteration, sec_to_wait=sec_to_wait,
                                               timeout=timeout, headers=self.headers,
                                               reperr=self.reper, repok=self.repok, is_json=True)
        else:
            raise ValueError("query_interface param must be `local` or `remote`")

    def get_last_query(self):
        return self.__last_query_done

    def get_orcid_data(self, orcid_string):
        self.repok.new_article()
        self.reper.new_article()
        self.__last_query_done = ORCIDFinder.__personal_url % orcid_string
        print(self.__last_query_done)
        return self.query_interface.get_orcid_data(orcid_string)

    def get_orcid_records(self, doi_string, family_names=[]):
        self.repok.new_article()
        self.reper.new_article()

        # If we're making a local query, we only need to use the DOI string
        if isinstance(self.query_interface, LocalQuery):
            return self.query_interface.get_orcid_records(doi_string.lower())
        # Otherwise we need to set up the query in the format that follows
        else:
            cur_query = "doi-self:\"%s\"" % doi_string
            doi_string_l = doi_string.lower()
            doi_string_u = doi_string.upper()
            if doi_string_l != doi_string or doi_string_u != doi_string:
                cur_query = "(" + cur_query
                if doi_string_l != doi_string:
                    cur_query += " OR doi-self:\"%s\"" % doi_string_l
                if doi_string_u != doi_string:
                    cur_query += " OR doi-self:\"%s\"" % doi_string_u
                cur_query += ")"

            if family_names:
                cur_query += " AND ("
                first_name = True
                for idx, family_name in enumerate(family_names):
                    if family_name is not None:
                        if first_name:
                            first_name = False
                        else:
                            cur_query += " OR "
                        cur_query += "family-name:\"%s\"" % na("" + family_name)
                cur_query += ")"

            self.__last_query_done = ORCIDFinder.__api_url + quote(cur_query)

            returned_data = self.query_interface.get_orcid_records(quote(cur_query))
            return returned_data

    def get_orcid_ids(self, doi_string, family_names=[]):
        result = []

        records = self.get_orcid_records(doi_string, family_names)
        if records is not None:
            if isinstance(self.query_interface, RemoteQuery):
                for orcid_id in dg(records, ["result", "orcid-identifier", "path"]):
                    personal_details = self.get_orcid_data(orcid_id)
                    if personal_details is not None:
                        given_name = dg(personal_details, ["name", "given-names", "value"])
                        family_name = dg(personal_details, ["name", "family-name", "value"])
                        credit_name = dg(personal_details, ["name", "credit-name", "value"])
                        other_names = dg(personal_details, ["other-names", "other-name", "content"])
                        result += [da({
                            "orcid": orcid_id,
                            "given": given_name,
                            "family": family_name,
                            "credit": credit_name,
                            "other": other_names
                        })]
            else:
                for author in records:
                    result += [da({
                        "orcid": author['orcid'],
                        "given": author['given_names'],
                        "family": author['family_name'],
                        "credit": "",  # actually we don't manage this
                        "other": ""  # actually we don't manage this
                    })]

        return result
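
# --- Usage sketch (not part of the original module) ------------------------------------
# A minimal illustration of querying ORCID through the class above. The configuration
# file name, DOI and family name are placeholders: the JSON configuration only needs the
# "access_token" entry read in __init__, and a valid token is required for remote queries.
finder = ORCIDFinder("orcid_conf.json", query_interface="remote")

# Retrieve the ORCID records of the authors of a DOI, optionally filtered by family name.
for author in finder.get_orcid_ids("10.1000/xyz123", ["Doe"]):
    print(author)

print(finder.get_last_query())  # the last ORCID API call that was issued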