def add_authorship_links():
    """
    Create people to authorship links.
    """
    logger.info("Adding additional authorships.")
    vstore = backend.get_store()
    while True:
        logger.info("Fetching people to authorship batch.")
        q = rq_prefixes + """
        CONSTRUCT {
            ?aship vivo:relates ?p .
        }
        WHERE {
            ?p a foaf:Person ;
                wos:daisNg ?dais .
            ?aship a vivo:Authorship ;
                wos:daisNg ?dais .
            FILTER NOT EXISTS { ?aship vivo:relates ?p }
        }
        LIMIT 500
        """
        logger.info("Authorship query:\n" + q)
        try:
            g = vstore.query(q).graph
            vstore.bulk_add(PEOPLE_AUTHORSHIP, g)
        except ResultException:
            # No more batches to link.
            break
    return True

def process(named_graph, batch, dry=False, sleep=10):
    while True:
        vstore = backend.get_store()
        logger.info("Querying {} for triples to remove.".format(named_graph))
        q = """
        CONSTRUCT { ?s ?p ?o . }
        WHERE {
            GRAPH <?g> { ?s ?p ?o . }
        }
        LIMIT ?batch
        """.replace("?g", named_graph).replace("?batch", str(batch))
        logger.info(q)
        try:
            rsp = vstore.query(q)
            g = rsp.graph
            num_found = len(g)
        except ResultException:
            logger.info("No triples to remove.")
            break
        logger.info("Removing {} triples from {}.".format(
            num_found, named_graph))
        vstore.bulk_remove(named_graph, g)
        # if num_found < batch:
        #     break
        if sleep > 0:
            logger.info("Sleeping between batches.")
            time.sleep(sleep)
    return True

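# Usage sketch (hypothetical graph name): drain a named graph 500 triples
# at a time, pausing 5 seconds between deletes. The loop above stops when
# the CONSTRUCT query raises ResultException on an empty result.
#
#   process("http://localhost/data/wos-addresses", 500, sleep=5)
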
def process(clean_file, dry=False):
    vstore = backend.get_store()
    # Map existing names to clean, preferred names.
    name_key = dict()
    with open(clean_file) as inf:
        next(inf)  # skip the header row
        for n, row in enumerate(csv.reader(inf)):
            pname, ename = row[0].strip(), row[1].strip()
            if ename == "":
                continue
            logger.info("Cleaning {} with variant {}.".format(pname, ename))
            # Always create a variant for the preferred name too.
            name_key[pname] = pname
            name_key[ename] = pname
    addg, removeg = index_orgs(name_key)
    # Remove the variant name triples from these graphs.
    graphs = [ADDRESS_GRAPH, SUBORG_GRAPH]
    for g in graphs:
        logger.info("Removing preferred name triples with {} triples from {} graph.".format(len(removeg), g))
        vstore.bulk_remove(g, removeg)
    logger.info("Adding preferred name triples with {} triples.".format(len(addg)))
    # The target graph was missing in the original call; SUBORG_GRAPH is
    # assumed here since the added triples describe sub-organizations.
    vstore.bulk_add(SUBORG_GRAPH, addg)

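# The clean file is expected to be a two-column CSV with a header row:
# the preferred (clean) name first, then one existing variant per row.
# Rows with an empty variant column are skipped. Hypothetical example:
#
#   preferred name,variant
#   Department of Physics,Dept Phys
#   Department of Physics,Physics Dept DTU
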
def build_dtu_people():
    """
    Add the wos:DTUResearcher type to people with DTU-affiliated
    authorships.
    """
    logger.info("Adding DTUResearcher type.")
    vstore = backend.get_store()
    q = rq_prefixes + """
    CONSTRUCT {
        ?person a wos:DTUResearcher .
    }
    WHERE {
        d:org-technical-university-of-denmark a wos:UnifiedOrganization ;
            vivo:relatedBy ?address .
        ?address a wos:Address ;
            vivo:relatedBy ?authorship ;
            vivo:relates d:org-technical-university-of-denmark .
        ?authorship a vivo:Authorship ;
            vivo:relates ?person .
        ?person a foaf:Person .
    }
    """
    logger.info("DTU people query:\n" + q)
    g = vstore.query(q).graph
    vstore.bulk_add(AFFILIATION_NG, g)

def build_unified_affiliation():
    """
    Relate person entities to unified organizations.
    """
    logger.info("Adding unified affiliation.")
    vstore = backend.get_store()
    while True:
        q = rq_prefixes + """
        CONSTRUCT {
            ?person a wos:ExternalResearcher .
            ?person wos:hasAffiliation ?unifOrg .
        }
        WHERE {
            ?unifOrg a wos:UnifiedOrganization ;
                vivo:relatedBy ?address .
            ?address a wos:Address ;
                vivo:relatedBy ?authorship ;
                vivo:relates ?unifOrg .
            ?authorship a vivo:Authorship ;
                vivo:relates ?person .
            ?person a foaf:Person .
            FILTER (?unifOrg != d:org-technical-university-of-denmark)
            FILTER NOT EXISTS { ?person a wos:ExternalResearcher }
        }
        LIMIT 500
        """
        logger.info("Affiliation query:\n" + q)
        try:
            g = vstore.query(q).graph
            vstore.bulk_add(AFFILIATION_NG, g)
        except ResultException:
            break

def get_unified_orgs():
    q = rq_prefixes + """
    select ?wosU ?org
    where {
        ?wosU a wos:UnifiedOrganization ;
            rdfs:label ?org .
    }
    """
    vstore = backend.get_store()
    out = []
    for row in vstore.query(q):
        out.append((row.wosU.toPython(), row.org.toPython()))
    return out

def get_journals():
    q = rq_prefixes + """
    select ?j ?issn
    where {
        ?j bibo:issn ?issn .
    }
    """
    vstore = backend.get_store()
    d = {}
    for row in vstore.query(q):
        d[row.issn.toPython()] = row.j
    return d

def get_wos_pubs():
    vstore = backend.get_store()
    rq = rq_prefixes + """
    select ?pub ?wosId
    where {
        ?pub wos:wosId ?wosId .
    }
    """
    d = {}
    logger.debug(rq)
    for row in vstore.query(rq):
        logger.debug(row)
        d[row.wosId.toPython().replace('WOS:', '')] = row.pub
    return d

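# get_wos_pubs keys the returned dict by the bare accession number, with
# the "WOS:" prefix stripped. Hypothetical entry (URI shape assumed):
#
#   {"000412345600007": rdflib.term.URIRef("http://localhost/individual/pub-000412345600007")}
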
def get_existing_people():
    logger.info("Getting existing profiles.")
    q = rq_prefixes + """
    select ?p
    where {
        ?p a foaf:Person .
    }
    """
    vstore = backend.get_store()
    out = []
    for row in vstore.query(q):
        out.append(row.p)
    return out

def get_existing_address(uri):
    vstore = backend.get_store()
    rq = rq_prefixes + """
    SELECT ?address
    WHERE {
        ?uri vivo:relatedBy ?address .
        ?address a wos:Address .
    }
    """
    rmg = Graph()
    addr_uris = []
    for row in vstore.query(rq, initBindings={'uri': uri}):
        addr_uris.append(row.address)
        # Collect both directions of the link for later removal.
        rmg.add((row.address, VIVO.relates, uri))
        rmg.add((uri, VIVO.relatedBy, row.address))
    return addr_uris, rmg

def build_email_profiles():
    """
    Builds profiles for researchers with emails and a minimum
    number of publications.
    """
    q = rq_prefixes + """
    select
        (COUNT(?aship) as ?num)
        ?email
        (SAMPLE(?fullName) AS ?name)
        (group_concat(distinct ?fullName ; separator = "|") AS ?full_names)
        (SAMPLE(?first) AS ?firstName)
        (SAMPLE(?last) AS ?lastName)
        (group_concat(distinct ?daisNg ; separator = "|") AS ?dais)
    where {
        ?aship a vivo:Authorship ;
            wos:fullName ?fullName ;
            rdfs:label ?label ;
            wos:daisNg ?daisNg ;
            wos:email ?email ;
            wos:firstName ?first ;
            wos:lastName ?last .
        FILTER NOT EXISTS {
            ?p a foaf:Person ;
                wos:daisNg ?daisNg .
        }
    }
    GROUP BY ?email
    HAVING (?num >= 3)
    ORDER BY DESC(?num)
    """
    logger.info("Email profiles query:\n" + q)
    vstore = backend.get_store()
    existing = get_existing_people()
    g = Graph()
    for person in vstore.query(q):
        name = person.name.toPython()
        email = person.email.toPython()
        dais_ids = person.dais.toPython().split("|")
        logger.info("Building profile for {} with {}.".format(name, email))
        vper = Researcher(person, dais_ids)
        if vper.uri in existing:
            logger.info("Profile exists for {}.".format(vper.uri))
            continue
        g += vper.to_rdf()
    vstore.bulk_add(PEOPLE_EMAIL_GRAPH, g)

def remove_internal_external():
    """
    Remove the wos:ExternalResearcher class from those
    that are also DTU researchers.
    """
    logger.info("Removing external researcher from internal researchers.")
    vstore = backend.get_store()
    q = rq_prefixes + """
    CONSTRUCT {
        ?r a wos:ExternalResearcher .
    }
    WHERE {
        ?r a wos:ExternalResearcher, wos:DTUResearcher .
    }
    """
    try:
        g = vstore.query(q).graph
        vstore.bulk_remove(AFFILIATION_NG, g)
    except ResultException:
        pass

def index_orgs(name_key):
    vstore = backend.get_store()
    rmg = Graph()
    addg = Graph()
    q = rq_prefixes + """
    SELECT DISTINCT ?org ?name
    WHERE {
        ?org a wos:SubOrganization ;
            rdfs:label ?label ;
            wos:subOrganizationName ?name ;
            vivo:relatedBy ?address .
        ?address a wos:Address ;
            vivo:relates ?pub, d:org-technical-university-of-denmark .
    }
    """
    for row in vstore.query(q):
        existing_name = row.name
        pname = name_key.get(existing_name.toPython())
        if pname is not None:
            logger.info("Processing existing name {} to clean name {}.".format(existing_name, pname))
            addr_uris, to_remove = get_existing_address(row.org)
            rmg += to_remove
            # Mint a sub-organization URI from the preferred name and
            # keep the existing name as a variant.
            new_uri = slug_uri(pname, prefix="dtusuborg")
            addg.add((new_uri, RDF.type, WOS.SubOrganization))
            addg.add((new_uri, RDFS.label, Literal(pname)))
            addg.add((new_uri, WOS.subOrganizationName, Literal(pname)))
            addg.add((new_uri, WOS.subOrganizationNameVariant, existing_name))
            for auri in addr_uris:
                addg.add((auri, VIVO.relates, new_uri))
    return addg, rmg

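# index_orgs returns two graphs: addg carries the new preferred-name
# sub-organization plus links from each existing address, and rmg carries
# the old address links to drop. Hypothetical triples, assuming slug_uri
# builds a d: URI from the slugified preferred name:
#
#   addg: d:dtusuborg-department-of-physics rdfs:label "Department of Physics"
#         d:address123 vivo:relates d:dtusuborg-department-of-physics
#   rmg:  d:address123 vivo:relates <old variant suborg URI>
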
def process(triple_files, format="nt", dry=False, sync=False, sleep=10, size=DEFAULT_BATCH_SIZE):
    vstore = backend.get_store()
    for fpath in triple_files:
        g = Graph()
        g.parse(source=fpath, format=format)
        # Name the graph after the file's base name.
        named_graph = NG_BASE + fpath.split("/")[-1].split(".")[0]
        logger.info("Processing updates with {} triples to {} and batch size {}.".format(len(g), named_graph, size))
        if dry is True:
            logger.info("Dry run. No changes made.")
        else:
            if sync is True:
                logger.info("Syncing graph to {}.".format(named_graph))
                added, removed = backend.sync_updates(named_graph, g, size=size)
            else:
                logger.info("Posting graph as updates to {}.".format(named_graph))
                added = vstore.bulk_add(named_graph, g, size=size)
                removed = 0
            if (added > 0) or (removed > 0):
                if sleep > 0:
                    logger.info("Sleeping for {} seconds between files.".format(sleep))
                    time.sleep(sleep)
            else:
                logger.info("No changes made to {}.".format(named_graph))
    return True

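# Usage sketch (hypothetical paths): each file loads into a named graph
# derived from its base name, e.g. "pubs.nt" -> NG_BASE + "pubs".
#
#   process(["data/rdf/pubs.nt", "data/rdf/people.nt"], sync=True, sleep=5)
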
    'RUSSIA': 'russian-federation',
    'VENEZUELA': 'venezuela-bolivarian-republic-of',
    'BOLIVIA': 'bolivia',
    'CZECH REPUBLIC': 'czech-republic',
    'IVORY COAST': 'ivory-coast',
    'TAIWAN': 'china',
    'SOUTH KOREA': 'south-korea',
    'VIETNAM': 'viet-nam',
    'IRAN': 'iran-islamic-republic-of',
    'REPUBLIC OF GEORGIA': 'georgia',
    'REUNION': 'reunion',
    'TANZANIA': 'united-republic-of-tanzania',
    'BOSNIA & HERZEGOVINA': 'bosnia-herzegovina',
}

store = backend.get_store()


def mk_slug(raw):
    clean = raw.strip().lower()
    return slugify(clean)


def fetch_vivo_countries():
    q = rq_prefixes + """
    SELECT ?uri ?label ?code
    WHERE {
        ?uri a vivo:Country ;
            rdfs:label ?label ;
            <http://aims.fao.org/aos/geopolitical.owl#codeISO3> ?code .
    }
def build_dais_profiles():
    """
    Builds profiles for researchers by DAIS and a minimum
    number of publications.
    """
    with open(ORCID_FILE) as inf:
        d_to_o = json.load(inf)
    with open(RID_FILE) as inf:
        d_to_r = json.load(inf)
    with open(AU_ID_FILE) as inf:
        auid_to_dais = json.load(inf)
    q = rq_prefixes + """
    select
        (COUNT(?aship) as ?num)
        ?dais
        (group_concat(distinct ?fullName ; separator = "|") AS ?full_names)
        (SAMPLE(?fullName) AS ?name)
        (SAMPLE(?first) AS ?firstName)
        (SAMPLE(?last) AS ?lastName)
    where {
        ?aship a vivo:Authorship ;
            vivo:relates ?addr ;
            wos:fullName ?fullName ;
            rdfs:label ?label ;
            wos:daisNg ?dais ;
            wos:firstName ?first ;
            wos:lastName ?last .
    }
    GROUP BY ?dais
    HAVING (?num >= 20)
    ORDER BY DESC(?num)
    """
    logger.info("DAIS profiles query:\n" + q)
    vstore = backend.get_store()
    g = Graph()
    for person in vstore.query(q):
        name = person.name.toPython()
        dais = person.dais.toPython()
        # Add RID, ORCID and additional DAIS if possible, but only when
        # the identifier is unambiguous.
        orcids = d_to_o.get(dais, [None])
        rids = d_to_r.get(dais, [None])
        if len(orcids) > 1:
            orcid = None
        else:
            orcid = orcids[0]
        if len(rids) > 1:
            rid = None
        else:
            rid = rids[0]
        dais_ids = [dais]
        if orcid is not None:
            dais_ids += auid_to_dais.get(orcid, [])
        elif rid is not None:
            dais_ids += auid_to_dais.get(rid, [])
        logger.info("Building profile for {} with {}.".format(name, dais))
        vper = Researcher(person, dais_ids)
        g += vper.to_rdf()
        if orcid is not None:
            g.add((vper.uri, WOS.orcid, Literal(orcid)))
        if rid is not None:
            g.add((vper.uri, VIVO.researcherId, Literal(rid)))
    vstore.sync_named_graph(PEOPLE_GRAPH, g)

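# The three identifier files are JSON mappings; shapes are assumed from
# the lookups in build_dais_profiles and build_orcid_rid_profiles below
# (hypothetical values):
#
#   ORCID_FILE:  {"<dais>": ["0000-0002-1825-0097"]}      # DAIS -> ORCIDs
#   RID_FILE:    {"<dais>": ["A-1234-2008"]}              # DAIS -> ResearcherIDs
#   AU_ID_FILE:  {"0000-0002-1825-0097": ["<dais>", ...]} # author id -> DAIS ids
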
def build_orcid_rid_profiles():
    """
    Builds profiles for researchers with RIDs or ORCIDs.
    """
    q = rq_prefixes + """
    select
        (COUNT(?aship) as ?num)
        ?dais
        (group_concat(distinct ?fullName ; separator = "|") AS ?full_names)
        (SAMPLE(?fullName) AS ?name)
        (SAMPLE(?first) AS ?firstName)
        (SAMPLE(?last) AS ?lastName)
    where {
        ?aship a vivo:Authorship ;
            wos:fullName ?fullName ;
            rdfs:label ?label ;
            wos:daisNg ?dais ;
            wos:firstName ?first ;
            wos:lastName ?last ;
            vivo:relates ?addr .
    }
    GROUP BY ?dais
    #HAVING (?num >= 3)
    """
    logger.info("Author ID profiles query:\n" + q)
    vstore = backend.get_store()
    with open(ORCID_FILE) as inf:
        d_to_o = json.load(inf)
    with open(RID_FILE) as inf:
        d_to_r = json.load(inf)
    with open(AU_ID_FILE) as inf:
        auid_to_dais = json.load(inf)
    existing = get_existing_people()
    done = []
    g = Graph()
    for person in vstore.query(q):
        orcid = None
        rid = None
        dais = person.dais.toPython()
        if dais in done:
            continue
        name = person.name.toPython()
        orcids = d_to_o.get(dais)
        rids = d_to_r.get(dais)
        if orcids is not None:
            if len(orcids) > 1:
                logger.info("Ignoring {}. Multiple ORCIDs found.".format(dais))
                continue
            orcid = orcids[0]
        elif rids is not None:
            if len(rids) > 1:
                logger.info("Ignoring {}. Multiple RIDs found.".format(dais))
                continue
            rid = rids[0]
        else:
            logger.info("Skipping {} - no RID or ORCID".format(dais))
            continue
        dais_ids = auid_to_dais[orcid or rid]
        # Skip researchers whose DAIS ids were already covered by an
        # earlier profile.
        if any(did in done for did in dais_ids):
            continue
        logger.info("Building profile for {} with {}.".format(
            name, orcid or rid))
        done += dais_ids
        vper = Researcher(person, dais_ids)
        if vper.uri in existing:
            logger.info("Profile exists with URI {}.".format(vper.uri))
            continue
        if orcid is not None:
            g.add((vper.uri, WOS.orcid, Literal(orcid)))
        elif rid is not None:
            g.add((vper.uri, VIVO.researcherId, Literal(rid)))
        g += vper.to_rdf()
    vstore.bulk_add(PEOPLE_IDENTIFIERS_GRAPH, g)