def position_start_triples(per_uri, dept, DEPT_URIS, DEFAULT_ORG_URI, g):
    """Add VIVO triples describing the start of a non-academic position.

    Mints a new vivo:NonAcademicPosition linked (in both directions) to
    the person identified by ``per_uri['id']``, plus a DateTimeInterval
    whose start DateTimeValue records the position's start date. The
    position is related to a department organization when one can be
    resolved, otherwise to ``DEFAULT_ORG_URI``.

    Args:
        per_uri: dict with key ``'id'`` holding the person's local URI id.
        dept: raw department string (e.g. ``"GEO-123 ..."``); may be falsy.
        DEPT_URIS: mapping of department codes to organization URI ids.
        DEFAULT_ORG_URI: full URI string used when the department is unknown.
        g: rdflib Graph that receives the new triples.

    Returns:
        str: local URI id of the newly created position resource.

    NOTE(review): ``title``, ``date`` and ``name`` are read as free
    (module-level) variables rather than parameters — confirm the caller
    sets them before invoking this function.
    """
    new_pos_uri = uri_gen('n', g)
    new_dtint_uri, new_dtstart_uri = uri_gen('n', g), uri_gen('n', g)
    g.add((D[new_pos_uri], RDF.type, VIVO.NonAcademicPosition))
    g.add((D[new_pos_uri], RDFS.label, Literal(title)))
    # Link position and person in both directions.
    g.add((D[new_pos_uri], VIVO.relates, D[per_uri['id']]))
    g.add((D[per_uri['id']], VIVO.relatedBy, D[new_pos_uri]))
    # Interval with only a start; the end is added elsewhere when the
    # position is closed out.
    g.add((D[new_pos_uri], VIVO.dateTimeInterval, D[new_dtint_uri]))
    g.add((D[new_dtint_uri], RDF.type, VIVO.DateTimeInterval))
    g.add((D[new_dtint_uri], VIVO.start, D[new_dtstart_uri]))
    g.add((D[new_dtstart_uri], RDF.type, VIVO.DateTimeValue))
    g.add((D[new_dtstart_uri], VIVO.dateTime,
           Literal(date, datatype=XSD.dateTime)))
    if dept:
        # Reduce e.g. "GEO-123 Geodesy" to the bare department code "GEO".
        dept = dept.split(' ')[0].split('-')[0]
        if dept in DEPT_URIS:
            g.add((D[new_pos_uri], VIVO.relates, D[DEPT_URIS[dept]]))
        else:
            g.add((D[new_pos_uri], VIVO.relates, URIRef(DEFAULT_ORG_URI)))
            # Fix: log.warn() is a deprecated alias; use log.warning().
            log.warning('{} department/organization is unknown.'.format(dept))
    else:
        g.add((D[new_pos_uri], VIVO.relates, URIRef(DEFAULT_ORG_URI)))
        log.warning('{} does not appear to belong to a department'
                    .format(name))
    return new_pos_uri
def position_end_triples(per_info, g):
    """Close out a person's position by adding an end DateTimeValue.

    If the position in ``per_info`` has no DateTimeInterval yet, a new
    one is minted and attached to the position; otherwise the existing
    interval URI from the query result is reused. A vivo:end
    DateTimeValue carrying the module-level ``date`` is then added.

    Args:
        per_info: dict of query bindings; may contain ``'dtint'`` (the
            existing interval, full URI under ``['value']``) and must
            contain ``'position'`` (full URI under ``['value']``) when
            ``'dtint'`` is absent.
        g: rdflib Graph that receives the new triples.

    NOTE(review): ``date`` is read as a free (module-level) variable —
    confirm it is set before this runs.
    """
    if 'dtint' not in per_info:
        # No interval on the position yet: mint one and attach it.
        new_id = uri_gen('n', g)
        g.add((D[new_id], RDF.type, VIVO.DateTimeInterval))
        g.add((URIRef(per_info['position']['value']),
               VIVO.dateTimeInterval, D[new_id]))
        # BUG FIX: the original passed the bare local id straight to
        # URIRef() below, yielding a relative URI instead of one in the
        # D namespace. Use the full URI so both branches agree.
        dtint_uri = str(D[new_id])
    else:
        # Existing interval: query results hold the full URI string.
        dtint_uri = per_info['dtint']['value']
    dtend_uri = uri_gen('n', g)
    g.add((URIRef(dtint_uri), VIVO.end, D[dtend_uri]))
    g.add((D[dtend_uri], RDF.type, VIVO.DateTimeValue))
    g.add((D[dtend_uri], VIVO.dateTime,
           Literal(date, datatype=XSD.dateTime)))
log.warning('{} was returned by GSAC twice, this may happen if there ' 'are multiple data intervals for a single ' 'station.'.format(chID)) # Station is in VIVO and is decommissioned, but not listed so in VIVO elif(in_vivo_list[chID] is None and station['Status']['Id'] == 'decomissioned' and station['ShortName'] not in donethat): dt = station['ToDate'] # Add a prefix if the chID starts with a number if chID[0].isdigit(): chID = 'n' + chID dt = time.strptime(dt, "%b %d, %Y %I:%M:%S %p") dt = time.strftime("%Y-%m-%dT%H:%M:%S", dt) dt_uri = uri_gen('n', g) g.add((D[chID], VIVO.dateTimeValue, D[dt_uri])) g.add((D[dt_uri], RDF.type, VIVO.DateTimeValue)) g.add((D[dt_uri], VIVO.dateTime, Literal(dt, datatype=XSD.dateTime))) g.add((D[dt_uri], VIVO.dateTimePrecision, VIVO.yearMonthDayPrecision)) donethat.append(chID) log.info("Retired: {} on {}".format(station['ShortName'], station['ToDate'])) timestamp = str(datetime.now())[:-7] if len(g) > 0: try: with open("rdf/station-update-"+timestamp+"-in.ttl", "w") as f:
while True: grants = call_nsf_api(SEARCH_KEYWORD, START_DATE, offset) if grants: log.info('NSF API returned {} grants.'.format(len(grants))) for grant in grants: if 'fundProgramName' in grant: if grant['fundProgramName'] == 'POSTDOCTORAL FELLOWSHIPS': break nsf_id = grant['id'] log.debug('Found grant #' + nsf_id + ' titled ' + grant['title']) if nsf_id not in q_info: log.info('Grant #' + nsf_id + ' not found in Connect UNAVCO ' 'database. Adding triples.') award_uri, time_int_uri = uri_gen('awd'), uri_gen('n') start_uri, end_uri = uri_gen('n'), uri_gen('n') g.add((D[award_uri], RDF.type, VIVO.Grant)) g.add((D[award_uri], RDFS.label, Literal(grant['title'], datatype=XSD.string))) g.add((D[award_uri], BIBO.abstract, Literal(grant['abstractText']))) g.add((D[award_uri], VIVO.sponsorAwardId, Literal(grant['id']))) g.add((D[award_uri], VIVO.totalAwardAmount, Literal("${:,.2f}" .format(float(grant['fundsObligatedAmt']))))) if 'agency' in grant: if grant['agency'] == 'NSF': g.add((D[award_uri], VIVO.assignedBy, D[NSF_ID])) elif grant['agency'] == 'NASA': g.add((D[award_uri], VIVO.assignedBy, D[NASA_ID]))
log.info(u'Invalid input, try again.') elif org_uri == 'new': org_uri = None break else: org_uri = URIRef(org_uri) if ringgold: g.add((org_uri, VLOCAL.ringgoldID, Literal(ringgold))) g_orgs.add((org_uri, VLOCAL.ringgoldID, Literal(ringgold))) break # If we've made it this far just add a new organization to VIVO if not org_uri: org_uri = D[uri_gen('org')] log.info('Adding organization with URI {}'.format(org_uri)) if 'University' in organization: g.add((org_uri, RDF.type, VIVO.University)) elif 'College' in organization: g.add((org_uri, RDF.type, VIVO.College)) else: g.add((org_uri, RDF.type, FOAF.Organization)) g.add((org_uri, RDFS.label, Literal(organization))) g.add((org_uri, RDFS.label, Literal(organization))) if ringgold: g.add((org_uri, VLOCAL.ringgoldID, Literal(ringgold))) new_pos_uri = D[uri_gen('n', g) + put_code] if affiliation["type"] == "EMPLOYMENT": g.add((new_pos_uri, RDF.type, VIVO.Position))
def process_doi(doi, matchlist):
    """Ingest one UNAVCO dataset DOI into the VIVO graph.

    Looks up the DOI's metadata via the UNAVCO data API, creates a
    vivo:Dataset individual with title/date/author triples, links
    related DOIs and the originating station, and persists the updated
    author ``matchlist`` to disk.

    Args:
        doi: full DOI string (the '10.7283/' prefix is stripped for the
            API lookup).
        matchlist: two-parallel-list structure [names, uris] of
            previously matched authors; rebound by name_selecter.

    Side effects: mutates the module-level graph ``g`` and the
    ``orphans`` dict, and overwrites 'matchlistfile.pickle'.
    """
    # Grab full metadata for the doi in json format
    print('Processing {}'.format(doi))
    attr = data_api_lookup(doi.replace('10.7283/', ''))

    # Publication type; coming from UNAVCO data API so assume it's a dataset
    pubtype = VIVO.Dataset
    pub_uri = uri_gen('dat')

    # Article info — pick a local dataset-type individual based on
    # keywords in the title (SAR interferogram / terrestrial laser scan /
    # default GPS-style dataset).
    if "title" in attr:
        title = attr['title'].strip()
        if 'INTERFEROGRAM' in title:
            g.add((D[pub_uri], EC.hasDatasetType, D['n803942']))
        elif 'TLS' in title:
            g.add((D[pub_uri], EC.hasDatasetType, D['n471427']))
        else:
            g.add((D[pub_uri], EC.hasDatasetType, D['n546123']))
    else:
        title = None

    # Authors
    authors = parse_authors_datacite(attr['creators'])

    # Publication date
    pub_year = (attr['publicationYear'] if 'publicationYear' in attr
                else None)
    date_uri = uri_gen('n')
    g.add((D[pub_uri], VIVO.dateTimeValue, D[date_uri]))
    add_date(D[date_uri], pub_year, g)

    # Add things to the graph
    if pubtype:
        g.add((D[pub_uri], RDF.type, pubtype))
    g.add((D[pub_uri], BIBO.doi, Literal(doi)))
    if title:
        g.add((D[pub_uri], RDFS.label, Literal(title)))

    # Loop through the list of authors, trying to check for existing
    # authors in the database
    if authors:
        for idx, (first_name, surname) in enumerate(authors):
            full_name = join_if_not_empty((first_name, surname))
            rank = idx + 1  # authorship rank is 1-based
            if full_name in matchlist[0]:
                # Already matched earlier in this run — reuse the URI.
                pos = matchlist[0].index(full_name)
                assign_authorship(matchlist[1][pos], g, pub_uri,
                                  full_name, matchlist, rank)
            else:
                # Ask the database for candidates, then (interactively)
                # pick or create one; matchlist is rebound with the result.
                roll = name_lookup(surname)
                matchlist = name_selecter(roll, full_name, g, first_name,
                                          surname, pub_uri, matchlist,
                                          rank)

    if "relatedIdentifiers" in attr:
        if attr['relatedIdentifiers']:
            print("Related DOIs: {}".format(attr['relatedIdentifiers']))
            for rel_doi in attr['relatedIdentifiers']:
                if rel_doi in datasets_in_vivo[0]:
                    rel_uri = (datasets_in_vivo[1][datasets_in_vivo[0].index(
                        rel_doi)])
                # Try the local graph
                else:
                    rel_uri = next(g.subjects(BIBO.doi, Literal(rel_doi)),
                                   None)
                # All related DOIs are assumed to be children
                # (BFO_0000050 = "part of", BFO_0000051 = "has part").
                if rel_uri:
                    g.add((URIRef(rel_uri), OBO.BFO_0000050,
                           D[pub_uri]))
                    g.add((D[pub_uri], OBO.BFO_0000051, URIRef(rel_uri)))
                else:
                    # Related DOI not found anywhere yet; remember it so
                    # the link can be made once it is ingested.
                    if pub_uri in orphans:
                        orphans[pub_uri].append(rel_doi)
                    else:
                        orphans[pub_uri] = [rel_doi]

    if "relatedPublications" in attr:
        if attr['relatedPublications']:
            print("Found related pubs, but there isn't support for this "
                  "(yet)")
            # print(attr['relatedPublications'])

    if "stationCode" in attr:
        if attr['stationCode']:
            # dataset obo:RO_0002353 station
            # station obo:RO_0002234 dataset
            if stations_in_vivo[attr['stationCode']]:
                g.add((D[pub_uri], OBO.RO_0002353,
                       URIRef(stations_in_vivo[attr['stationCode']])))
                g.add((URIRef(stations_in_vivo[attr['stationCode']]),
                       OBO.RO_0002234, D[pub_uri]))
            else:
                print("Ruh roh, could not find URI for station {}".format(
                    attr['stationCode']))

    # Persist the (possibly grown) matchlist after every DOI so an
    # interrupted run keeps its manual author matches.
    # NOTE(review): flattened source is ambiguous on whether this sits
    # inside the function or at module level — confirm against VCS.
    with open('matchlistfile.pickle', 'wb') as f:
        pickle.dump(matchlist, f)
for row in csv_f: doi = row[0] if doi not in datasets_in_vivo[0]: # It's not already in VIVO # Grab full metadata for the doi in json format cr_result = datacite_lookup(doi) print('\nProcessing ' + doi + '\n') if cr_result: # Publication type if cr_result["resourceTypeGeneral"] == 'Dataset': pubtype = VIVO.Dataset else: pubtype = None print('Not a Dataset type: ' + doi + '. Skipping@!') continue pub_uri = uri_gen('dat') # Article info subjects = cr_result["subject"] if "subject" in cr_result else None if "title" in cr_result: if cr_result["title"][0]: s = ", " title = s.join(cr_result["title"]) if 'INTERFEROGRAM' in title: g.add((D[pub_uri], EC.hasDatasetType, D['n803942'])) elif 'TLS' in title: g.add((D[pub_uri], EC.hasDatasetType, D['n471427'])) else: g.add((D[pub_uri], EC.hasDatasetType, D['n546123']))
'are multiple data intervals for a single ' 'station.'.format(chID)) # Station is in VIVO and is decommissioned, but not listed so in VIVO elif (in_vivo_list[chID] is None and station['Status']['Id'] == 'decomissioned' and station['ShortName'] not in donethat): dt = station['ToDate'] # Add a prefix if the chID starts with a number if chID[0].isdigit(): chID = 'n' + chID dt = time.strptime(dt, "%b %d, %Y %I:%M:%S %p") dt = time.strftime("%Y-%m-%dT%H:%M:%S", dt) dt_uri = uri_gen('n', g) g.add((D[chID], VIVO.dateTimeValue, D[dt_uri])) g.add((D[dt_uri], RDF.type, VIVO.DateTimeValue)) g.add((D[dt_uri], VIVO.dateTime, Literal(dt, datatype=XSD.dateTime))) g.add((D[dt_uri], VIVO.dateTimePrecision, VIVO.yearMonthDayPrecision)) donethat.append(chID) log.info("Retired: {} on {}".format(station['ShortName'], station['ToDate'])) timestamp = str(datetime.now())[:-7] if len(g) > 0: try: with open("rdf/station-update-" + timestamp + "-in.ttl", "w") as f:
while True: grants = call_nsf_api(SEARCH_KEYWORD, START_DATE, offset) if grants: log.info('NSF API returned {} grants.'.format(len(grants))) for grant in grants: if 'fundProgramName' in grant: if grant['fundProgramName'] == 'POSTDOCTORAL FELLOWSHIPS': break nsf_id = grant['id'] log.debug('Found grant #' + nsf_id + ' titled ' + grant['title']) if nsf_id not in q_info: log.info('Grant #' + nsf_id + ' not found in Connect UNAVCO ' 'database. Adding triples.') award_uri, time_int_uri = uri_gen('awd'), uri_gen('n') start_uri, end_uri = uri_gen('n'), uri_gen('n') g.add((D[award_uri], RDF.type, VIVO.Grant)) g.add((D[award_uri], RDFS.label, Literal(grant['title'], datatype=XSD.string))) g.add((D[award_uri], BIBO.abstract, Literal(grant['abstractText']))) g.add( (D[award_uri], VIVO.sponsorAwardId, Literal(grant['id']))) g.add((D[award_uri], VIVO.totalAwardAmount, Literal("${:,.2f}".format( float(grant['fundsObligatedAmt']))))) if 'agency' in grant: if grant['agency'] == 'NSF': g.add((D[award_uri], VIVO.assignedBy, D[NSF_ID])) elif grant['agency'] == 'NASA':
def new_email_triples(vcard_uri, email, g):
    """Attach a new work e-mail node to an existing vCard individual.

    Mints a vcard:Email (also typed vcard:Work) resource carrying the
    given address and links it from the vCard via vcard:hasEmail.
    """
    email_node = D[uri_gen('n', g)]
    for triple in (
            (D[vcard_uri], VCARD.hasEmail, email_node),
            (email_node, RDF.type, VCARD.Email),
            (email_node, RDF.type, VCARD.Work),
            (email_node, VCARD.email, Literal(email))):
        g.add(triple)
def new_telephone_triples(vcard_uri, phone, g):
    """Attach a new telephone node to an existing vCard individual.

    Mints a vcard:Telephone resource carrying the given number and
    links it from the vCard via vcard:hasTelephone.
    """
    phone_node = D[uri_gen('n', g)]
    for triple in (
            (D[vcard_uri], VCARD.hasTelephone, phone_node),
            (phone_node, RDF.type, VCARD.Telephone),
            (phone_node, VCARD.telephone, Literal(phone))):
        g.add(triple)
log.debug('{} found in database as "{}" with uri ' '{}.'.format(name, nickname, per_uri['id'])) break if per_uri['id']: # Look up existing info per_info = get_person_info(per_uri['id']) if 'objectType' not in per_info: g.add((D[per_uri['id']], RDF.type, VLOCAL.UNAVCOEmployee)) log.info("{} {} found in database as non-employee, adding " "employee type .".format(first_name, last_name)) else: per_info = {} log.info(u'{} could not be found in the database.'.format(name)) per_uri['id'] = uri_gen('per', g) g.add((D[per_uri['id']], RDF.type, FOAF.Person)) g.add((D[per_uri['id']], RDF.type, VLOCAL.UNAVCOEmployee)) g.add((D[per_uri['id']], RDFS.label, Literal(', '.join([last_name, first_name])))) per_info = {'vcard': {'value': None}} per_info['vcard']['value'] = new_vcard(first_name, last_name, None, g) g.add((D[per_uri['id']], OBO.ARG_2000028, D[per_info['vcard']['value']])) vcard_uri = per_info['vcard']['value'].replace(D, '') current_employees.append(per_uri['id']) if title: title = title.strip()
def process_doi(doi, matchlist):
    """Ingest one UNAVCO dataset DOI into the VIVO graph.

    Looks up the DOI's metadata via the UNAVCO data API, creates a
    vivo:Dataset individual with title/date/author triples, links
    related DOIs and the originating station, and persists the updated
    author ``matchlist`` to disk.

    Args:
        doi: full DOI string (the '10.7283/' prefix is stripped for the
            API lookup).
        matchlist: two-parallel-list structure [names, uris] of
            previously matched authors; rebound by name_selecter.

    Side effects: mutates the module-level graph ``g`` and the
    ``orphans`` dict, and overwrites 'matchlistfile.pickle'.
    """
    # Grab full metadata for the doi in json format
    print('Processing {}'.format(doi))
    attr = data_api_lookup(doi.replace('10.7283/',''))

    # Publication type; coming from UNAVCO data API so assume it's a dataset
    pubtype = VIVO.Dataset
    pub_uri = uri_gen('dat')

    # Article info — pick a local dataset-type individual based on
    # keywords in the title (SAR interferogram / terrestrial laser scan /
    # default GPS-style dataset).
    if "title" in attr:
        title = attr['title'].strip()
        if 'INTERFEROGRAM' in title:
            g.add((D[pub_uri], EC.hasDatasetType, D['n803942']))
        elif 'TLS' in title:
            g.add((D[pub_uri], EC.hasDatasetType, D['n471427']))
        else:
            g.add((D[pub_uri], EC.hasDatasetType, D['n546123']))
    else:
        title = None

    # Authors
    authors = parse_authors_datacite(attr['creators'])

    # Publication date
    pub_year = (attr['publicationYear'] if 'publicationYear' in attr
                else None)
    date_uri = uri_gen('n')
    g.add((D[pub_uri], VIVO.dateTimeValue, D[date_uri]))
    add_date(D[date_uri], pub_year, g)

    # Add things to the graph
    if pubtype:
        g.add((D[pub_uri], RDF.type, pubtype))
    g.add((D[pub_uri], BIBO.doi, Literal(doi)))
    if title:
        g.add((D[pub_uri], RDFS.label, Literal(title)))

    # Loop through the list of authors, trying to check for existing
    # authors in the database
    if authors:
        for idx, (first_name, surname) in enumerate(authors):
            full_name = join_if_not_empty((first_name, surname))
            rank = idx+1  # authorship rank is 1-based
            if full_name in matchlist[0]:
                # Already matched earlier in this run — reuse the URI.
                pos = matchlist[0].index(full_name)
                assign_authorship(matchlist[1][pos], g, pub_uri,
                                  full_name, matchlist, rank)
            else:
                # Ask the database for candidates, then (interactively)
                # pick or create one; matchlist is rebound with the result.
                roll = name_lookup(surname)
                matchlist = name_selecter(roll, full_name, g, first_name,
                                          surname, pub_uri, matchlist,
                                          rank)

    if "relatedIdentifiers" in attr:
        if attr['relatedIdentifiers']:
            print("Related DOIs: {}".format(attr['relatedIdentifiers']))
            for rel_doi in attr['relatedIdentifiers']:
                if rel_doi in datasets_in_vivo[0]:
                    rel_uri = (datasets_in_vivo[1]
                               [datasets_in_vivo[0].index(rel_doi)])
                # Try the local graph
                else:
                    rel_uri = next(g.subjects(BIBO.doi, Literal(rel_doi)),
                                   None)
                # All related DOIs are assumed to be children
                # (BFO_0000050 = "part of", BFO_0000051 = "has part").
                if rel_uri:
                    g.add((URIRef(rel_uri), OBO.BFO_0000050, D[pub_uri]))
                    g.add((D[pub_uri], OBO.BFO_0000051, URIRef(rel_uri)))
                else:
                    # Related DOI not found anywhere yet; remember it so
                    # the link can be made once it is ingested.
                    if pub_uri in orphans:
                        orphans[pub_uri].append(rel_doi)
                    else:
                        orphans[pub_uri] = [rel_doi]

    if "relatedPublications" in attr:
        if attr['relatedPublications']:
            print("Found related pubs, but there isn't support for this "
                  "(yet)")
            # print(attr['relatedPublications'])

    if "stationCode" in attr:
        if attr['stationCode']:
            # dataset obo:RO_0002353 station
            # station obo:RO_0002234 dataset
            if stations_in_vivo[attr['stationCode']]:
                g.add((D[pub_uri], OBO.RO_0002353,
                       URIRef(stations_in_vivo[attr['stationCode']])))
                g.add((URIRef(stations_in_vivo[attr['stationCode']]),
                       OBO.RO_0002234, D[pub_uri]))
            else:
                print("Ruh roh, could not find URI for station {}".format(
                    attr['stationCode']))

    # Persist the (possibly grown) matchlist after every DOI so an
    # interrupted run keeps its manual author matches.
    # NOTE(review): flattened source is ambiguous on whether this sits
    # inside the function or at module level — confirm against VCS.
    with open('matchlistfile.pickle', 'wb') as f:
        pickle.dump(matchlist, f)
break else: rep_name = vcard_uri = url = url_uri = url_rank = \ url_rank_datatype = rep_uri = None log.info(institution+' NOT found in the database. ') if args.auto_mode: user_input = '' else: user_input = raw_input('\n' + institution + ' not found ' 'in the database. Supply a URI ' '(e.g. org123456) or press Enter to' ' create a new organization.\n ') if user_input == '': # Create a new organization org_uri = uri_gen('org') role_uri = uri_gen('n') if 'University' in institution: g.add((D[org_uri], RDF.type, VIVO.University)) else: g.add((D[org_uri], RDF.type, FOAF.Organization)) g.add((D[org_uri], RDFS.label, Literal(institution))) g.add((D[org_uri], OBO.RO_0000053, D[role_uri])) if info['Type'] == 'Member Institution': g.add((D[role_uri], RDF.type, VIVO.MemberRole)) else: g.add((D[role_uri], RDF.type, VLOCAL.AssociateMemberRole)) g.add((D[role_uri], OBO.RO_0000052, D[org_uri])) g.add((D[role_uri], VIVO.roleContributesTo, D[UNAVCO_ID]))