def generate_orgs_to_pubs():
    """
    Relate publications to organisations through publication cards.

    Queries the organisations matched by the first filter below, then, for
    every organisation except the one whose cid equals ``skipped_org``,
    queries its publications and records an ``org VIVO.relates pub`` triple.
    The resulting graph is handed to ``backend.sync_updates`` for the
    ``http://localhost/data/org-pubs`` named graph.
    """
    # cid that is excluded from the org -> pub relation.
    skipped_org = "638881"
    orgs_with_cards_query = (
        ' <data xmlns="http://converis/ns/webservice">'
        ' <query>'
        ' <filter for="Organisation"'
        ' xmlns="http://converis/ns/filterengine"'
        ' xmlns:sort="http://converis/ns/sortingengine">'
        ' <attribute operator="equals" argument="12000" name="intorext"/>'
        ' <relation minCount="1" name="CARD_has_ORGA">'
        ' <attribute operator="equals" argument="12006" name="typeOfCard"/>'
        ' </relation>'
        ' </filter>'
        ' </query>'
        ' </data> '
    )
    # The "{}" placeholder is filled with the organisation cid.
    pubs_by_org_template = (
        ' <data xmlns="http://converis/ns/webservice">'
        ' <query>'
        ' <filter for="Publication"'
        ' xmlns="http://converis/ns/filterengine"'
        ' xmlns:ns2="http://converis/ns/sortingengine">'
        ' <return>'
        ' <attributes>'
        ' </attributes>'
        ' </return>'
        ' <and>'
        ' <relation name="PUBL_has_CARD">'
        ' <relation relatedto="{}" name="CARD_has_ORGA">'
        ' </relation>'
        ' </relation>'
        ' </and>'
        ' </filter>'
        ' </query>'
        ' </data> '
    )

    logger.info("Fetching orgs with pub cards:\n" + orgs_with_cards_query)
    # De-duplicate the organisation ids returned by the query.
    org_ids = {
        record.cid for record in client.filter_query(orgs_with_cards_query)
    }
    logger.info("Relating {} orgs to pubs.".format(len(org_ids)))

    graph = Graph()
    for org_id in org_ids:
        if org_id != skipped_org:
            logger.info(
                "Processing orgs to pubs for org {}.".format(org_id))
            org_pubs_query = pubs_by_org_template.format(org_id)
            for publication in client.filter_query(org_pubs_query):
                triple = (
                    models.org_uri(org_id),
                    VIVO.relates,
                    models.pub_uri(publication.cid),
                )
                logger.debug(
                    "Orgs to pubs. Processing org {} pub {}.".format(
                        org_id, publication.cid))
                graph.add(triple)

    backend.sync_updates("http://localhost/data/org-pubs", graph)
def build_short_url_index():
    """
    Build a ``cid -> shortURL`` index for people and organisations.

    Runs one filter query for ``Person`` and one for ``Organisation``
    entities with a non-empty ``shortURL``. Entries for which
    ``validate_slug`` returns ``False`` are logged and skipped. The
    resulting dictionary is pickled to the path held in ``SHORT_URLS``.
    """
    people_query = (
        ' <data xmlns="http://converis/ns/webservice">'
        ' <return>'
        ' <attributes>'
        ' <attribute name="shortURL"/>'
        ' </attributes>'
        ' </return>'
        ' <query>'
        ' <filter for="Person"'
        ' xmlns="http://converis/ns/filterengine"'
        ' xmlns:sort="http://converis/ns/sortingengine">'
        ' <and>'
        ' <attribute operator="notequals" argument="" name="shortURL"/>'
        ' </and>'
        ' </filter>'
        ' </query>'
        ' </data> '
    )
    org_query = (
        ' <data xmlns="http://converis/ns/webservice">'
        ' <return>'
        ' <attributes>'
        ' <attribute name="shortURL"/>'
        ' </attributes>'
        ' </return>'
        ' <query>'
        ' <filter for="Organisation"'
        ' xmlns="http://converis/ns/filterengine"'
        ' xmlns:sort="http://converis/ns/sortingengine">'
        ' <and>'
        ' <attribute operator="notequals" argument="" name="shortURL"/>'
        ' </and>'
        ' </filter>'
        ' </query>'
        ' </data> '
    )

    index = {}
    # People first, then orgs: both share one index keyed by cid, so a later
    # entry overwrites an earlier one with the same cid.
    for label, query in (("people", people_query), ("org", org_query)):
        logger.info("Building {} shortURL index.".format(label))
        for item in client.filter_query(query):
            # Only an explicit False rejects the slug (kept from the
            # original check).
            if validate_slug(item.shorturl) is False:
                logger.info(
                    "{} - {} is not a valid shortURL.".format(
                        item.cid, item.shorturl))
                continue
            index[item.cid] = item.shorturl

    # Pickle data is bytes: open the file in binary mode so the dump also
    # works on Python 3 and is not subject to newline translation.
    with open(SHORT_URLS, 'wb') as of:
        pickle.dump(index, of)
def get_people(sample=False):
    """
    Return a graph built from the ``Person`` entities matched by the query.

    :param sample: when exactly ``True``, stop after 10 people.
    :return: the accumulated ``Graph``.
    """
    people_query = (
        ' <data xmlns="http://converis/ns/webservice">'
        ' <return>'
        ' <attributes>'
        ' <attribute name="Short description"/>'
        ' <attribute name="cfFamilyNames"/>'
        ' <attribute name="cfFirstNames"/>'
        ' <attribute name="middleName"/>'
        ' <attribute name="email"/>'
        ' <attribute name="ORCID"/>'
        ' <attribute name="academicTitle"/>'
        ' <attribute name="cfResInt"/>'
        ' </attributes>'
        ' </return>'
        ' <query>'
        ' <filter for="Person"'
        ' xmlns="http://converis/ns/filterengine"'
        ' xmlns:sort="http://converis/ns/sortingengine">'
        ' <and>'
        ' <and>'
        ' <attribute argument="12105" name="typeOfPerson" operator="equals"/>'
        ' </and>'
        ' </and>'
        ' </filter>'
        ' </query>'
        ' </data> '
    )
    graph = Graph()
    for count, person in enumerate(client.filter_query(people_query), 1):
        graph += client.to_graph(person, models.Person)
        # Sample mode caps the harvest at ten people.
        if sample is True and count >= 10:
            break
    return graph
def single_thread_harvest():
    """
    Harvest clinical trials sequentially and sync them to the backend.

    ``query`` and ``ng`` are not defined in this function; they are taken
    from the surrounding scope.
    """
    trials = Graph()
    for record in client.filter_query(query):
        trials += client.to_graph(record, models.ClinicalTrial)
    backend.sync_updates(ng, trials)
def get_pub_cards():
    """
    Return a graph built from every ``Card`` that has a publication.

    Each matched card is converted with ``models.pub_to_card`` using its
    cid. Progress is logged after every 200 cards.

    :return: the accumulated ``Graph``.
    """
    cards_query = (
        ' <data xmlns="http://converis/ns/webservice">'
        ' <return>'
        ' <attributes>'
        ' </attributes>'
        ' </return>'
        ' <query>'
        ' <filter for="Card"'
        ' xmlns="http://converis/ns/filterengine"'
        ' xmlns:sort="http://converis/ns/sortingengine">'
        ' <and>'
        ' <and>'
        ' <relation minCount="1" name="PUBL_has_CARD"/>'
        ' </and>'
        ' <!--'
        ' <and>'
        ' <relation name="PUBL_has_CARD">'
        ' <attribute argument="2014" name="publYear" operator="greaterequal"/>'
        ' </relation>'
        ' </and>'
        ' -->'
        ' </and>'
        ' </filter>'
        ' </query>'
        ' </data> '
    )
    graph = Graph()
    for count, card in enumerate(client.filter_query(cards_query), 1):
        graph += models.pub_to_card(card.cid)
        if count % 200 == 0:
            logging.info("Publications fetched: {}.".format(count))
    return graph
def harvest_journals():
    """
    Fetch all journals that have at least one publication and sync them.

    Every ``Journal`` matched by the query is converted with
    ``client.to_graph`` and the combined graph is passed to
    ``backend.sync_updates`` for ``http://localhost/data/journals``.
    """
    logger.info("Harvesting journals.")
    q = (
        ' <data xmlns="http://converis/ns/webservice">'
        ' <query>'
        ' <filter for="Journal"'
        ' xmlns="http://converis/ns/filterengine"'
        ' xmlns:sort="http://converis/ns/sortingengine">'
        ' <and>'
        ' <and>'
        ' <relation minCount="1" name="PUBL_has_JOUR"/>'
        ' </and>'
        ' </and>'
        ' </filter>'
        ' </query>'
        ' </data> '
    )
    g = Graph()
    for journal in client.filter_query(q):
        g += client.to_graph(journal, models.Journal)
    backend.sync_updates("http://localhost/data/journals", g)
def single_thread_harvest():
    """
    Harvest people sequentially and post them to the people named graph.

    Each query hit is re-fetched as a full ``Person`` entity; only entities
    whose ``fhpersontype`` cid is ``'6019159'`` are added to the graph.
    ``query`` comes from the surrounding scope.
    """
    named_graph = "http://localhost/data/people"
    people = Graph()
    for hit in client.filter_query(query):
        person = client.Entity('Person', hit.cid)
        # FH people only: skip entities without the type attribute ...
        if not hasattr(person, 'fhpersontype'):
            continue
        # ... and those whose type is not the expected one.
        if person.fhpersontype['cid'] != '6019159':
            continue
        people += client.to_graph(person, models.Person)
    backend.post_updates(named_graph, people)
def single_thread_harvest_awards(sample=True):
    """
    Fetch all awards matched by ``query`` and sync them to the backend.

    :param sample: accepted for interface compatibility; it is not used by
        this function, so the full result set is always harvested.

    ``query`` and ``NG`` are taken from the surrounding scope.
    """
    logger.info("Harvesting Awards.")
    g = Graph()
    for award in client.filter_query(query):
        g += client.to_graph(award, Award)
    backend.sync_updates(NG, g)
def harvest():
    """
    Harvest all person pictures and sync them to the backend.

    The sync is skipped, and an error logged, when the resulting graph holds
    fewer than 200 triples.
    """
    # Below this many triples the harvest is treated as incomplete.
    minimum_triples = 200

    logger.info("Harvesting all pictures.")
    pictures = Graph()
    for record in client.filter_query(QUERY):
        pictures += client.to_graph(record, PersonPicture)
    logger.info("Picture harvest complete")

    if len(pictures) >= minimum_triples:
        backend.sync_updates(NG, pictures)
        return
    logger.error("Picture data incomplete. Not updating")
def get_areas():
    """
    Return a graph built from every ``Area`` entity.

    Each area is converted with ``client.to_graph`` using
    ``models.Expertise``.

    :return: the accumulated ``Graph``.
    """
    q = (
        ' <data xmlns="http://converis/ns/webservice">'
        ' <query>'
        ' <filter for="Area"'
        ' xmlns="http://converis/ns/filterengine"'
        ' xmlns:sort="http://converis/ns/sortingengine">'
        ' </filter>'
        ' </query>'
        ' </data> '
    )
    g = Graph()
    # The enumerate() index was never read, so iterate the results directly.
    for area in client.filter_query(q):
        g += client.to_graph(area, models.Expertise)
    return g
def single_thread_harvest():
    """
    Fetch all teaching lectures matched by ``query`` and sync them.

    The harvested graph is printed as Turtle before it is passed to
    ``backend.sync_updates``. ``query`` and ``NG`` are taken from the
    surrounding scope.
    """
    logger.info("Harvesting Teaching.")
    g = Graph()
    for lecture in client.filter_query(query):
        g += client.to_graph(lecture, models.TeachingLecture)
    # Parenthesised form prints the same text on Python 2 and is valid
    # syntax on Python 3.
    print(g.serialize(format='turtle'))
    backend.sync_updates(NG, g)
def single_thread_harvest():
    """
    Fetch all positions matched by ``query`` and sync them to the backend.

    ``query`` and ``NG`` are taken from the surrounding scope.
    """
    logger.info("Harvesting Positions.")
    g = Graph()
    # A leftover ``ipdb.set_trace()`` breakpoint used to fire on every item
    # after the 100th, blocking unattended runs; it has been removed along
    # with the counter that only existed to trigger it.
    for pos in client.filter_query(query):
        g += client.to_graph(pos, models.Position)
    backend.sync_updates(NG, g)
def harvest_service(sample=False):
    """
    Fetch all service items, print the graph as N3 and sync it.

    :param sample: when exactly ``True``, stop after 100 items.

    ``service_q``, ``Service`` and ``NG`` are taken from the surrounding
    scope.
    """
    services = Graph()
    for count, record in enumerate(client.filter_query(service_q), 1):
        # NOTE(review): every cid is logged at ERROR level; this looks like
        # a debugging aid -- confirm whether a lower level is intended.
        logger.error(record.cid)
        services += client.to_graph(record, Service)
        if sample is True and count >= 100:
            break
    print(services.serialize(format='n3'))
    backend.sync_updates(NG, services)
def sample_harvest():
    """
    Harvest publications whose "Publication type" equals 10347 and sync
    them to the ``http://localhost/data/sample-books`` named graph.
    """
    books_query = (
        ' <data xmlns="http://converis/ns/webservice">'
        ' <query>'
        ' <filter for="Publication"'
        ' xmlns="http://converis/ns/filterengine"'
        ' xmlns:sort="http://converis/ns/sortingengine">'
        ' <attribute operator="equals" argument="10347" name="Publication type"/>'
        ' </filter>'
        ' </query>'
        ' </data> '
    )
    logger.info("Starting sample publications harvest.")
    publications = Graph()
    for record in client.filter_query(books_query):
        publications += client.to_graph(record, models.Publication)
    # backend.sync_updates replaces the named graph with the incoming data,
    # so anything in the system that is missing from the incoming data is
    # deleted. backend.post_updates only updates the entities present in the
    # incoming data and leaves everything else untouched.
    backend.sync_updates("http://localhost/data/sample-books", publications)
def get_orgs():
    """
    Return a graph built from the organisations whose ``intOrExt``
    attribute equals 12000.

    Each organisation is converted with ``client.to_graph`` using
    ``models.Organization``.

    :return: the accumulated ``Graph``.
    """
    internal = (
        ' <data xmlns="http://converis/ns/webservice">'
        ' <query>'
        ' <filter for="Organisation"'
        ' xmlns="http://converis/ns/filterengine"'
        ' xmlns:sort="http://converis/ns/sortingengine">'
        ' <and>'
        ' <attribute argument="12000" name="intOrExt" operator="equals"/>'
        ' </and>'
        ' </filter>'
        ' </query>'
        ' </data> '
    )
    g = Graph()
    # Only one query is run, so the former loop over a one-element list and
    # its never-read counter are gone.
    for org in client.filter_query(internal):
        g += client.to_graph(org, models.Organization)
    return g
def harvest_news(sample=False):
    """
    Fetch all news items and sync them to the
    ``http://localhost/data/news`` named graph.

    :param sample: when exactly ``True``, stop after 20 items.
    """
    logger.info("Harvesting News.")
    news_query = (
        ' <data xmlns="http://converis/ns/webservice">'
        ' <query>'
        ' <filter for="News"'
        ' xmlns="http://converis/ns/filterengine"'
        ' xmlns:sort="http://converis/ns/sortingengine">'
        ' </filter>'
        ' </query>'
        ' </data> '
    )
    graph = Graph()
    for count, item in enumerate(client.filter_query(news_query), 1):
        graph += client.to_graph(item, models.News)
        if sample is True and count >= 20:
            break
    backend.sync_updates("http://localhost/data/news", graph)
def harvest_updates(days=2, test=False):
    """
    Harvest pictures updated since ``days`` days ago and post them.

    The date returned by ``days_ago(days)`` replaces the ``2000-01-01``
    placeholder in ``QUERY``. ``days`` defaults to 2 so that yesterday's
    date is covered.

    :param days: how many days back to look; passed to ``days_ago``.
    :param test: when exactly ``True``, stop once more than 10 pictures
        have been processed.
    """
    since = days_ago(days)
    logger.info("Harvesting updated pictures since {}.".format(since))
    updates_query = QUERY.replace("2000-01-01", since)

    pictures = Graph()
    processed = 0
    for record in client.filter_query(updates_query):
        pictures += client.to_graph(record, PersonPicture)
        processed += 1
        if test is True and processed > 10:
            break

    # Nothing harvested: report it and skip the backend call.
    if not len(pictures) > 0:
        logger.info("No updated pictures found.")
        return

    backend.post_updates(NG, pictures)
    logger.info(
        "Updated picture harvest complete. Updated: {}".format(processed))
def pub_harvest():
    """
    Harvest publications linked, through cards, to people whose
    ``fhPersonType`` equals 6019159, and sync them to the
    ``http://localhost/data/publications`` named graph.
    """
    named_graph = "http://localhost/data/publications"
    pubs_query = (
        ' <data xmlns="http://converis/ns/webservice">'
        ' <query>'
        ' <filter for="Publication"'
        ' xmlns="http://converis/ns/filterengine"'
        ' xmlns:ns2="http://converis/ns/sortingengine">'
        ' <and>'
        ' <and>'
        ' <relation direction="lefttoright" name="PUBL_has_CARD">'
        ' <relation direction="righttoleft" name="PERS_has_CARD">'
        ' <attribute argument="6019159" name="fhPersonType" operator="equals"/>'
        ' </relation>'
        ' </relation>'
        ' </and>'
        ' </and>'
        ' </filter>'
        ' </query>'
        ' </data> '
    )
    publications = Graph()
    for record in client.filter_query(pubs_query):
        publications += client.to_graph(record, models.Publication)
    backend.sync_updates(named_graph, publications)
def get_pubs():
    """
    Return a graph of publications that have at least one card and a
    ``publYear`` greater than or equal to 2009.

    Each publication is converted with ``client.to_graph`` using
    ``models.Publication``.

    :return: the accumulated ``Graph``.
    """
    q = (
        ' <data xmlns="http://converis/ns/webservice">'
        ' <query>'
        ' <filter for="Publication"'
        ' xmlns="http://converis/ns/filterengine"'
        ' xmlns:sort="http://converis/ns/sortingengine">'
        ' <and>'
        ' <and>'
        ' <relation minCount="1" name="PUBL_has_CARD"/>'
        ' </and>'
        ' <and>'
        ' <attribute argument="2009" name="publYear" operator="greaterequal"/>'
        ' </and>'
        ' </and>'
        ' </filter>'
        ' </query>'
        ' </data> '
    )
    g = Graph()
    # The former ``done`` counter was incremented but never read.
    for pub in client.filter_query(q):
        g += client.to_graph(pub, models.Publication)
    return g
def single_thread_harvest():
    """
    Harvest organisations sequentially and sync them to the backend.

    ``internal_orgs_query`` and ``NG`` are taken from the surrounding scope.
    """
    organizations = Graph()
    for record in client.filter_query(internal_orgs_query):
        organizations += client.to_graph(record, models.Organization)
    backend.sync_updates(NG, organizations)
def single_thread_harvest():
    """
    Harvest education/training records sequentially and sync them.

    ``query`` and ``named_graph`` are taken from the surrounding scope.
    """
    trainings = Graph()
    for record in client.filter_query(query):
        trainings += client.to_graph(record, models.EducationTraining)
    backend.sync_updates(named_graph, trainings)