def single_thread_harvest():
    g = Graph()
    for item in client.filter_query(query):
        g += client.to_graph(item, models.Expertise)
        #print item.cid, item.name
        #print>>sys.stderr, "adding triples", len(g)
    backend.sync_updates(ng, g)
def generate_local_coauthor():
    """
    Run SPARQL query to generate a boolean indicating that the person
    has a local coauthor.
    """
    logger.info("Generating local coauthor flag.")
    g = models.create_local_coauthor_flag()
    backend.sync_updates("http://localhost/data/local-coauthors", g)
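# models.create_local_coauthor_flag is not shown in this collection; given the
# docstring, it presumably runs a SPARQL CONSTRUCT against the store, much like
# related_videos further below. A hypothetical sketch, assuming an
# fhd:localCoauthor boolean property (the real property name and query are not
# confirmed by the source):
def create_local_coauthor_flag():
    q = models.rq_prefixes + """
    CONSTRUCT { ?p fhd:localCoauthor true . }
    WHERE {
        ?aship a vivo:Authorship ;
            vivo:relates ?p, ?pub .
        ?other a vivo:Authorship ;
            vivo:relates ?coauthor, ?pub .
        ?p a foaf:Person .
        ?coauthor a foaf:Person .
        FILTER (?p != ?coauthor)
    }
    """
    vstore = models.get_store()
    try:
        g = vstore.query(q)
    except ResultException:
        # No results; fall back to an empty graph.
        g = Graph()
    return g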
def harvest_orgs():
    """
    Fetches all internal orgs and cards associated with those orgs.
    """
    logger.info("Harvesting orgs.")
    g = get_orgs()
    #print g.serialize(format='n3')
    backend.sync_updates("http://localhost/data/orgs", g)
def clear_pub_cards():
    """
    Delete all the pubs-cards named graphs by syncing an empty graph to each.
    """
    # Get pub cards.
    cards = get_pub_cards()
    for card_uri, card in cards:
        g = Graph()
        backend.sync_updates("http://localhost/data/pubs-card-{}".format(card), g)
def generate_orgs_to_pubs():
    """
    Relate pubs to orgs through publication cards.
    """
    top_org = "638881"
    internal_orgs_query = """
    <data xmlns="http://converis/ns/webservice">
        <query>
            <filter for="Organisation" xmlns="http://converis/ns/filterengine" xmlns:sort="http://converis/ns/sortingengine">
                <attribute operator="equals" argument="12000" name="intorext"/>
                <relation minCount="1" name="CARD_has_ORGA">
                    <attribute operator="equals" argument="12006" name="typeOfCard"/>
                </relation>
            </filter>
        </query>
    </data>
    """
    pubs_for_org_query = """
    <data xmlns="http://converis/ns/webservice">
        <query>
            <filter for="Publication" xmlns="http://converis/ns/filterengine" xmlns:ns2="http://converis/ns/sortingengine">
                <return>
                    <attributes></attributes>
                </return>
                <and>
                    <relation name="PUBL_has_CARD">
                        <relation relatedto="{}" name="CARD_has_ORGA"></relation>
                    </relation>
                </and>
            </filter>
        </query>
    </data>
    """
    logger.info("Fetching orgs with pub cards:\n" + internal_orgs_query)
    orgs = []
    for org in client.filter_query(internal_orgs_query):
        orgs.append(org.cid)
    org_set = set(orgs)
    logger.info("Relating {} orgs to pubs.".format(len(org_set)))
    g = Graph()
    for oid in org_set:
        if oid == top_org:
            continue
        logger.info("Processing orgs to pubs for org {}.".format(oid))
        q = pubs_for_org_query.format(oid)
        ouri = models.org_uri(oid)
        for pub in client.filter_query(q):
            pub_uri = models.pub_uri(pub.cid)
            logger.debug("Orgs to pubs. Processing org {} pub {}.".format(oid, pub.cid))
            g.add((ouri, VIVO.relates, pub_uri))
    backend.sync_updates("http://localhost/data/org-pubs", g)
def harvest_areas():
    """
    Gets all areas, narrower terms, and any researchers associated
    with them (~367 areas).
    """
    logger.info("Harvesting areas.")
    a = get_areas()
    #print a.serialize(format='n3')
    backend.sync_updates("http://localhost/data/areas", a)
def single_thread_harvest_awards(sample=True):
    """
    Fetch all awards.
    """
    logger.info("Harvesting Awards.")
    g = Graph()
    done = 0
    for award in client.filter_query(query):
        g += client.to_graph(award, Award)
        done += 1
        # Stop early when sampling (20-item cutoff, matching harvest_news).
        if (sample is True) and (done >= 20):
            break
    backend.sync_updates(NG, g)
def harvest():
    """
    Fetch all pics and sync to the store.
    """
    logger.info("Harvesting all pictures.")
    g = Graph()
    for per_pict in client.filter_query(QUERY):
        g += client.to_graph(per_pict, PersonPicture)
    logger.info("Picture harvest complete")
    # Don't replace the named graph with an obviously incomplete harvest.
    if len(g) < 200:
        logger.error("Picture data incomplete. Not updating")
    else:
        backend.sync_updates(NG, g)
def single_thread_harvest():
    """
    Fetch all teaching items.
    """
    logger.info("Harvesting Teaching.")
    g = Graph()
    done = 0
    for lecture in client.filter_query(query):
        g += client.to_graph(lecture, models.TeachingLecture)
        done += 1
        #if (done >= 20):
        #    break
    #print g.serialize(format='turtle')
    backend.sync_updates(NG, g)
def single_thread_harvest():
    """
    Fetch all positions.
    """
    logger.info("Harvesting Positions.")
    g = Graph()
    done = 0
    for pos in client.filter_query(query):
        g += client.to_graph(pos, models.Position)
        done += 1
        # Debugger breakpoint left over from development:
        #if done > 100:
        #    import ipdb; ipdb.set_trace()
    backend.sync_updates(NG, g)
def generate_authorships():
    """
    Run SPARQL query to generate authorships by joining on
    converis:pubCardId.
    """
    g = Graph()
    for person_uri, card_id in models.get_pub_cards():
        for pub in client.get_related_ids('Publication', card_id, 'PUBL_has_CARD'):
            pub_uri = models.pub_uri(pub)
            # Mint a deterministic URI for the authorship from the person/pub pair.
            uri = models.hash_uri("authorship", person_uri.toPython() + pub_uri.toPython())
            g.add((uri, RDF.type, VIVO.Authorship))
            g.add((uri, VIVO.relates, person_uri))
            g.add((uri, VIVO.relates, pub_uri))
    backend.sync_updates("http://localhost/data/authorship", g)
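# models.hash_uri is relied on above to mint stable, deterministic URIs so
# that re-running the harvest yields the same authorship identifiers. Its
# implementation isn't shown in this collection; a minimal sketch, assuming
# an md5-based scheme and a hypothetical base namespace:
import hashlib

from rdflib import URIRef

DATA_BASE = "http://localhost/data/individual/"  # hypothetical base namespace

def hash_uri(prefix, seed):
    # Hashing the seed keeps the URI stable across runs for the same input.
    digest = hashlib.md5(seed.encode("utf-8")).hexdigest()
    return URIRef("{}{}-{}".format(DATA_BASE, prefix, digest))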
def process_pub_card(card):
    """
    Process publication card relations. We should maybe just generate the
    authorship here too and eliminate the need for the post-ingest query;
    see the sketch below.
    """
    logger.info("Fetching pubs for card {}.".format(card))
    g = Graph()
    # Relate pub to card.
    for pub in client.get_related_entities('Publication', card, 'PUBL_has_CARD'):
        pub_uri = models.pub_uri(pub.cid)
        g.add((pub_uri, CONVERIS.pubCardId, Literal(card)))
        g += client.to_graph(pub, models.Publication)
    backend.sync_updates("http://localhost/data/pubs-card-{}".format(card), g)
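# A sketch of the inlined-authorship idea from the docstring above, reusing
# the minting pattern from generate_authorships. The person_uri parameter is
# hypothetical; the source doesn't show how the card's owner is looked up:
def process_pub_card_with_authorships(card, person_uri):
    g = Graph()
    for pub in client.get_related_entities('Publication', card, 'PUBL_has_CARD'):
        pub_uri = models.pub_uri(pub.cid)
        g.add((pub_uri, CONVERIS.pubCardId, Literal(card)))
        g += client.to_graph(pub, models.Publication)
        # Inline the authorship so no post-ingest join is needed.
        auth_uri = models.hash_uri("authorship", person_uri.toPython() + pub_uri.toPython())
        g.add((auth_uri, RDF.type, VIVO.Authorship))
        g.add((auth_uri, VIVO.relates, person_uri))
        g.add((auth_uri, VIVO.relates, pub_uri))
    backend.sync_updates("http://localhost/data/pubs-card-{}".format(card), g)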
def harvest_service(sample=False):
    """
    Fetch all service items.
    """
    g = Graph()
    done = 0
    for item in client.filter_query(service_q):
        logger.debug(item.cid)
        g += client.to_graph(item, Service)
        done += 1
        if (sample is True) and (done >= 100):
            break
    #print g.serialize(format='n3')
    backend.sync_updates(NG, g)
def sample_harvest():
    q = """
    <data xmlns="http://converis/ns/webservice">
        <query>
            <filter for="Publication" xmlns="http://converis/ns/filterengine" xmlns:sort="http://converis/ns/sortingengine">
                <attribute operator="equals" argument="10347" name="Publication type"/>
            </filter>
        </query>
    </data>
    """
    logger.info("Starting sample publications harvest.")
    g = Graph()
    for item in client.filter_query(q):
        g += client.to_graph(item, models.Publication)
    #print g.serialize(format="turtle")
    # backend.sync_updates replaces the named graph with the incoming data,
    # meaning any data in the system that's not in the incoming data will be
    # deleted. backend.post_updates will only update the entities that are in
    # the incoming data; anything else is left as it is.
    backend.sync_updates("http://localhost/data/sample-books", g)
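# Per the comment above, backend.post_updates is the non-destructive
# alternative: it only updates entities present in the incoming data. A usage
# sketch, assuming post_updates takes the same (named_graph, graph) arguments
# as sync_updates (the comment names the function but not its signature):
def sample_harvest_incremental(q):
    g = Graph()
    for item in client.filter_query(q):
        g += client.to_graph(item, models.Publication)
    # Merge into the named graph; entities absent from g are left untouched.
    backend.post_updates("http://localhost/data/sample-books", g)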
def harvest_news(sample=False):
    """
    Fetch all news items.
    """
    logger.info("Harvesting News.")
    q = """
    <data xmlns="http://converis/ns/webservice">
        <query>
            <filter for="News" xmlns="http://converis/ns/filterengine" xmlns:sort="http://converis/ns/sortingengine">
            </filter>
        </query>
    </data>
    """
    g = Graph()
    done = 0
    for news in client.filter_query(q):
        g += client.to_graph(news, models.News)
        done += 1
        if (sample is True) and (done >= 20):
            break
    #print g.serialize(format='n3')
    backend.sync_updates("http://localhost/data/news", g)
def pub_harvest():
    q = """
    <data xmlns="http://converis/ns/webservice">
        <query>
            <filter for="Publication" xmlns="http://converis/ns/filterengine" xmlns:ns2="http://converis/ns/sortingengine">
                <and>
                    <and>
                        <relation direction="lefttoright" name="PUBL_has_CARD">
                            <relation direction="righttoleft" name="PERS_has_CARD">
                                <attribute argument="6019159" name="fhPersonType" operator="equals"/>
                            </relation>
                        </relation>
                    </and>
                </and>
            </filter>
        </query>
    </data>
    """
    g = Graph()
    for item in client.filter_query(q):
        g += client.to_graph(item, models.Publication)
    ng = "http://localhost/data/publications"
    backend.sync_updates(ng, g)
def related_videos():
    """
    Get videos related to people with positions in this org.
    """
    q = models.rq_prefixes + """
    CONSTRUCT {
        ?org fhd:video ?video .
    }
    WHERE {
        ?p a foaf:Person ;
            fhd:video ?video .
        ?p vivo:relatedBy ?position .
        ?position a vivo:Position ;
            vivo:relates ?p, ?org .
        ?org a fhd:Organization .
    }
    """
    vstore = models.get_store()
    try:
        g = vstore.query(q)
        logger.info("Found {} org videos".format(len(g)))
    except ResultException:
        # No results; sync an empty graph so stale video data is cleared.
        g = Graph()
    backend.sync_updates(VNG, g)
def sync_updates(self):
    logger.info("Syncing updates with {} triples.".format(len(self.graph)))
    backend.sync_updates(self.named_graph, self.graph)
def single_thread_harvest():
    g = Graph()
    for item in client.filter_query(internal_orgs_query):
        g += client.to_graph(item, models.Organization)
    backend.sync_updates(NG, g)
def sync_updates(self, named_graph):
    if named_graph is None:
        raise ValueError("No named graph provided")
    logger.info("Syncing updates with {} triples.".format(len(self.graph)))
    backend.sync_updates(named_graph, self.graph)
def single_thread_harvest():
    g = Graph()
    for item in client.filter_query(query):
        g += client.to_graph(item, models.EducationTraining)
    backend.sync_updates(named_graph, g)
def harvest_people(sample=False):
    logger.info("Harvesting people.")
    p = get_people(sample=sample)
    #print p.serialize(format='n3')
    backend.sync_updates("http://localhost/data/people", p)