def add_get_gbif_link_entity(raw_uri):
    """Looks up, or creates and saves, the LinkEntity for a GBIF URI.

    Normalizes raw_uri into a canonical GBIF species URI, returns the
    existing LinkEntity if one is stored, otherwise fetches names from
    the GBIF API and saves a new record.
    """
    species_id = get_gbif_species_id_from_uri(raw_uri)
    uri = GBIF_BASE_URI + str(species_id)
    existing = LinkEntity.objects.filter(uri=uri).first()
    if existing:
        # Already in the database; nothing to fetch or save.
        return existing
    api = gbifAPI()
    can_name = api.get_gbif_cannonical_name(species_id)
    # Fall back to the canonical name if no vernacular name is returned.
    vern_name = api.get_gbif_vernacular_name(species_id) or can_name
    print('Saving {} as {}, {}'.format(
        uri,
        can_name,
        vern_name,
    ))
    le = LinkEntity()
    le.uri = uri
    le.label = can_name
    le.alt_label = vern_name
    le.vocab_uri = GBIF_VOCAB_URI
    le.ent_type = 'class'
    le.sort = ''
    le.save()
    return le
def add_missing_containing_regions(project_uuid='0', source_id=SOURCE_ID):
    """Adds missing containing regions that have site counts.

    For each configured region in ADD_REGIONS: loads the region into the
    spatial context hierarchy, ensures a LinkEntity exists for its
    GeoNames URI, and ensures a skos:closeMatch LinkAnnotation links the
    new region to that URI.
    """
    for state, state_uuid, new_region, new_uuid, geonames_uri in ADD_REGIONS:
        # Load the new region into the containment hierarchy.
        load_context_row(
            project_uuid=project_uuid,
            source_id=source_id,
            row={
                'parent_uuid': state_uuid,
                'context_uuid': new_uuid,
                'label': new_region,
                'class_uri': 'oc-gen:cat-region',
            },
        )
        if LinkEntity.objects.filter(uri=geonames_uri).first() is None:
            # No LinkEntity yet for this GeoNames URI; create one.
            ent = LinkEntity()
            ent.uri = geonames_uri
            ent.label = new_region
            ent.alt_label = new_region
            ent.vocab_uri = GeonamesAPI().VOCAB_URI
            ent.ent_type = 'class'
            ent.save()
        la_match = LinkAnnotation.objects.filter(
            subject=new_uuid,
            object_uri=geonames_uri).first()
        if la_match is None:
            # Link the new region to its GeoNames counterpart.
            new_la = LinkAnnotation()
            new_la.subject = new_uuid
            new_la.subject_type = 'subjects'
            new_la.project_uuid = project_uuid
            new_la.source_id = source_id
            new_la.predicate_uri = 'skos:closeMatch'
            new_la.object_uri = geonames_uri
            new_la.creator_uuid = ''
            new_la.save()
def save_entity_labels(self):
    """Saves labels of entities in a vocabulary.

    Iterates rdfs:label triples in self.graph and stores a LinkEntity
    for each subject that is not yet in the database (or always, when
    self.replace_old is set, after first deleting the vocabulary's old
    rows). Returns a list of {'s': uri, 'o': label} dicts, or False if
    no graph / vocabulary URI is configured.
    """
    if self.graph is False or self.vocabulary_uri is False:
        return False
    output = []
    if self.replace_old:
        # Wipe the vocabulary's previous entries before re-saving.
        LinkEntity.objects.filter(vocab_uri=self.vocabulary_uri).delete()
    for s, p, o in self.graph.triples((None, RDFS.label, None)):
        subject_uri = str(s)  # URI of the subject as a string
        label = str(o)  # label from the object as a string
        already_stored = LinkEntity.objects.filter(uri=subject_uri)[:1]
        if len(already_stored) < 1 or self.replace_old:
            newr = LinkEntity()
            newr.uri = subject_uri
            newr.label = label
            newr.alt_label = label
            newr.ent_type = 'class'
            newr.vocab_uri = self.vocabulary_uri
            newr.save()
        output.append({'s': subject_uri, 'o': label})
    return output
def link_sites_from_filecache(self):
    """Updates Open Context to save new sites and annotations from the file cache.

    Reads cached trinomial reference JSON, ensures a LinkEntity exists for
    each referenced record URI, then links matching site Manifest items to
    that URI with LinkAnnotations.

    Fix: the original called la.save() unconditionally and then AGAIN inside
    a try/except — the first, unguarded save could raise (e.g., on a
    duplicate annotation) and abort the whole run. Each annotation is now
    saved exactly once, inside the guard; the bare excepts are narrowed to
    Exception.
    """
    if self.filecache is None:
        return
    # print('Cache update !: ' + self.cache_filekey)
    self.filecache.working_dir = self.working_dir
    json_obj = self.filecache.get_dict_from_file(self.cache_filekey)
    if not isinstance(json_obj, dict):
        return
    if 'trinomial_refs' not in json_obj:
        return
    for tri_ref in json_obj['trinomial_refs']:
        uri = tri_ref['rec_uri']
        title = tri_ref['title']
        if len(title) > 194:
            # truncate very long titles to fit the label field
            title = title[0:190] + '... '
        l_exists = LinkEntity.objects.filter(uri=uri)[:1]
        if len(l_exists) < 1:
            l_ent = LinkEntity()
            l_ent.uri = uri
            l_ent.label = title
            l_ent.alt_label = title
            l_ent.vocab_uri = tri_ref['source_uri']
            l_ent.ent_type = 'class'
            l_ent.save()
        for trinomial in tri_ref['trinomials']:
            man_objs = Manifest.objects.filter(
                label=trinomial,
                class_uri='oc-gen:cat-site')
            if len(man_objs) < 1:
                continue
            man_obj = man_objs[0]
            la = LinkAnnotation()
            la.subject = uri  # the subordinate is the subject
            la.subject_type = 'uri'
            la.project_uuid = man_obj.project_uuid
            la.source_id = self.source_id
            la.predicate_uri = "skos:broader"
            la.object_uri = tri_ref['source_uri']
            try:
                la.save()
            except Exception:
                # best-effort: annotation may already exist
                pass
            links = LinkAnnotation.objects\
                .filter(subject=man_obj.uuid,
                        object_uri=uri)[:1]
            if len(links) < 1:
                print('Link ' + man_obj.label + ' (' + man_obj.uuid + ') to ' + uri)
                la = LinkAnnotation()
                la.subject = man_obj.uuid  # the subordinate is the subject
                la.subject_type = man_obj.item_type
                la.project_uuid = man_obj.project_uuid
                la.source_id = self.source_id
                la.predicate_uri = 'dc-terms:isReferencedBy'
                la.object_uri = uri
                try:
                    la.save()
                except Exception:
                    # best-effort: annotation may already exist
                    pass
def save_csv_from_filecache(self):
    """Saves a CSV export of cached trinomial reference data.

    Writes one row per (trinomial, reference) pair, and ensures a
    LinkEntity exists for each referenced record URI.

    Fixes:
    (1) The existence check sliced the queryset with [:0], which is ALWAYS
        empty, so a duplicate LinkEntity was created on every run; [:1]
        makes the check meaningful.
    (2) `f.closed` at the end only read an attribute — the file was never
        closed; it is now closed via try/finally.
    Also replaces the manual county-prefix scan with a direct dict lookup
    (same result: matching name or None).
    """
    if self.filecache is None:
        return
    # print('Cache update !: ' + self.cache_filekey)
    self.filecache.working_dir = self.working_dir
    json_obj = self.filecache.get_dict_from_file(self.cache_filekey)
    filename = self.cache_filekey + '.csv'
    directory = self.filecache.prep_directory(self.working_dir)
    dir_filename = os.path.join(directory, filename)
    if not isinstance(json_obj, dict):
        return
    if 'trinomial_refs' not in json_obj:
        return
    field_name_row = [
        'County Code',
        'County Name',
        'Trinomial',
        'Citation',
        'URI',
        'Title',
        'Note'
    ]
    f = codecs.open(dir_filename, 'w', encoding='utf-8')
    try:
        writer = csv.writer(f, dialect=csv.excel, quoting=csv.QUOTE_ALL)
        writer.writerow(field_name_row)
        for tri_ref in json_obj['trinomial_refs']:
            citation = tri_ref['citation_html']
            uri = tri_ref['rec_uri']
            title = tri_ref['title']
            if len(title) > 194:
                # truncate very long titles to fit the label field
                title = title[0:190] + '... '
            # [:1], not [:0]: a zero-length slice is always empty and
            # made this check always "create"
            l_exists = LinkEntity.objects.filter(uri=uri)[:1]
            if len(l_exists) < 1:
                l_ent = LinkEntity()
                l_ent.uri = uri
                l_ent.label = title
                l_ent.alt_label = title
                l_ent.vocab_uri = tri_ref['source_uri']
                l_ent.ent_type = 'class'
                l_ent.save()
            note = tri_ref.get('note', '')
            for trinomial in tri_ref['trinomials']:
                county_code = trinomial[0:4]
                # dict lookup replaces the manual scan over items()
                act_county_name = self.COUNTY_PREFIXES.get(county_code)
                print('County code: ' + county_code + ' is ' + str(act_county_name))
                row = [
                    county_code,
                    act_county_name,
                    trinomial,
                    citation,
                    uri,
                    title,
                    note
                ]
                writer.writerow(row)
        print('Done!')
    finally:
        f.close()  # the original's `f.closed` never actually closed the file
def find_related_geonames(self, username='******'):
    """ Adds geonames spatial data for items with geonames annotations

    For every region Manifest item in project '0', builds a keyword
    query from the item's context path, searches the GeoNames admin
    entity API, and — when a hit comes back — ensures a LinkEntity for
    the GeoNames URI exists and that a skos:closeMatch LinkAnnotation
    links the item to it.

    :param username: GeoNames API account name passed to the search call
    """
    man_objs = Manifest.objects\
        .filter(project_uuid='0',
                class_uri='oc-gen:cat-region',
                item_type='subjects')
    for man_obj in man_objs:
        print('Checking slug: ' + man_obj.slug)
        subj_obj = Subject.objects.get(uuid=man_obj.uuid)
        context = subj_obj.context
        if '/' in context:
            # admin level is derived from context path depth
            cont_ex = context.split('/')
            admin_level = len(cont_ex) - 1
            if admin_level < 0:
                admin_level = 0
        else:
            admin_level = 0
        # turn the slash-delimited path into a space-separated query
        q_str = context.replace('/', ' ')
        geo_api = GeonamesAPI()
        json_r = geo_api.search_admin_entity(q_str, admin_level, username)
        if isinstance(json_r, dict):
            # we found a result from GeoNames!
            print('Geonames result found.')
            if 'geonames' in json_r:
                if len(json_r['geonames']) > 0:
                    # we've got a result; use the first (presumably best) hit
                    geo_id = json_r['geonames'][0]['geonameId']
                    label = json_r['geonames'][0]['name']
                    alt_label = json_r['geonames'][0]['toponymName']
                    geonames_uri = 'http://www.geonames.org/' + str(geo_id)
                    l_ents = LinkEntity.objects\
                        .filter(uri=geonames_uri)[:1]
                    if len(l_ents) < 1:
                        # we need to create this entity
                        ent = LinkEntity()
                        ent.uri = geonames_uri
                        ent.label = label
                        ent.alt_label = alt_label
                        ent.vocab_uri = GeonamesAPI().VOCAB_URI
                        ent.ent_type = 'class'
                        ent.save()
                    print(geonames_uri)
                    annos = LinkAnnotation.objects\
                        .filter(subject=man_obj.uuid,
                                object_uri=geonames_uri)[:1]
                    if len(annos) < 1:
                        # we need to add the annotation linking this item
                        print('Adding new annotation!')
                        new_la = LinkAnnotation()
                        new_la.subject = man_obj.uuid
                        new_la.subject_type = man_obj.item_type
                        new_la.project_uuid = man_obj.project_uuid
                        new_la.source_id = man_obj.source_id
                        new_la.predicate_uri = 'skos:closeMatch'
                        new_la.object_uri = geonames_uri
                        new_la.creator_uuid = ''
                        new_la.save()
                    else:
                        print('Relation already known.')
def link_sites_from_filecache(self):
    """Updates Open Context to save new sites and annotations from the file cache.

    Reads cached trinomial reference JSON, ensures a LinkEntity exists for
    each referenced record URI, then links matching site Manifest items to
    that URI with LinkAnnotations.

    Fix: the original called la.save() unconditionally and then AGAIN inside
    a try/except — the first, unguarded save could raise (e.g., on a
    duplicate annotation) and abort the whole run. Each annotation is now
    saved exactly once, inside the guard; the bare excepts are narrowed to
    Exception.
    """
    if self.filecache is None:
        return
    # print('Cache update !: ' + self.cache_filekey)
    self.filecache.working_dir = self.working_dir
    json_obj = self.filecache.get_dict_from_file(self.cache_filekey)
    if not isinstance(json_obj, dict):
        return
    if 'trinomial_refs' not in json_obj:
        return
    for tri_ref in json_obj['trinomial_refs']:
        uri = tri_ref['rec_uri']
        title = tri_ref['title']
        if len(title) > 194:
            # truncate very long titles to fit the label field
            title = title[0:190] + '... '
        l_exists = LinkEntity.objects.filter(uri=uri)[:1]
        if len(l_exists) < 1:
            l_ent = LinkEntity()
            l_ent.uri = uri
            l_ent.label = title
            l_ent.alt_label = title
            l_ent.vocab_uri = tri_ref['source_uri']
            l_ent.ent_type = 'class'
            l_ent.save()
        for trinomial in tri_ref['trinomials']:
            man_objs = Manifest.objects.filter(
                label=trinomial,
                class_uri='oc-gen:cat-site')
            if len(man_objs) < 1:
                continue
            man_obj = man_objs[0]
            la = LinkAnnotation()
            la.subject = uri  # the subordinate is the subject
            la.subject_type = 'uri'
            la.project_uuid = man_obj.project_uuid
            la.source_id = self.source_id
            la.predicate_uri = "skos:broader"
            la.object_uri = tri_ref['source_uri']
            try:
                la.save()
            except Exception:
                # best-effort: annotation may already exist
                pass
            links = LinkAnnotation.objects\
                .filter(subject=man_obj.uuid,
                        object_uri=uri)[:1]
            if len(links) < 1:
                print('Link ' + man_obj.label + ' (' + man_obj.uuid + ') to ' + uri)
                la = LinkAnnotation()
                la.subject = man_obj.uuid  # the subordinate is the subject
                la.subject_type = man_obj.item_type
                la.project_uuid = man_obj.project_uuid
                la.source_id = self.source_id
                la.predicate_uri = 'dc-terms:isReferencedBy'
                la.object_uri = uri
                try:
                    la.save()
                except Exception:
                    # best-effort: annotation may already exist
                    pass
def make_dinaa_link_assertions(self):
    """ makes assertions to relate DINAA URIs with federal
        registry documents

    Walks every cached Federal Register keyword search; for each search
    result whose document number appears in the cached DINAA match list,
    ensures a LinkEntity exists for the document's HTML URL and saves a
    'referenced by' LinkAnnotation on the matched Manifest item.
    """
    self.make_fed_reg_vocab_entity()  # ensure the Federal Registry vocabulary entity exists
    fed_api = FederalRegistryAPI()
    search_key_list = fed_api.get_list_cached_keyword_searches()
    dinaa_matches = fed_api.get_dict_from_file(self.dinaa_matches_key)
    for s_key in search_key_list:
        s_json = fed_api.get_dict_from_file(s_key)
        if 'results' in s_json:
            for match in dinaa_matches:
                for s_result in s_json['results']:
                    if s_result['document_number'] == match['doc']:
                        print('Found match for ' + match['doc'])
                        man_obj = False
                        try:
                            man_obj = Manifest.objects.get(
                                uuid=match['uuid'])
                        except Manifest.DoesNotExist:
                            man_obj = False
                        if man_obj is not False:
                            fed_uri = s_result['html_url']
                            le_check = False
                            try:
                                le_check = LinkEntity.objects.get(
                                    uri=fed_uri)
                            except LinkEntity.DoesNotExist:
                                le_check = False
                            if le_check is False:
                                print('Saving entity: ' + s_result['title'])
                                # truncate long titles for the label field
                                title = s_result['title']
                                if len(title) > 175:
                                    title = title[0:175] + '...'
                                le = LinkEntity()
                                le.uri = fed_uri
                                le.label = title
                                le.alt_label = s_result['document_number']
                                le.vocab_uri = self.FEDERAL_REG_URI
                                le.ent_type = 'instance'
                                le.slug = 'fed-reg-docs-' + s_result[
                                    'document_number']
                                le.save()
                            # Now save the link annotation
                            print('Adding ref link to ' + man_obj.label)
                            la = LinkAnnotation()
                            la.subject = man_obj.uuid
                            la.subject_type = man_obj.item_type
                            la.project_uuid = man_obj.project_uuid
                            la.source_id = self.source_id
                            la.predicate_uri = self.DC_TERMS_REF_BY
                            la.object_uri = fed_uri
                            try:
                                la.save()
                            except:
                                # best-effort: annotation may already exist
                                pass
def prepare(self):
    """ checks to make sure the referrer actually exists in the database

    If self.uri dereferences to a known Entity, copies its labels and
    vocabulary onto self and marks self.valid. Otherwise, when enough
    data (vocab_uri, vocab_label, label) was supplied, creates the
    missing vocabulary LinkEntity (if needed) and then a LinkEntity for
    the referrer itself.
    """
    if self.uri is not False:
        ent = Entity()
        found = ent.dereference(self.uri)
        if found:
            # known referrer: copy its stored metadata onto self
            self.label = ent.label
            self.alt_label = ent.alt_label
            self.vocab_label = ent.vocabulary
            self.vocab_uri = ent.vocab_uri
            self.valid = True
        else:
            # the referring source is not known in the database
            if self.vocab_uri is not False\
               and self.vocab_label is not False\
               and self.label is not False:
                # we have enough data to save a referrer in the database
                # a referrer whose URI equals its vocab URI IS a vocabulary
                referrer_ent_type = 'vocabulary'
                if self.vocab_uri != self.uri:
                    referrer_ent_type = 'class'
                ent_v = Entity()
                found_v = ent_v.dereference(self.vocab_uri)
                if found_v is False:
                    # the referring vocabulary is not known in the database
                    # so we need to create it
                    lev = LinkEntity()
                    lev.uri = self.vocab_uri
                    lev.label = self.vocab_label
                    lev.alt_label = self.vocab_alt_label
                    lev.vocab_uri = self.vocab_uri
                    lev.ent_type = 'vocabulary'
                    lev.save()
                # now are ready to make a linked entity for the referrer
                le = LinkEntity()
                le.uri = self.uri
                le.label = self.label
                le.alt_label = self.alt_label
                le.vocab_uri = self.vocab_uri
                le.ent_type = referrer_ent_type
                le.save()
                self.valid = True
def make_dinaa_link_assertions(self):
    """ makes assertions to relate DINAA URIs with federal
        registry documents

    Walks every cached Federal Register keyword search; for each search
    result whose document number appears in the cached DINAA match list,
    ensures a LinkEntity exists for the document's HTML URL and saves a
    'referenced by' LinkAnnotation on the matched Manifest item.
    """
    self.make_fed_reg_vocab_entity()  # ensure the Federal Registry vocabulary entity exists
    fed_api = FederalRegistryAPI()
    search_key_list = fed_api.get_list_cached_keyword_searches()
    dinaa_matches = fed_api.get_dict_from_file(self.dinaa_matches_key)
    for s_key in search_key_list:
        s_json = fed_api.get_dict_from_file(s_key)
        if 'results' in s_json:
            for match in dinaa_matches:
                for s_result in s_json['results']:
                    if s_result['document_number'] == match['doc']:
                        print('Found match for ' + match['doc'])
                        man_obj = False
                        try:
                            man_obj = Manifest.objects.get(uuid=match['uuid'])
                        except Manifest.DoesNotExist:
                            man_obj = False
                        if man_obj is not False:
                            fed_uri = s_result['html_url']
                            le_check = False
                            try:
                                le_check = LinkEntity.objects.get(uri=fed_uri)
                            except LinkEntity.DoesNotExist:
                                le_check = False
                            if le_check is False:
                                print('Saving entity: ' + s_result['title'])
                                # truncate long titles for the label field
                                title = s_result['title']
                                if len(title) > 175:
                                    title = title[0:175] + '...'
                                le = LinkEntity()
                                le.uri = fed_uri
                                le.label = title
                                le.alt_label = s_result['document_number']
                                le.vocab_uri = self.FEDERAL_REG_URI
                                le.ent_type = 'instance'
                                le.slug = 'fed-reg-docs-' + s_result['document_number']
                                le.save()
                            # Now save the link annotation
                            print('Adding ref link to ' + man_obj.label)
                            la = LinkAnnotation()
                            la.subject = man_obj.uuid
                            la.subject_type = man_obj.item_type
                            la.project_uuid = man_obj.project_uuid
                            la.source_id = self.source_id
                            la.predicate_uri = self.DC_TERMS_REF_BY
                            la.object_uri = fed_uri
                            try:
                                la.save()
                            except:
                                # best-effort: annotation may already exist
                                pass
def check_add_periodo_vocab(self):
    """Adds the PeriodO vocabulary entity if it doesn't exist yet."""
    existing = LinkEntity.objects.filter(uri=self.PERIODO_VOCAB_URI)[:1]
    if len(existing) >= 1:
        # vocabulary record already stored; nothing to do
        return
    le = LinkEntity()
    le.uri = self.PERIODO_VOCAB_URI
    le.label = 'PeriodO'
    le.alt_label = 'PeriodO (http://perio.do)'
    le.vocab_uri = self.PERIODO_VOCAB_URI
    le.ent_type = 'vocabulary'
    le.save()
def validate_fix_eol_objects(self):
    """Fixes EOL object URIs and backfills missing LinkEntity records.

    Searches for EOL links in the LinkAnnotations table, strips cruft
    from badly formed URIs (deleting the old annotation row and saving a
    cleaned copy), then calls the EOL API to get labels for URIs with no
    record in the LinkEntity table.

    Fix: removed the unused duplicate LinkEntityGeneration instance
    (`leg`) and hoisted the remaining instance out of the loop, since it
    is reused unchanged on every iteration.
    """
    checked_uris = []  # only check a given URI once
    le_gen = LinkEntityGeneration()
    eol_las = LinkAnnotation.objects\
        .filter(object_uri__icontains=self.EOL_URI_PREFIX)
    for eol_la in eol_las:
        # strip off any cruft in the URI
        eol_uri = le_gen.make_clean_uri(eol_la.object_uri)
        if eol_uri != eol_la.object_uri:
            print('Has cruft: ' + str(eol_la.object_uri))
            # delete the old row, then save the cleaned URI
            LinkAnnotation.objects\
                .filter(hash_id=eol_la.hash_id)\
                .delete()
            eol_la.object_uri = eol_uri
            eol_la.save()
        if eol_uri in checked_uris:
            continue
        checked_uris.append(eol_uri)
        try:
            le = LinkEntity.objects.get(uri=eol_uri)
        except LinkEntity.DoesNotExist:
            le = False
        if le is not False:
            continue
        print('Getting missing data for: ' + eol_uri)
        label = False
        eol_api = eolAPI()
        eol_data = eol_api.get_basic_json_for_eol_uri(eol_uri)
        if isinstance(eol_data, dict):
            print('Reading data...')
            if 'scientificName' in eol_data:
                label = eol_data['scientificName']
        else:
            print('Failed to read data: ' + str(eol_data))
        if label is not False:
            print('Saving data for: ' + str(label) + ' (' + eol_uri + ')')
            le = LinkEntity()
            le.uri = eol_uri
            le.label = label
            le.alt_label = label
            le.ent_type = 'class'
            le.vocab_uri = self.EOL_VOCAB_URI
            le.save()
def check_add_period_pred(self):
    """ Adds the Dublin Core Terms 'temporal' coverage predicate
        (http://purl.org/dc/terms/temporal) as a LinkEntity if it
        doesn't exist yet.

    NOTE(review): the original docstring said 'Adds the periodo
    vocabulary', which describes a different method; corrected here.
    """
    temporal_pred = 'http://purl.org/dc/terms/temporal'
    # only need one row to test existence
    lev = LinkEntity.objects.filter(uri=temporal_pred)[:1]
    if len(lev) < 1:
        le = LinkEntity()
        le.uri = temporal_pred
        le.label = 'Temporal Coverage'
        le.alt_label = 'Temporal Coverage'
        le.vocab_uri = 'http://purl.org/dc/terms'
        le.ent_type = 'property'
        le.save()
def make_fed_reg_vocab_entity(self):
    """Makes (if missing) the vocabulary LinkEntity for the Federal Registry."""
    try:
        existing = LinkEntity.objects.get(uri=self.FEDERAL_REG_URI)
    except LinkEntity.DoesNotExist:
        existing = False
    if existing is not False:
        # already stored; nothing to create
        return
    le = LinkEntity()
    le.uri = self.FEDERAL_REG_URI
    le.label = self.FEDERAL_REG_LABEL
    le.alt_label = self.FEDERAL_REG_LABEL
    le.vocab_uri = self.FEDERAL_REG_URI
    le.ent_type = 'vocabulary'
    le.slug = 'fed-reg'
    le.save()
def validate_fix_eol_objects(self):
    """Fixes EOL object URIs and backfills missing LinkEntity records.

    Searches for EOL links in the LinkAnnotations table, strips cruft
    from badly formed URIs (deleting the old annotation row and saving a
    cleaned copy), then calls the EOL API to get labels for URIs with no
    record in the LinkEntity table.

    Fix: removed the unused duplicate LinkEntityGeneration instance
    (`leg`) and hoisted the remaining instance out of the loop, since it
    is reused unchanged on every iteration.
    """
    checked_uris = []  # only check a given URI once
    le_gen = LinkEntityGeneration()
    eol_las = LinkAnnotation.objects.filter(
        object_uri__icontains=self.EOL_URI_PREFIX)
    for eol_la in eol_las:
        # strip off any cruft in the URI
        eol_uri = le_gen.make_clean_uri(eol_la.object_uri)
        if eol_uri != eol_la.object_uri:
            print("Has cruft: " + str(eol_la.object_uri))
            # delete the old row, then save the cleaned URI
            LinkAnnotation.objects.filter(hash_id=eol_la.hash_id).delete()
            eol_la.object_uri = eol_uri
            eol_la.save()
        if eol_uri in checked_uris:
            continue
        checked_uris.append(eol_uri)
        try:
            le = LinkEntity.objects.get(uri=eol_uri)
        except LinkEntity.DoesNotExist:
            le = False
        if le is not False:
            continue
        print("Getting missing data for: " + eol_uri)
        label = False
        eol_api = eolAPI()
        eol_data = eol_api.get_basic_json_for_eol_uri(eol_uri)
        if isinstance(eol_data, dict):
            print("Reading data...")
            if "scientificName" in eol_data:
                label = eol_data["scientificName"]
        else:
            print("Failed to read data: " + str(eol_data))
        if label is not False:
            print("Saving data for: " + str(label) + " (" + eol_uri + ")")
            le = LinkEntity()
            le.uri = eol_uri
            le.label = label
            le.alt_label = label
            le.ent_type = "class"
            le.vocab_uri = self.EOL_VOCAB_URI
            le.save()
def validate_fix_uberon_objects(self):
    """ Searches for UBERON links in the LinkAnnotations table,
        then fixes badly URIs with cruft. Also calls the UBERON API
        to get labels for URIs with no record in the LinkEntity table.

    Cleaned annotations are deleted and re-saved under the clean URI;
    missing LinkEntity rows are created from the label fetched from the
    UBERON graph service.
    """
    checked_uris = []  # only check on a given URI once
    uberon_las = LinkAnnotation.objects\
        .filter(object_uri__icontains=self.UBERON_URI_PREFIX)
    for uberon_la in uberon_las:
        uberon_uri = uberon_la.object_uri
        le_gen = LinkEntityGeneration()
        uberon_uri = le_gen.make_clean_uri(
            uberon_uri)  # strip off any cruft in the URI
        if uberon_uri != uberon_la.object_uri:
            print('Has cruft: ' + str(uberon_la.object_uri))
            LinkAnnotation.objects\
                .filter(hash_id=uberon_la.hash_id)\
                .delete()  # delete the old
            uberon_la.object_uri = uberon_uri
            uberon_la.save()  # save the cleaned URI
        if uberon_uri not in checked_uris:
            # only check on a given URI once
            checked_uris.append(uberon_uri)
            try:
                le = LinkEntity.objects.get(uri=uberon_uri)
            except LinkEntity.DoesNotExist:
                le = False
            if le is False:
                # no LinkEntity record yet; fetch a label from UBERON
                print('Getting missing data for: ' + uberon_uri)
                u_api = uberonAPI()
                label = u_api.get_uri_label_from_graph(uberon_uri)
                if label is False:
                    print('Failed to read data for : ' + str(uberon_uri))
                else:
                    print('Saving data for: ' + str(label) + ' (' + uberon_uri + ')')
                    le = LinkEntity()
                    le.uri = uberon_uri
                    le.label = label
                    le.alt_label = label
                    le.ent_type = 'class'
                    le.vocab_uri = self.UBERON_VOCAB_URI
                    le.save()
def check_add_period(self, p_ref):
    """ Checks to see if a period is in the database, adds it if needed.

    NOTE(review): the original docstring said 'period collection'; this
    method saves an individual period ('class' entity), not a collection.
    The period's URI is remembered in self.db_uris so repeated calls skip
    the database check.
    """
    if not p_ref['period-meta']['uri'] in self.db_uris:
        # not in memory for being in the database
        lev = LinkEntity.objects.filter(
            uri=p_ref['period-meta']['uri'])[:1]
        if len(lev) < 1 or self.update_period:
            le = LinkEntity()
            le.uri = p_ref['period-meta']['uri']
            le.label = p_ref['period-meta']['label-range']
            le.alt_label = p_ref['period-meta']['label']
            le.vocab_uri = p_ref['collection']['uri']
            le.ent_type = 'class'
            le.save()
        self.db_uris.append(p_ref['period-meta']['uri'])
def check_add_period_collection(self, p_ref):
    """Checks to see if a period collection is in the database; adds it if needed.

    The collection URI is remembered in self.db_uris so repeated calls
    skip the database lookup.
    """
    coll_uri = p_ref['collection']['uri']
    if coll_uri in self.db_uris:
        return
    # not in memory for being in the database
    db_hit = LinkEntity.objects.filter(uri=coll_uri)[:1]
    if len(db_hit) < 1:
        le = LinkEntity()
        le.uri = coll_uri
        le.label = 'PeriodO Collection: ' + p_ref['collection']['label']
        le.alt_label = 'PeriodO (http://perio.do): ' + p_ref['collection']['label']
        le.vocab_uri = self.PERIODO_VOCAB_URI
        le.ent_type = 'vocabulary'
        le.save()
    self.db_uris.append(coll_uri)
def check_add_period_collection(self, p_ref):
    """Checks to see if a period collection is in the database; adds it if needed.

    Builds the collection URI from the PeriodO id, creates (or, when
    self.update_period is set, re-saves) its vocabulary LinkEntity, and
    remembers the URI in self.db_uris.
    """
    if not isinstance(p_ref, dict):
        return
    uri = PeriodoAPI.URI_PREFIX + p_ref['id']
    if uri in self.db_uris:
        return
    # not in memory for being in the database
    db_hit = LinkEntity.objects.filter(uri=uri)[:1]
    if len(db_hit) < 1 or self.update_period:
        le = LinkEntity()
        le.uri = uri
        le.label = 'PeriodO Collection: ' + p_ref['source']['title']
        le.alt_label = 'PeriodO (http://perio.do): ' + p_ref['source']['title']
        le.vocab_uri = self.PERIODO_VOCAB_URI
        le.ent_type = 'vocabulary'
        le.save()
    self.db_uris.append(uri)
def check_add_period(self, p_ref, vocab_uri):
    """ Checks to see if a period is in the database, adds it if needed.

    NOTE(review): the original docstring said 'period collection'; this
    method saves an individual period ('class' entity) under the given
    vocab_uri. The period's URI is remembered in self.db_uris so
    repeated calls skip the database check.
    """
    if isinstance(p_ref, dict):
        uri = PeriodoAPI.URI_PREFIX + p_ref['id']
        if not uri in self.db_uris:
            # not in memory for being in the database
            lev = LinkEntity.objects.filter(uri=uri)[:1]
            if len(lev) < 1 or self.update_period:
                le = LinkEntity()
                le.uri = uri
                le.label = p_ref['label']
                le.alt_label = p_ref['alt_label']
                le.vocab_uri = vocab_uri
                le.ent_type = 'class'
                le.save()
            self.db_uris.append(uri)
def validate_fix_uberon_objects(self):
    """ Searches for UBERON links in the LinkAnnotations table,
        then fixes badly URIs with cruft. Also calls the UBERON API
        to get labels for URIs with no record in the LinkEntity table.

    Cleaned annotations are deleted and re-saved under the clean URI;
    missing LinkEntity rows are created from the label fetched from the
    UBERON graph service.
    """
    checked_uris = []  # only check on a given URI once
    uberon_las = LinkAnnotation.objects\
        .filter(object_uri__icontains=self.UBERON_URI_PREFIX)
    for uberon_la in uberon_las:
        uberon_uri = uberon_la.object_uri
        le_gen = LinkEntityGeneration()
        uberon_uri = le_gen.make_clean_uri(uberon_uri)  # strip off any cruft in the URI
        if uberon_uri != uberon_la.object_uri:
            print('Has cruft: ' + str(uberon_la.object_uri))
            LinkAnnotation.objects\
                .filter(hash_id=uberon_la.hash_id)\
                .delete()  # delete the old
            uberon_la.object_uri = uberon_uri
            uberon_la.save()  # save the cleaned URI
        if uberon_uri not in checked_uris:
            # only check on a given URI once
            checked_uris.append(uberon_uri)
            try:
                le = LinkEntity.objects.get(uri=uberon_uri)
            except LinkEntity.DoesNotExist:
                le = False
            if le is False:
                # no LinkEntity record yet; fetch a label from UBERON
                print('Getting missing data for: ' + uberon_uri)
                u_api = uberonAPI()
                label = u_api.get_uri_label_from_graph(uberon_uri)
                if label is False:
                    print('Failed to read data for : ' + str(uberon_uri))
                else:
                    print('Saving data for: ' + str(label) + ' (' + uberon_uri + ')')
                    le = LinkEntity()
                    le.uri = uberon_uri
                    le.label = label
                    le.alt_label = label
                    le.ent_type = 'class'
                    le.vocab_uri = self.UBERON_VOCAB_URI
                    le.save()
def save_entity_labels(self):
    """Saves labels of entities in a vocabulary.

    Iterates rdfs:label triples in self.graph and stores a 'type'
    LinkEntity for every subject (first deleting the vocabulary's old
    rows when self.replace_old is set). Returns a list of
    {'s': uri, 'o': label} dicts, or False when no graph / vocabulary
    URI is configured.
    """
    if self.graph is False or self.vocabulary_uri is False:
        return False
    output = []
    if self.replace_old:
        # wipe the vocabulary's previous entries before re-saving
        LinkEntity.objects.filter(
            vocab_uri=self.vocabulary_uri).delete()
    for s, p, o in self.graph.triples((None, RDFS.label, None)):
        subject_uri = str(s)  # URI of the subject as a string
        label = str(o)  # label of the object as a string
        newr = LinkEntity()
        newr.uri = subject_uri
        newr.label = label
        newr.alt_label = label
        newr.ent_type = 'type'
        newr.vocab_uri = self.vocabulary_uri
        newr.save()
        output.append({'s': subject_uri, 'o': label})
    return output
def get_save_entity_label(self, eol_uri):
    """Gets the entity label from the EOL API and saves it as a LinkEntity."""
    eol_api = eolAPI()
    eol_data = eol_api.get_basic_json_for_eol_uri(eol_uri)
    label = False
    if isinstance(eol_data, dict):
        print('Reading data...')
        if 'scientificName' in eol_data:
            label = eol_data['scientificName']
    else:
        print('Failed to read data: ' + str(eol_data))
    if label is False:
        # nothing usable came back from the API
        return
    print('Saving data for: ' + str(label) + ' (' + eol_uri + ')')
    le = LinkEntity()
    le.uri = eol_uri
    le.label = label
    le.alt_label = label
    le.ent_type = 'class'
    le.vocab_uri = self.EOL_VOCAB_URI
    le.save()
def add_update(self, post_data):
    """ Creates or updates a linked data entity

    Reads 'uri', 'label', 'alt_label', 'ent_type' and 'vocab_uri' from
    post_data, validates them, and saves a LinkEntity when uri, label
    and vocab_uri all check out. Stores and returns a response dict
    with the action taken ('add-create' / 'edit-update'), the submitted
    uri and label, an ok flag, and an explanatory note.
    """
    ok = True
    uri = False
    label = False
    vocab_uri = False
    alt_label = False
    ent_type = 'class'
    note = ''
    action = 'attempted creation or update'
    # remember exactly what was submitted for the response payload
    sent_uri = uri
    sent_label = label
    if 'uri' in post_data:
        uri = post_data['uri']
        sent_uri = uri
        if not self.validate_web_uri(uri):
            # must be a full web uri to use
            note += '"' + uri + '" needs to be valid Web URI. '
            uri = False
    if 'label' in post_data:
        label = post_data['label']
        sent_label = label
        alt_label = label  # default for alt-label is label
        if len(label) < 1:
            note += 'The entity label cannot be blank. '
            label = False
    if 'alt_label' in post_data:
        if len(post_data['alt_label']) > 0:
            alt_label = post_data['alt_label']
    if 'ent_type' in post_data:
        ent_type = post_data['ent_type']
    if 'vocab_uri' in post_data:
        vocab_uri = post_data['vocab_uri']
        if not self.validate_web_uri(vocab_uri)\
           and ent_type != 'vocabulary':
            # vocab_uri is not a full uri, so suggest one
            # based on the URI for the request
            vocab_uri = self.suggest_vocabulary(uri)
        elif not self.validate_web_uri(vocab_uri)\
                and ent_type == 'vocabulary':
            # a vocabulary is its own vocabulary
            vocab_uri = uri
        else:
            pass
    if uri is not False \
       and label is not False \
       and vocab_uri is not False:
        le_gen = LinkEntityGeneration()
        uri = le_gen.make_clean_uri(uri)
        if uri != vocab_uri:
            # get the varient of the vocab_uri that's actually in use
            # returns false if a varient can't be found
            vocab_uri = self.check_vocab_uri(vocab_uri)
            if vocab_uri is False:
                # cannot find a varient for this vocabulary uri
                vocab_ok = False
            else:
                vocab_ok = True
        elif ent_type == 'vocabulary':
            vocab_ok = True
        else:
            vocab_ok = False
        if vocab_ok:
            ok = True
            try:
                action = 'edit-update'
                le = LinkEntity.objects.get(uri=uri)
            except LinkEntity.DoesNotExist:
                action = 'add-create'
                le = LinkEntity()
                le.uri = uri
            # now add information to save
            le.label = label
            le.alt_label = alt_label
            le.ent_type = ent_type
            le.vocab_uri = vocab_uri
            le.save()
            uri = le.uri  # in case the URI changed because of validation changes
        else:
            ok = False
            note += 'Must first create a record for the vocabulary. '
    else:
        ok = False
        note += 'Missing data required for this action. '
    self.response = {
        'action': action,
        'uri': sent_uri,
        'label': sent_label,
        'ok': ok,
        'change': {
            'note': note
        }
    }
    return self.response
def check_add_link_entity(self, uri):
    """ checks to see if an entity exists, if not, it adds
        it if we recognize the URI to be part of a known vocabulary

    Dispatches on URI substrings to the matching service (GeoNames,
    UBERON, EOL, Wikipedia, Getty AAT, ANS OCRE) to fetch labels, then
    saves a 'class' LinkEntity when both a label and a vocabulary URI
    were found.
    """
    try:
        act_ent = LinkEntity.objects.get(uri=uri)
    except LinkEntity.DoesNotExist:
        act_ent = False
    if act_ent is False:
        label = False
        alt_label = False
        ent_type = 'class'
        vocab_uri = False
        if '.geonames.org' in uri:
            geo_api = GeonamesAPI()
            vocab_uri = GeonamesAPI().VOCAB_URI
            labels = geo_api.get_labels_for_uri(uri)
            if isinstance(labels, dict):
                # got the label!
                label = labels['label']
                alt_label = labels['alt_label']
        elif 'UBERON' in uri:
            uber_api = uberonAPI()
            vocab_uri = uberonAPI().VOCAB_URI
            label = uber_api.get_uri_label_from_graph(uri)
            if label is not False:
                alt_label = label
        elif 'eol.org' in uri:
            eol_api = eolAPI()
            vocab_uri = eolAPI().VOCAB_URI
            labels = eol_api.get_labels_for_uri(uri)
            if isinstance(labels, dict):
                # got the label!
                label = labels['label']
                alt_label = labels['alt_label']
        elif 'wikipedia.org' in uri:
            # page name in the URI of the article
            link_ex = uri.split('/')
            label = urlunquote(link_ex[-1])
            label = label.replace('_', ' ')  # underscores in Wikipedia titles
            alt_label = label
            vocab_uri = 'http://www.wikipedia.org/'
        elif 'vocab.getty.edu/aat' in uri:
            print('Finding: ' + uri)
            getty_api = gettyAPI()
            vocab_uri = gettyAPI().VOCAB_URI
            labels = getty_api.get_labels_for_uri(uri)
            if isinstance(labels, dict):
                # got the label!
                label = labels['label']
                alt_label = labels['alt_label']
        elif 'numismatics.org/ocre/id/' in uri:
            print('Finding: ' + uri)
            ANSochre = ANSochreAPI()
            vocab_uri = ANSochreAPI().VOCAB_URI
            labels = ANSochre.get_labels_for_uri(uri)
            if isinstance(labels, dict):
                # got the label!
                label = labels['label']
                alt_label = labels['alt_label']
        if label is not False and vocab_uri is not False:
            # ok to make an entity then!
            ent = LinkEntity()
            ent.uri = uri
            ent.label = label
            ent.alt_label = alt_label
            ent.vocab_uri = vocab_uri
            ent.ent_type = ent_type
            ent.save()
def save_csv_from_filecache(self):
    """Saves a CSV export of cached trinomial reference data.

    Writes one row per (trinomial, reference) pair, and ensures a
    LinkEntity exists for each referenced record URI.

    Fixes:
    (1) The existence check sliced the queryset with [:0], which is ALWAYS
        empty, so a duplicate LinkEntity was created on every run; [:1]
        makes the check meaningful.
    (2) `f.closed` at the end only read an attribute — the file was never
        closed; it is now closed via try/finally.
    Also replaces the manual county-prefix scan with a direct dict lookup
    (same result: matching name or None).
    """
    if self.filecache is None:
        return
    # print('Cache update !: ' + self.cache_filekey)
    self.filecache.working_dir = self.working_dir
    json_obj = self.filecache.get_dict_from_file(self.cache_filekey)
    filename = self.cache_filekey + '.csv'
    directory = self.filecache.prep_directory(self.working_dir)
    dir_filename = os.path.join(directory, filename)
    if not isinstance(json_obj, dict):
        return
    if 'trinomial_refs' not in json_obj:
        return
    field_name_row = [
        'County Code',
        'County Name',
        'Trinomial',
        'Citation',
        'URI',
        'Title',
        'Note'
    ]
    f = codecs.open(dir_filename, 'w', encoding='utf-8')
    try:
        writer = csv.writer(f, dialect=csv.excel, quoting=csv.QUOTE_ALL)
        writer.writerow(field_name_row)
        for tri_ref in json_obj['trinomial_refs']:
            citation = tri_ref['citation_html']
            uri = tri_ref['rec_uri']
            title = tri_ref['title']
            if len(title) > 194:
                # truncate very long titles to fit the label field
                title = title[0:190] + '... '
            # [:1], not [:0]: a zero-length slice is always empty and
            # made this check always "create"
            l_exists = LinkEntity.objects.filter(uri=uri)[:1]
            if len(l_exists) < 1:
                l_ent = LinkEntity()
                l_ent.uri = uri
                l_ent.label = title
                l_ent.alt_label = title
                l_ent.vocab_uri = tri_ref['source_uri']
                l_ent.ent_type = 'class'
                l_ent.save()
            note = tri_ref.get('note', '')
            for trinomial in tri_ref['trinomials']:
                county_code = trinomial[0:4]
                # dict lookup replaces the manual scan over items()
                act_county_name = self.COUNTY_PREFIXES.get(county_code)
                print('County code: ' + county_code + ' is ' + str(act_county_name))
                row = [
                    county_code,
                    act_county_name,
                    trinomial,
                    citation,
                    uri,
                    title,
                    note
                ]
                writer.writerow(row)
        print('Done!')
    finally:
        f.close()  # the original's `f.closed` never actually closed the file
def match_trinomial_obj(self, tri):
    """ Attempts to match a trinomial object 'tri'
        against tDAR, if it hasn't yet been matched

    Skips items that already have a dc-terms:subject link into the tDAR
    vocabulary. Otherwise searches the tDAR keyword API (also trying
    zero-padded site-number variants when self.lead_zero_check is set),
    saves a LinkEntity and LinkAnnotation for each confirmed match, and
    backs off / retries on request errors.

    Returns the count of matches found.
    """
    found_matches = 0
    manifest = False
    try:
        manifest = Manifest.objects.get(uuid=tri.uuid)
    except Manifest.DoesNotExist:
        manifest = False
    la_check = LinkAnnotation.objects\
        .filter(subject=tri.uuid,
                predicate_uri='dc-terms:subject',
                object_uri__contains=self.TDAR_VOCAB)[:1]
    if len(la_check) < 1 and manifest is not False:
        # we don't already have a tDAR id for this item, continue with matches
        tri_man = TrinomialManage()
        request_keywords = [tri.trinomial]
        if self.lead_zero_check:
            # check multiple leading zeros
            tri_parts = tri_man.parse_trinomial(tri.trinomial)
            site = tri_parts['site']
            site_part_len = len(site)
            while len(site) < 4:
                # add each progressively zero-padded variant as a keyword
                site = '0' + site
                new_trinomial = tri_parts['state'] + tri_parts[
                    'county'] + site
                request_keywords.append(new_trinomial)
        for keyword in request_keywords:
            tdar_api = tdarAPI()
            results = tdar_api.get_site_keyword(keyword)
            if isinstance(results, list):
                for result in results[:self.max_results]:
                    # assume it is a spurious match
                    match_real = False
                    if result['label'] == tri.trinomial:
                        # the trinomial and the tDAR result exactly match
                        match_real = True
                    else:
                        # check if the only difference is in leading zeros
                        tri_parts = tri_man.parse_trinomial(tri.trinomial)
                        site = tri_parts['site']
                        site_part_len = len(site)
                        while len(site) < 5:
                            site = '0' + site
                            new_trinomial = tri_parts['state'] + tri_parts[
                                'county'] + site
                            if new_trinomial == result['label']:
                                # A good match, the tDAR result and the trinomial
                                # match (but with different leading zeros)
                                match_real = True
                    if match_real:
                        found_matches += 1
                        # OK! Found a match, first save the linked entity in the link entity table
                        le_check = False
                        try:
                            le_check = LinkEntity.objects.get(
                                uri=result['id'])
                        except LinkEntity.DoesNotExist:
                            le_check = False
                        if le_check is False:
                            le = LinkEntity()
                            le.uri = result['id']
                            le.label = result['label']
                            le.alt_label = result['label']
                            le.vocab_uri = self.TDAR_VOCAB
                            le.ent_type = 'type'
                            le.save()
                        # Now save the link annotation
                        la = LinkAnnotation()
                        la.subject = tri.uuid
                        la.subject_type = manifest.item_type
                        la.project_uuid = manifest.project_uuid
                        la.source_id = 'tdar-api-lookup'
                        la.predicate_uri = self.DC_TERMS_SUBJECT
                        la.object_uri = result['id']
                        la.save()
                    else:
                        print('Almost! ' + result['label'] + ' is not exactly: ' + tri.trinomial)
            if tdar_api.request_error:
                # back off on repeated request errors; give up past max_wait
                self.request_error = True
                print('HTTP request to tDAR failed!')
                self.error_wait += self.base_wait
                if self.error_wait > self.max_wait:
                    print('Too many failures, quiting...')
                    sys.exit('Quitting process')
                else:
                    # sleep some minutes before trying again
                    print('Will try again in ' + str(self.error_wait) + ' seconds...')
                    sleep(self.error_wait)
            else:
                self.request_error = False
                if self.error_wait >= self.base_wait:
                    print('HTTP requests resumed OK, will continue.')
                    self.error_wait = 0
    return found_matches
def match_trinomial_obj(self, tri):
    """Attempts to match a trinomial object 'tri' against tDAR,
    if it hasn't yet been matched.

    Looks up the Manifest record for tri.uuid, skips items that already
    carry a 'dc-terms:subject' LinkAnnotation into the tDAR vocabulary,
    then queries the tDAR site-keyword API for the trinomial (and,
    optionally, leading-zero-padded variants). Each accepted result is
    saved as a LinkEntity (if new) plus a LinkAnnotation.

    :param tri: object with .uuid and .trinomial attributes
    :return: int count of matches saved
    """
    found_matches = 0
    manifest = False
    try:
        manifest = Manifest.objects.get(uuid=tri.uuid)
    except Manifest.DoesNotExist:
        manifest = False
    # check for an existing tDAR subject annotation on this item
    la_check = LinkAnnotation.objects\
                             .filter(subject=tri.uuid,
                                     predicate_uri='dc-terms:subject',
                                     object_uri__contains=self.TDAR_VOCAB)[:1]
    if len(la_check) < 1 and manifest is not False:
        # we don't already have a tDAR id for this item, continue with matches
        tri_man = TrinomialManage()
        request_keywords = [tri.trinomial]
        if self.lead_zero_check:
            # check multiple leading zeros: also search zero-padded
            # variants of the site number part of the trinomial
            tri_parts = tri_man.parse_trinomial(tri.trinomial)
            site = tri_parts['site']
            site_part_len = len(site)  # NOTE(review): assigned but never read
            # NOTE(review): pads the site part to 4 digits here, but the
            # per-result comparison below pads to 5 — confirm the
            # difference is intended
            while len(site) < 4:
                site = '0' + site
                new_trinomial = tri_parts['state'] + tri_parts['county'] + site
                request_keywords.append(new_trinomial)
        for keyword in request_keywords:
            tdar_api = tdarAPI()
            results = tdar_api.get_site_keyword(keyword)
            if isinstance(results, list):
                for result in results[:self.max_results]:
                    # assume it is a spurious match
                    match_real = False
                    if result['label'] == tri.trinomial:
                        # the trinomial and the tDAR result exactly match
                        match_real = True
                    else:
                        # check if the only difference is in leading zeros
                        tri_parts = tri_man.parse_trinomial(tri.trinomial)
                        site = tri_parts['site']
                        site_part_len = len(site)
                        while len(site) < 5:
                            site = '0' + site
                            new_trinomial = tri_parts['state'] + tri_parts['county'] + site
                            if new_trinomial == result['label']:
                                # A good match, the tDAR result and the
                                # trinomial match (but with different
                                # leading zeros)
                                match_real = True
                    if match_real:
                        found_matches += 1
                        # OK! Found a match, first save the linked entity
                        # in the link entity table
                        le_check = False
                        try:
                            le_check = LinkEntity.objects.get(uri=result['id'])
                        except LinkEntity.DoesNotExist:
                            le_check = False
                        if le_check is False:
                            le = LinkEntity()
                            le.uri = result['id']
                            le.label = result['label']
                            le.alt_label = result['label']
                            le.vocab_uri = self.TDAR_VOCAB
                            le.ent_type = 'type'
                            le.save()
                        # Now save the link annotation
                        la = LinkAnnotation()
                        la.subject = tri.uuid
                        la.subject_type = manifest.item_type
                        la.project_uuid = manifest.project_uuid
                        la.source_id = 'tdar-api-lookup'
                        la.predicate_uri = self.DC_TERMS_SUBJECT
                        la.object_uri = result['id']
                        la.save()
                    else:
                        print('Almost! ' + result['label'] +
                              ' is not exactly: ' + tri.trinomial)
            if tdar_api.request_error:
                # back off after a failed HTTP request; give up entirely
                # once the accumulated wait exceeds max_wait
                self.request_error = True
                print('HTTP request to tDAR failed!')
                self.error_wait += self.base_wait
                if self.error_wait > self.max_wait:
                    print('Too many failures, quiting...')
                    sys.exit('Quitting process')
                else:
                    # sleep some minutes before trying again
                    print('Will try again in ' + str(self.error_wait) +
                          ' seconds...')
                    sleep(self.error_wait)
            else:
                # request succeeded: clear the error state
                self.request_error = False
                if self.error_wait >= self.base_wait:
                    print('HTTP requests resumed OK, will continue.')
                self.error_wait = 0
    return found_matches
def check_add_link_entity(self, uri):
    """Checks to see if a LinkEntity exists for a URI; if not, adds it
    when the URI belongs to a recognized vocabulary.

    Dispatches on substrings of the URI (Geonames, UBERON, EOL,
    Wikipedia, Getty AAT, ANS OCRE, GBIF) to fetch a label and
    alt_label from the matching API, then saves a new LinkEntity.

    :param uri: URI string to look up or add
    :return: LinkEntity instance, or None when the URI is not
        recognized or no label could be resolved
    """
    ent = LinkEntity.objects.filter(uri=uri).first()
    if ent:
        # We found the linked data entity.
        return ent
    label = None
    alt_label = None
    ent_type = 'class'
    vocab_uri = None
    if '.geonames.org' in uri:
        geo_api = GeonamesAPI()
        # reuse the instance already built instead of constructing a
        # second throwaway API object just to read VOCAB_URI
        vocab_uri = geo_api.VOCAB_URI
        labels = geo_api.get_labels_for_uri(uri)
        if isinstance(labels, dict):
            # got the label!
            label = labels['label']
            alt_label = labels['alt_label']
    elif 'UBERON' in uri:
        uber_api = uberonAPI()
        vocab_uri = uber_api.VOCAB_URI
        # may return False when the label can't be found in the graph
        label = uber_api.get_uri_label_from_graph(uri)
        if label is not False:
            alt_label = label
    elif 'eol.org' in uri:
        eol_api = eolAPI()
        vocab_uri = eol_api.VOCAB_URI
        labels = eol_api.get_labels_for_uri(uri)
        if isinstance(labels, dict):
            # got the label!
            label = labels['label']
            alt_label = labels['alt_label']
    elif 'wikipedia.org' in uri:
        # page name in the URI of the article
        link_ex = uri.split('/')
        label = urlunquote(link_ex[-1])
        label = label.replace('_', ' ')  # underscores in Wikipedia titles
        alt_label = label
        vocab_uri = 'http://www.wikipedia.org/'
    elif 'vocab.getty.edu/aat' in uri:
        print('Finding: ' + uri)
        getty_api = gettyAPI()
        vocab_uri = getty_api.VOCAB_URI
        labels = getty_api.get_labels_for_uri(uri)
        if isinstance(labels, dict):
            # got the label!
            label = labels['label']
            alt_label = labels['alt_label']
    elif 'numismatics.org/ocre/id/' in uri:
        print('Finding: ' + uri)
        ANSochre = ANSochreAPI()
        vocab_uri = ANSochre.VOCAB_URI
        labels = ANSochre.get_labels_for_uri(uri)
        if isinstance(labels, dict):
            # got the label!
            label = labels['label']
            alt_label = labels['alt_label']
    elif 'gbif.org/species/' in uri:
        # This adds the linked entity to the database
        # as well as its hierarchy
        ent = add_get_gbif_link_entity_and_hierarchy(uri)
        return ent
    if not label or not vocab_uri:
        # Something went wrong. Could not
        # add the item
        return None
    # ok to make an entity then!
    ent = LinkEntity()
    ent.uri = uri
    ent.label = label
    ent.alt_label = alt_label
    ent.vocab_uri = vocab_uri
    ent.ent_type = ent_type
    ent.sort = ''
    ent.save()
    return ent
def match_california_site(self, site_uuid):
    """Attempts to match a California site name with a tDAR site
    keyword.

    Generates the item's JSON-LD, collects alternate site / place
    names from its observations, queries the tDAR site-keyword API for
    each name, and saves a LinkEntity (if new) plus a LinkAnnotation
    for each case-insensitive exact match.

    :param site_uuid: UUID of the site item to match
    :return: int count of matches saved
    """
    found_matches = 0
    oc_item = OCitem()
    exists = oc_item.check_exists(site_uuid)
    if exists:
        # check for an existing tDAR subject annotation on this item
        la_check = LinkAnnotation.objects\
                                 .filter(subject=site_uuid,
                                         predicate_uri='dc-terms:subject',
                                         object_uri__contains=self.TDAR_VOCAB)[:1]
        # NOTE(review): 'exists' is always True inside this branch, so
        # re-testing it here is redundant
        if exists and len(la_check) < 1:
            # we don't already have a tDAR id for this item, continue with matches
            # first, generate the item's JSON-LD
            oc_item.generate_json_ld()
            request_keywords = []
            # harvest alternate site / place name strings from the
            # item's observations
            if 'oc-gen:has-obs' in oc_item.json_ld:
                if isinstance(oc_item.json_ld['oc-gen:has-obs'], list):
                    for obs in oc_item.json_ld['oc-gen:has-obs']:
                        if 'oc-pred:52-alternate-site-or-place-name' in obs:
                            if isinstance(obs['oc-pred:52-alternate-site-or-place-name'], list):
                                for name_obj in obs['oc-pred:52-alternate-site-or-place-name']:
                                    if 'xsd:string' in name_obj:
                                        if isinstance(name_obj['xsd:string'], str):
                                            name_str = name_obj['xsd:string']
                                            request_keywords.append(name_str)
            print('Checking names in tDAR: ' + '; '.join(request_keywords))
            for keyword in request_keywords:
                tdar_api = tdarAPI()
                results = tdar_api.get_site_keyword(keyword)
                if isinstance(results, list):
                    for result in results[:self.max_results]:
                        # assume it is a spurious match
                        match_real = False
                        lw_result = result['label'].lower()
                        lw_keyword = keyword.lower()
                        if lw_result == lw_keyword:
                            # the trinomial and the tDAR result exactly match
                            match_real = True
                        if match_real:
                            print('FOUND ' + result['label'])
                            found_matches += 1
                            # OK! Found a match, first save the linked
                            # entity in the link entity table
                            le_check = False
                            try:
                                le_check = LinkEntity.objects.get(uri=result['id'])
                            except LinkEntity.DoesNotExist:
                                le_check = False
                            if le_check is False:
                                le = LinkEntity()
                                le.uri = result['id']
                                le.label = result['label']
                                le.alt_label = result['label']
                                le.vocab_uri = self.TDAR_VOCAB
                                le.ent_type = 'type'
                                le.save()
                            # Now save the link annotation
                            la = LinkAnnotation()
                            la.subject = oc_item.manifest.uuid
                            la.subject_type = oc_item.manifest.item_type
                            la.project_uuid = oc_item.manifest.project_uuid
                            la.source_id = 'tdar-api-lookup'
                            la.predicate_uri = self.DC_TERMS_SUBJECT
                            la.object_uri = result['id']
                            la.save()
                        else:
                            print('Almost! ' + result['label'] +
                                  ' is not exactly: ' + keyword)
                if tdar_api.request_error:
                    # back off after a failed HTTP request; give up
                    # entirely once the accumulated wait exceeds max_wait
                    self.request_error = True
                    print('HTTP request to tDAR failed!')
                    self.error_wait += self.base_wait
                    if self.error_wait > self.max_wait:
                        print('Too many failures, quiting...')
                        sys.exit('Quitting process')
                    else:
                        # sleep some minutes before trying again
                        print('Will try again in ' + str(self.error_wait) +
                              ' seconds...')
                        sleep(self.error_wait)
                else:
                    # request succeeded: clear the error state
                    self.request_error = False
                    if self.error_wait >= self.base_wait:
                        print('HTTP requests resumed OK, will continue.')
                    self.error_wait = 0
    return found_matches
def match_california_site(self, site_uuid):
    """Attempts to match a California site name with a tDAR site
    keyword.

    Generates the item's JSON-LD, collects alternate site / place
    names from its observations, queries the tDAR site-keyword API for
    each name, and saves a LinkEntity (if new) plus a LinkAnnotation
    for each case-insensitive exact match.

    :param site_uuid: UUID of the site item to match
    :return: int count of matches saved
    """
    found_matches = 0
    oc_item = OCitem()
    exists = oc_item.check_exists(site_uuid)
    if exists:
        # check for an existing tDAR subject annotation on this item
        la_check = LinkAnnotation.objects\
                                 .filter(subject=site_uuid,
                                         predicate_uri='dc-terms:subject',
                                         object_uri__contains=self.TDAR_VOCAB)[:1]
        # NOTE(review): 'exists' is always True inside this branch, so
        # re-testing it here is redundant
        if exists and len(la_check) < 1:
            # we don't already have a tDAR id for this item, continue with matches
            # first, generate the item's JSON-LD
            oc_item.generate_json_ld()
            request_keywords = []
            # harvest alternate site / place name strings from the
            # item's observations
            if 'oc-gen:has-obs' in oc_item.json_ld:
                if isinstance(oc_item.json_ld['oc-gen:has-obs'], list):
                    for obs in oc_item.json_ld['oc-gen:has-obs']:
                        if 'oc-pred:52-alternate-site-or-place-name' in obs:
                            if isinstance(
                                obs['oc-pred:52-alternate-site-or-place-name'],
                                    list):
                                for name_obj in obs[
                                        'oc-pred:52-alternate-site-or-place-name']:
                                    if 'xsd:string' in name_obj:
                                        if isinstance(name_obj['xsd:string'], str):
                                            name_str = name_obj['xsd:string']
                                            request_keywords.append(name_str)
            print('Checking names in tDAR: ' + '; '.join(request_keywords))
            for keyword in request_keywords:
                tdar_api = tdarAPI()
                results = tdar_api.get_site_keyword(keyword)
                if isinstance(results, list):
                    for result in results[:self.max_results]:
                        # assume it is a spurious match
                        match_real = False
                        lw_result = result['label'].lower()
                        lw_keyword = keyword.lower()
                        if lw_result == lw_keyword:
                            # the trinomial and the tDAR result exactly match
                            match_real = True
                        if match_real:
                            print('FOUND ' + result['label'])
                            found_matches += 1
                            # OK! Found a match, first save the linked
                            # entity in the link entity table
                            le_check = False
                            try:
                                le_check = LinkEntity.objects.get(
                                    uri=result['id'])
                            except LinkEntity.DoesNotExist:
                                le_check = False
                            if le_check is False:
                                le = LinkEntity()
                                le.uri = result['id']
                                le.label = result['label']
                                le.alt_label = result['label']
                                le.vocab_uri = self.TDAR_VOCAB
                                le.ent_type = 'type'
                                le.save()
                            # Now save the link annotation
                            la = LinkAnnotation()
                            la.subject = oc_item.manifest.uuid
                            la.subject_type = oc_item.manifest.item_type
                            la.project_uuid = oc_item.manifest.project_uuid
                            la.source_id = 'tdar-api-lookup'
                            la.predicate_uri = self.DC_TERMS_SUBJECT
                            la.object_uri = result['id']
                            la.save()
                        else:
                            print('Almost! ' + result['label'] +
                                  ' is not exactly: ' + keyword)
                if tdar_api.request_error:
                    # back off after a failed HTTP request; give up
                    # entirely once the accumulated wait exceeds max_wait
                    self.request_error = True
                    print('HTTP request to tDAR failed!')
                    self.error_wait += self.base_wait
                    if self.error_wait > self.max_wait:
                        print('Too many failures, quiting...')
                        sys.exit('Quitting process')
                    else:
                        # sleep some minutes before trying again
                        print('Will try again in ' + str(self.error_wait) +
                              ' seconds...')
                        sleep(self.error_wait)
                else:
                    # request succeeded: clear the error state
                    self.request_error = False
                    if self.error_wait >= self.base_wait:
                        print('HTTP requests resumed OK, will continue.')
                    self.error_wait = 0
    return found_matches