def make_list_cite_projects(self, json_ld):
    """Build a citation string for the projects cited in json_ld.

    Stores the comma-separated label string on self.cite_projects and a
    list of per-project dicts (uuid, uri, label, count) on
    self.projects_list, then returns the citation string.
    """
    proj_dicts = []
    labels = []
    for source in json_ld.get('dc-terms:source', []):
        labels.append(source['label'])
        # Prefer the canonical rdfs:isDefinedBy URI when present.
        if 'rdfs:isDefinedBy' in source:
            uri = source['rdfs:isDefinedBy']
        else:
            uri = source['id']
        proj_dicts.append({
            'uuid': URImanagement.get_uuid_from_oc_uri(uri, False),
            'uri': uri,
            'label': source['label'],
            # False signals that no count was provided.
            'count': source.get('count', False),
        })
    self.cite_projects = ', '.join(labels)
    self.projects_list = proj_dicts
    return self.cite_projects
def make_list_cite_projects(self, json_ld):
    """ makes a string for citation of projects """
    # Side effects: sets self.cite_projects (comma-joined project
    # labels) and self.projects_list (list of dicts with 'uuid',
    # 'uri', 'label', and 'count'). Returns the citation string.
    projects_list = []
    cite_projects_list = []
    if 'dc-terms:source' in json_ld:
        for item in json_ld['dc-terms:source']:
            cite_projects_list.append(item['label'])
            proj_item = {}
            if 'rdfs:isDefinedBy' in item:
                # prefer the canonical URI when present
                proj_item['uuid'] = URImanagement.get_uuid_from_oc_uri(item['rdfs:isDefinedBy'], False)
                proj_item['uri'] = item['rdfs:isDefinedBy']
            else:
                proj_item['uuid'] = URImanagement.get_uuid_from_oc_uri(item['id'], False)
                proj_item['uri'] = item['id']
            proj_item['label'] = item['label']
            if 'count' in item:
                proj_item['count'] = item['count']
            else:
                # False signals that no count was given
                proj_item['count'] = False
            projects_list.append(proj_item)
    self.cite_projects = ', '.join(cite_projects_list)
    self.projects_list = projects_list
    return self.cite_projects
def get_entity_parents(self, identifier):
    """ Gets parent concepts for a given URI or UUID identified entity """
    # Recursively climbs the hierarchy, appending each parent id to
    # self.parent_entities. Recursion is capped via self.loop_count
    # (max 50 levels) to guard against cyclic annotations.
    self.loop_count += 1
    lequiv = LinkEquivalence()
    identifiers = lequiv.get_identifier_list_variants(identifier)
    p_for_superobjs = LinkAnnotation.PREDS_SBJ_IS_SUB_OF_OBJ
    preds_for_superobjs = lequiv.get_identifier_list_variants(p_for_superobjs)
    p_for_subobjs = LinkAnnotation.PREDS_SBJ_IS_SUPER_OF_OBJ
    preds_for_subobjs = lequiv.get_identifier_list_variants(p_for_subobjs)
    try:
        # look for superior items in the objects of the assertion
        # sorting by sort so we can privelage a certain hierarchy path
        superobjs_anno = LinkAnnotation.objects.filter(subject__in=identifiers,
                                                       predicate_uri__in=preds_for_superobjs)\
                                               .exclude(object_uri__in=identifiers)\
                                               .order_by('sort', 'object_uri')[:1]
        if(len(superobjs_anno) < 1):
            superobjs_anno = False
    except LinkAnnotation.DoesNotExist:
        superobjs_anno = False
    if(superobjs_anno is not False):
        parent_id = superobjs_anno[0].object_uri
        if(parent_id.count('/') > 1):
            # parent_id looks like a URI; reduce it to an Open
            # Context UUID when possible
            oc_uuid = URImanagement.get_uuid_from_oc_uri(parent_id)
            if(oc_uuid is not False):
                parent_id = oc_uuid
        if(parent_id not in self.parent_entities):
            self.parent_entities.append(parent_id)
        if self.loop_count <= 50:
            # recurse upward for the parent's own parents
            self.parent_entities = self.get_entity_parents(parent_id)
    try:
        """ Now look for superior entities in the subject, not the object sorting by sort so we can privelage a certain hierarchy path """
        supersubj_anno = LinkAnnotation.objects.filter(object_uri__in=identifiers,
                                                       predicate_uri__in=preds_for_subobjs)\
                                               .exclude(subject__in=identifiers)\
                                               .order_by('sort', 'subject')[:1]
        if(len(supersubj_anno) < 1):
            supersubj_anno = False
    except LinkAnnotation.DoesNotExist:
        supersubj_anno = False
    if supersubj_anno is not False:
        parent_id = supersubj_anno[0].subject
        if(parent_id.count('/') > 1):
            oc_uuid = URImanagement.get_uuid_from_oc_uri(parent_id)
            if(oc_uuid is not False):
                parent_id = oc_uuid
        if(parent_id not in self.parent_entities):
            self.parent_entities.append(parent_id)
        if self.loop_count <= 50:
            self.parent_entities = self.get_entity_parents(parent_id)
    return self.parent_entities
def get_entity_parents(self, identifier):
    """ Gets parent concepts for a given URI or UUID identified entity """
    # Like the sorted variant elsewhere in this codebase, but without
    # an order_by clause, so which parent is chosen when several exist
    # is database-dependent. Recursion capped at 50 via loop_count.
    self.loop_count += 1
    lequiv = LinkEquivalence()
    identifiers = lequiv.get_identifier_list_variants(identifier)
    p_for_superobjs = LinkAnnotation.PREDS_SBJ_IS_SUB_OF_OBJ
    preds_for_superobjs = lequiv.get_identifier_list_variants(p_for_superobjs)
    p_for_subobjs = LinkAnnotation.PREDS_SBJ_IS_SUPER_OF_OBJ
    preds_for_subobjs = lequiv.get_identifier_list_variants(p_for_subobjs)
    try:
        # look for superior items in the objects of the assertion
        superobjs_anno = LinkAnnotation.objects.filter(subject__in=identifiers,
                                                       predicate_uri__in=preds_for_superobjs)\
                                               .exclude(object_uri__in=identifiers)[:1]
        if (len(superobjs_anno) < 1):
            superobjs_anno = False
    except LinkAnnotation.DoesNotExist:
        superobjs_anno = False
    if (superobjs_anno is not False):
        parent_id = superobjs_anno[0].object_uri
        if (parent_id.count('/') > 1):
            # parent_id looks like a URI; reduce to an OC UUID if possible
            oc_uuid = URImanagement.get_uuid_from_oc_uri(parent_id)
            if (oc_uuid is not False):
                parent_id = oc_uuid
        if (parent_id not in self.parent_entities):
            self.parent_entities.append(parent_id)
        if self.loop_count <= 50:
            # recurse upward for the parent's own parents
            self.parent_entities = self.get_entity_parents(parent_id)
    try:
        """ Now look for superior entities in the subject, not the object """
        supersubj_anno = LinkAnnotation.objects.filter(object_uri__in=identifiers,
                                                       predicate_uri__in=preds_for_subobjs)\
                                               .exclude(subject__in=identifiers)[:1]
        if (len(supersubj_anno) < 1):
            supersubj_anno = False
    except LinkAnnotation.DoesNotExist:
        supersubj_anno = False
    if (supersubj_anno is not False):
        parent_id = supersubj_anno[0].subject
        if (parent_id.count('/') > 1):
            oc_uuid = URImanagement.get_uuid_from_oc_uri(parent_id)
            if (oc_uuid is not False):
                parent_id = oc_uuid
        if (parent_id not in self.parent_entities):
            self.parent_entities.append(parent_id)
        if self.loop_count <= 50:
            self.parent_entities = self.get_entity_parents(parent_id)
    return self.parent_entities
def get_identifier_list_variants(self, id_list):
    """Return equivalent identifier variants for a list of identifiers.

    Accepts a single identifier or a list; each identifier is kept
    and, where derivable, joined by its Open Context UUID, compact
    (prefixed) URI, or full URI equivalents.
    """
    if not isinstance(id_list, list):
        id_list = [str(id_list)]
    variants = []
    for ident in id_list:
        variants.append(ident)
        if ident.startswith(('http://', 'https://')):
            # A full URI: add the Open Context UUID and compact form.
            uuid = URImanagement.get_uuid_from_oc_uri(ident)
            if uuid:
                variants.append(uuid)
            compact = URImanagement.prefix_common_uri(ident)
            if compact:
                variants.append(compact)
        elif ':' in ident:
            # A prefixed (compact) identifier: expand to the full URI.
            variants.append(URImanagement.convert_prefix_to_full_uri(ident))
        else:
            # probably an open context uuid or a slug
            ent = MemoryCache().get_entity(ident)
            if ent:
                variants.append(ent.uri)
                compact = URImanagement.prefix_common_uri(ent.uri)
                if compact != ent.uri:
                    variants.append(compact)
    return variants
def get_identifier_list_variants(self, id_list):
    """Make different variants of identifiers for a list of identifiers.

    Returns the input identifiers plus, where derivable, their Open
    Context UUID, compact (prefixed) URI, and full URI equivalents.
    """
    output_list = []
    if not isinstance(id_list, list):
        id_list = [str(id_list)]
    for identifier in id_list:
        output_list.append(identifier)
        # Idiom fix: use startswith with a tuple instead of the
        # original slice comparisons identifier[:7] / identifier[:8].
        if identifier.startswith(('http://', 'https://')):
            oc_uuid = URImanagement.get_uuid_from_oc_uri(identifier)
            if oc_uuid is not False:
                output_list.append(oc_uuid)
            else:
                # Not an Open Context URI, so add the compact form.
                # NOTE(review): appended unconditionally, mirroring the
                # original; this may duplicate `identifier` if no common
                # prefix applies -- confirm prefix_common_uri's contract.
                prefix_id = URImanagement.prefix_common_uri(identifier)
                output_list.append(prefix_id)
        elif ':' in identifier:
            # a prefixed identifier; expand to the full URI
            full_uri = URImanagement.convert_prefix_to_full_uri(identifier)
            output_list.append(full_uri)
        else:
            # probably an open context uuid or a slug
            ent = Entity()
            found = ent.dereference(identifier)
            if found:
                full_uri = ent.uri
                output_list.append(full_uri)
                prefix_uri = URImanagement.prefix_common_uri(full_uri)
                if prefix_uri != full_uri:
                    output_list.append(prefix_uri)
    return output_list
def _get_entity_children_db(self, identifier, recursive=True):
    """ Gets child concepts for a given URI or UUID identified entity """
    # Results are memoized in self.child_entities (keyed by identifier);
    # the cache is only consulted when recursive is True.
    if not self.child_entities:
        self.child_entities = LastUpdatedOrderedDict()
    if identifier in self.child_entities and recursive:
        # already computed on a prior call
        output = self.child_entities[identifier]
    else:
        act_children = []
        p_for_superobjs = LinkAnnotation.PREDS_SBJ_IS_SUB_OF_OBJ
        p_for_subobjs = LinkAnnotation.PREDS_SBJ_IS_SUPER_OF_OBJ
        lequiv = LinkEquivalence()
        identifiers = lequiv.get_identifier_list_variants(identifier)
        try:
            # look for child items in the objects of the assertion
            subobjs_anno = LinkAnnotation.objects.filter(subject__in=identifiers,
                                                         predicate_uri__in=p_for_subobjs)
            if(len(subobjs_anno) < 1):
                subobjs_anno = False
        except LinkAnnotation.DoesNotExist:
            subobjs_anno = False
        if subobjs_anno is not False:
            for sub_obj in subobjs_anno:
                child_id = sub_obj.object_uri
                act_children.append(child_id)
        try:
            """ Now look for subordinate entities in the subject, not the object """
            subsubj_anno = LinkAnnotation.objects.filter(object_uri__in=identifiers,
                                                         predicate_uri__in=p_for_superobjs)
            if len(subsubj_anno) < 1:
                subsubj_anno = False
        except LinkAnnotation.DoesNotExist:
            subsubj_anno = False
        if subsubj_anno is not False:
            for sub_sub in subsubj_anno:
                child_id = sub_sub.subject
                act_children.append(child_id)
        if len(act_children) > 0:
            identifier_children = []
            for child_id in act_children:
                if child_id.count('/') > 1:
                    # child looks like a URI; reduce to an OC UUID if possible
                    oc_uuid = URImanagement.get_uuid_from_oc_uri(child_id)
                    if oc_uuid:
                        child_id = oc_uuid
                identifier_children.append(child_id)
                # recursively get the children of the child
                if recursive:
                    self.get_entity_children(child_id, recursive)
            # save the list of children of the current identified item
            if identifier not in self.child_entities:
                self.child_entities[identifier] = identifier_children
        else:
            # no children found; record an empty list for this item
            if identifier not in self.child_entities:
                self.child_entities[identifier] = []
        output = self.child_entities[identifier]
    return output
def get_item_basics(self, solr_rec):
    """ get basic metadata for an item """
    # Populates self.uuid, uri, href, item_type, label, updated,
    # published, and human_remains_flagged from a solr record dict.
    # Returns True only when 'slug_type_uri_label' parsed successfully.
    output = False
    if isinstance(solr_rec, dict):
        if 'uuid' in solr_rec:
            self.uuid = solr_rec['uuid']
        if 'slug_type_uri_label' in solr_rec:
            id_parts = self.parse_solr_value_parts(solr_rec['slug_type_uri_label'])
            if id_parts is not False:
                output = True
                self.uri = self.make_url_from_val_string(id_parts['uri'], True)
                self.href = self.make_url_from_val_string(id_parts['uri'], False)
                # NOTE(review): subscripting assumes get_uuid_from_oc_uri
                # returns a dict here; it would raise if self.uri is not
                # an Open Context URI -- confirm that invariant upstream.
                item_type_output = URImanagement.get_uuid_from_oc_uri(self.uri, True)
                self.item_type = item_type_output['item_type']
                self.label = id_parts['label']
        if 'updated' in solr_rec:
            self.updated = solr_rec['updated']
        if 'published' in solr_rec:
            self.published = solr_rec['published']
        if 'human_remains' in solr_rec:
            # is the record flagged as related to human remains?
            if solr_rec['human_remains'] > 0:
                self.human_remains_flagged = True
    return output
def process_id(self, identifier):
    """Build a solr filter-query dict matching an identifier.

    The identifier is tried as a persistent URI, as a naked DOI, ARK,
    or ORCID (by prepending the resolver base URI), and as an Open
    Context UUID or URI.
    """
    query_dict = {'fq': [], 'facet.field': []}
    escaped = self.escape_solr_arg(identifier)
    terms = ['persistent_uri:' + escaped]
    # Resolver base URIs for naked DOI, ARK, and ORCID identifiers.
    for resolver in ('http://dx.doi.org/',
                     'http://n2t.net/',
                     'http://orcid.org/'):
        terms.append('persistent_uri:' + self.escape_solr_arg(resolver + identifier))
    terms.append('uuid:' + escaped)
    tcheck = URImanagement.get_uuid_from_oc_uri(identifier, True)
    if tcheck is not False:
        # the identifier is an Open Context URI; also match its UUID
        terms.append('uuid:' + tcheck['uuid'])
    query_dict['fq'].append('(' + ' OR '.join(terms) + ')')
    return query_dict
def get_identifier_list_variants(self, id_list):
    """Make different variants of identifiers for a list of identifiers."""
    if not isinstance(id_list, list):
        id_list = [str(id_list)]
    variants = []
    for ident in id_list:
        variants.append(ident)
        if ident.startswith('http://') or ident.startswith('https://'):
            oc_uuid = URImanagement.get_uuid_from_oc_uri(ident)
            if oc_uuid is not False:
                variants.append(oc_uuid)
            else:
                # not an Open Context URI; add the compact (prefixed) form
                variants.append(URImanagement.prefix_common_uri(ident))
        elif ':' in ident:
            # a prefixed identifier; expand to the full URI
            variants.append(URImanagement.convert_prefix_to_full_uri(ident))
        else:
            # probably an open context uuid or a slug
            ent = Entity()
            if ent.dereference(ident):
                variants.append(ent.uri)
                compact = URImanagement.prefix_common_uri(ent.uri)
                if compact != ent.uri:
                    variants.append(compact)
    return variants
def dereference(self, identifier, link_entity_slug=False):
    """ Dereferences an entity identified by an identifier, checks if
        a URI; if not a URI, then looks in the OC manifest for the item.
        Returns True when found, False otherwise.
    """
    output = False
    # Only try to dereference if the identifier is a string.
    if not isinstance(identifier, str):
        return output
    identifier = URImanagement.convert_prefix_to_full_uri(identifier)
    oc_uuid = URImanagement.get_uuid_from_oc_uri(identifier)
    if not oc_uuid and (settings.CANONICAL_HOST + '/tables/') in identifier:
        # Special case for probable open context table item.
        oc_uuid = identifier.replace((settings.CANONICAL_HOST + '/tables/'), '')
    if not oc_uuid:
        # We don't have an Open Context UUID, so look up a linked
        # data entity.
        link_entity_found = self.dereference_linked_data(identifier,
                                                         link_entity_slug=link_entity_slug)
        if link_entity_found:
            # Found what we want, so skip the rest and return True.
            return True
    # If we haven't found a link_entity, check for manifest items.
    if oc_uuid:
        # We found an Open Context uuid by parsing a URI. So that
        # should be the identifier to lookup.
        identifier = oc_uuid
    manifest_item_found = self.dereference_manifest_item(identifier)
    if manifest_item_found:
        return True
    return output
def _get_parent_id_db(self, identifier):
    """Get the parent id for the current identifier """
    # Checks both directions of the hierarchy annotations; when both
    # match, the subject-side result (second query) wins since it is
    # assigned last. Returns None when no parent is found.
    parent_id = None
    lequiv = LinkEquivalence()
    identifiers = lequiv.get_identifier_list_variants(identifier)
    # print('identifiers: {}'.format(identifiers))
    p_for_superobjs = LinkAnnotation.PREDS_SBJ_IS_SUB_OF_OBJ
    preds_for_superobjs = lequiv.get_identifier_list_variants(p_for_superobjs)
    p_for_subobjs = LinkAnnotation.PREDS_SBJ_IS_SUPER_OF_OBJ
    preds_for_subobjs = lequiv.get_identifier_list_variants(p_for_subobjs)
    try:
        # look for superior items in the objects of the assertion
        # sorting by sort so we can privelage a certain hierarchy path
        superobjs_anno = LinkAnnotation.objects.filter(subject__in=identifiers,
                                                       predicate_uri__in=preds_for_superobjs)\
                                               .exclude(object_uri__in=identifiers)\
                                               .order_by('sort', 'object_uri')[:1]
        if len(superobjs_anno) < 1:
            superobjs_anno = False
    except LinkAnnotation.DoesNotExist:
        superobjs_anno = False
    if superobjs_anno:
        parent_id = superobjs_anno[0].object_uri
        # print('Subject {} is child of {}'.format(identifiers, parent_id))
        oc_uuid = URImanagement.get_uuid_from_oc_uri(parent_id)
        if oc_uuid:
            # reduce the URI to an Open Context UUID
            parent_id = oc_uuid
    try:
        """ Now look for superior entities in the subject, not the object sorting by sort so we can privelage a certain hierarchy path """
        supersubj_anno = LinkAnnotation.objects.filter(object_uri__in=identifiers,
                                                       predicate_uri__in=preds_for_subobjs)\
                                               .exclude(subject__in=identifiers)\
                                               .order_by('sort', 'subject')[:1]
        if len(supersubj_anno) < 1:
            supersubj_anno = False
    except LinkAnnotation.DoesNotExist:
        supersubj_anno = False
    if supersubj_anno:
        parent_id = supersubj_anno[0].subject
        # print('Subject {} is parent of {}'.format(parent_id, identifiers))
        oc_uuid = URImanagement.get_uuid_from_oc_uri(parent_id)
        if oc_uuid:
            parent_id = oc_uuid
    return parent_id
def process_equivalent_linked_data(self):
    """ Types are useful for entity reconciliation
        this checks for linked data associated with a type
    """
    # Adds solr index fields for linked-data equivalences: every
    # LD_EQUIVALENT_PREDICATES match is indexed as a skos:closeMatch,
    # and skos:related entities are handled separately below.
    for equiv_uri in self.LD_EQUIVALENT_PREDICATES:
        if equiv_uri in self.oc_item.json_ld and "foaf" not in equiv_uri:
            # for now, default to a close match
            fname = "skos_closematch___pred_id"
            allname = "obj_all___skos_closematch___pred_id"
            if fname not in self.fields:
                self.fields[fname] = []
            if self.ROOT_LINK_DATA_SOLR not in self.fields:
                self.fields[self.ROOT_LINK_DATA_SOLR] = []
            item = self._concat_solr_string_value(
                "skos-closematch",
                "id",
                "http://www.w3.org/2004/02/skos/core#closeMatch",
                "Close Match"
            )
            self.fields[self.ROOT_LINK_DATA_SOLR].append(item)
            if allname not in self.fields:
                self.fields[allname] = []
            for entity in self.oc_item.json_ld[equiv_uri]:
                # only index entities identified by full URIs
                if "http://" in entity["id"] or "https://" in entity["id"]:
                    self.fields["text"] += entity["label"] + "\n"
                    self.fields["text"] += entity["id"] + "\n"
                    item = self._concat_solr_string_value(entity["slug"],
                                                          "id",
                                                          entity["id"],
                                                          entity["label"])
                    self.fields[fname].append(item)
                    self.fields[allname].append(item)
                    self.process_object_uri(entity["id"])
    if "skos:related" in self.oc_item.json_ld:
        fname = "skos_related___pred_id"
        allname = "obj_all___skos_related___pred_id"
        if fname not in self.fields:
            self.fields[fname] = []
        if self.ROOT_LINK_DATA_SOLR not in self.fields:
            self.fields[self.ROOT_LINK_DATA_SOLR] = []
        item = self._concat_solr_string_value(
            "skos-related",
            "id",
            "http://www.w3.org/2004/02/skos/core#related",
            "Related"
        )
        self.fields[self.ROOT_LINK_DATA_SOLR].append(item)
        if allname not in self.fields:
            self.fields[allname] = []
        for entity in self.oc_item.json_ld["skos:related"]:
            if "http://" in entity["id"] or "https://" in entity["id"]:
                self.fields["text"] += entity["label"] + "\n"
                self.fields["text"] += entity["id"] + "\n"
                item = self._concat_solr_string_value(entity["slug"],
                                                      "id",
                                                      entity["id"],
                                                      entity["label"])
                self.fields[fname].append(item)
                self.fields[allname].append(item)
                self.process_object_uri(entity["id"])
            elif "oc-pred:" in entity["id"] and "owl:sameAs" in entity:
                # a compact Open Context predicate id; resolve the
                # UUID from the owl:sameAs URI instead
                pred_uuid = URImanagement.get_uuid_from_oc_uri(entity["owl:sameAs"])
                self.fields["text"] += entity["label"] + "\n"
                self.fields["text"] += entity["id"] + "\n"
                item = self._concat_solr_string_value(
                    entity["slug"],
                    "id",
                    "/predicates/" + pred_uuid,
                    entity["label"]
                )
                self.fields[fname].append(item)
                self.fields[allname].append(item)
def parse_json_record(self, json_rec):
    """ parses json for a geo-json feature of the record """
    # Populates search-result display attributes on self from either
    # the geo-json 'properties' dict or the record itself.
    if 'properties' in json_rec:
        props = json_rec['properties']
    else:
        props = json_rec
    if isinstance(props, dict):
        if 'id' in props:
            self.id = props['id'].replace('#', '')
        if 'label' in props:
            self.label = props['label']
        if 'href' in props:
            self.href = props['href']
        if 'uri' in props:
            item_type_output = URImanagement.get_uuid_from_oc_uri(props['uri'], True)
            if isinstance(item_type_output, dict):
                self.item_type = item_type_output['item_type']
                self.uuid = item_type_output['uuid']
        if 'project label' in props:
            self.project = props['project label']
        if 'context label' in props:
            self.context = props['context label']
        if 'early bce/ce' in props:
            # negative years are BCE; store the absolute value plus a suffix
            self.early_bce_ce = props['early bce/ce']
            if self.early_bce_ce < 0:
                self.early_bce_ce = int(round(self.early_bce_ce * -1, 0))
                self.early_suffix = 'BCE'
            else:
                self.early_bce_ce = int(round(self.early_bce_ce, 0))
                self.early_suffix = False
        if 'late bce/ce' in props:
            self.late_bce_ce = props['late bce/ce']
            if self.late_bce_ce < 0:
                self.late_bce_ce = int(round(self.late_bce_ce * -1, 0))
                self.late_suffix = 'BCE'
            else:
                self.late_bce_ce = int(round(self.late_bce_ce, 0))
                self.late_suffix = False
        if 'item category' in props:
            self.category = props['item category']
        if 'snippet' in props:
            # Protect solr's <em> highlight tags through strip_tags by
            # swapping them for placeholder tokens, then restore them
            # as <mark> tags after all other markup is removed.
            self.snippet = props['snippet']
            self.snippet = self.snippet.replace('<em>', '[[[[mark]]]]')
            self.snippet = self.snippet.replace('</em>', '[[[[/mark]]]]')
            self.snippet = strip_tags(self.snippet)
            self.snippet = self.snippet.replace('</', '')
            self.snippet = self.snippet.replace('<', '')
            self.snippet = self.snippet.replace('>', '')
            self.snippet = self.snippet.replace('[[[[mark]]]]', '<mark>')
            self.snippet = self.snippet.replace('[[[[/mark]]]]', '</mark>')
        if 'thumbnail' in props:
            self.thumbnail = props['thumbnail']
        if 'published' in props:
            self.published = QueryMaker().make_human_readable_date(props['published'])
        if 'updated' in props:
            self.updated = QueryMaker().make_human_readable_date(props['updated'])
def get_project_authors(self, project_uuid):
    """ Gets author information for a project """
    # Appends ids to self.creators and self.contributors; returns True
    # when at least one author was found.
    output = False
    creator_links = LinkAnnotation.objects\
                                  .filter(Q(subject=project_uuid),
                                          Q(predicate_uri=self.URI_DC_CREATE) |
                                          Q(predicate_uri=self.PRF_DC_CREATE))\
                                  .order_by('sort')
    if len(creator_links) < 1:
        # look for creators from the parent project
        par_proj = Project.objects\
                          .filter(uuid=project_uuid)\
                          .exclude(project_uuid=project_uuid)[:1]
        if len(par_proj) > 0:
            creator_links = LinkAnnotation.objects\
                                          .filter(Q(subject=par_proj[0].project_uuid),
                                                  Q(predicate_uri=self.URI_DC_CREATE) |
                                                  Q(predicate_uri=self.PRF_DC_CREATE))\
                                          .order_by('sort')
    if len(creator_links) > 0:
        for creator in creator_links:
            pid = URImanagement.get_uuid_from_oc_uri(creator.object_uri)
            if pid is False:
                # not an Open Context URI; keep the URI itself as the id
                pid = creator.object_uri
            if pid not in self.creators:
                self.creators.append(pid)
    contrib_links = LinkAnnotation.objects\
                                  .filter(Q(subject=project_uuid),
                                          Q(predicate_uri=self.URI_DC_CONTRIB) |
                                          Q(predicate_uri=self.PRF_DC_CONTRIB))\
                                  .order_by('sort')
    for contrib in contrib_links:
        pid = URImanagement.get_uuid_from_oc_uri(contrib.object_uri)
        if pid is False:
            pid = contrib.object_uri
        if pid not in self.contributors:
            if pid not in self.creators \
               or self.consolidate_authorship is False\
               or contrib.sort > 0:
                self.contributors.append(pid)  # add to contrib if not a creator
    if len(self.contributors) > 0 or len(self.creators) > 0:
        output = True
    return output
def get_project_authors(self, project_uuid):
    """Collect creator and contributor ids for a project.

    Appends ids to self.creators and self.contributors; returns True
    when at least one creator or contributor was found.
    """
    def dc_links(subject_uuid, uri_a, uri_b):
        # Dublin Core annotations on the subject, in sort order.
        return LinkAnnotation.objects\
                             .filter(Q(subject=subject_uuid),
                                     Q(predicate_uri=uri_a) |
                                     Q(predicate_uri=uri_b))\
                             .order_by('sort')

    creator_links = dc_links(project_uuid, self.URI_DC_CREATE, self.PRF_DC_CREATE)
    if len(creator_links) < 1:
        # No creators on this project; fall back to the parent project.
        par_proj = Project.objects\
                          .filter(uuid=project_uuid)\
                          .exclude(project_uuid=project_uuid)[:1]
        if len(par_proj) > 0:
            creator_links = dc_links(par_proj[0].project_uuid,
                                     self.URI_DC_CREATE,
                                     self.PRF_DC_CREATE)
    if len(creator_links) > 0:
        for creator in creator_links:
            pid = URImanagement.get_uuid_from_oc_uri(creator.object_uri)
            if not pid:
                # not an Open Context URI; keep the URI itself as the id
                pid = creator.object_uri
            if pid not in self.creators:
                self.creators.append(pid)
    for contrib in dc_links(project_uuid, self.URI_DC_CONTRIB, self.PRF_DC_CONTRIB):
        pid = URImanagement.get_uuid_from_oc_uri(contrib.object_uri)
        if not pid:
            pid = contrib.object_uri
        if pid not in self.contributors:
            # add to contrib if not a creator (unless authorship is not
            # consolidated, or the contributor is explicitly sorted)
            if pid not in self.creators \
                    or self.consolidate_authorship is False \
                    or contrib.sort > 0:
                self.contributors.append(pid)
    return len(self.contributors) > 0 or len(self.creators) > 0
def get_entity_children(self, identifier, recurive=True):
    """ Gets child concepts for a given URI or UUID identified entity """
    # NOTE(review): parameter name 'recurive' is a typo for 'recursive',
    # but renaming it would break callers passing it as a keyword.
    # Results accumulate in self.child_entities keyed by identifier.
    act_children = []
    p_for_superobjs = LinkAnnotation.PREDS_SBJ_IS_SUB_OF_OBJ
    p_for_subobjs = LinkAnnotation.PREDS_SBJ_IS_SUPER_OF_OBJ
    lequiv = LinkEquivalence()
    identifiers = lequiv.get_identifier_list_variants(identifier)
    try:
        # look for child items in the objects of the assertion
        subobjs_anno = LinkAnnotation.objects.filter(subject__in=identifiers,
                                                     predicate_uri__in=p_for_subobjs)
        if (len(subobjs_anno) < 1):
            subobjs_anno = False
    except LinkAnnotation.DoesNotExist:
        subobjs_anno = False
    if subobjs_anno is not False:
        for sub_obj in subobjs_anno:
            child_id = sub_obj.object_uri
            act_children.append(child_id)
    try:
        """ Now look for subordinate entities in the subject, not the object """
        subsubj_anno = LinkAnnotation.objects.filter(object_uri__in=identifiers,
                                                     predicate_uri__in=p_for_superobjs)
        if len(subsubj_anno) < 1:
            subsubj_anno = False
    except LinkAnnotation.DoesNotExist:
        subsubj_anno = False
    if subsubj_anno is not False:
        for sub_sub in subsubj_anno:
            child_id = sub_sub.subject
            act_children.append(child_id)
    if len(act_children) > 0:
        identifier_children = []
        for child_id in act_children:
            if child_id.count('/') > 1:
                # child looks like a URI; reduce to an OC UUID if possible
                oc_uuid = URImanagement.get_uuid_from_oc_uri(child_id)
                if oc_uuid is not False:
                    child_id = oc_uuid
            identifier_children.append(child_id)
            # recursively get the children of the child
            self.get_entity_children(child_id, recurive)
        # save the list of children of the current identified item
        if identifier not in self.child_entities:
            self.child_entities[identifier] = identifier_children
    else:
        # save a False for the current identified item: it has no children.
        # NOTE(review): the DB variant of this method stores [] instead of
        # False for the no-children case -- confirm consumers handle both.
        if identifier not in self.child_entities:
            self.child_entities[identifier] = False
def check_opencontext_uri(self, cell):
    """Look for a valid Open Context URI in a cell.

    Returns the matching Manifest object when the cell holds an Open
    Context URI for an entity present in the database, else False.
    """
    oc_item = False
    looks_like_oc = ('http://opencontext.' in cell
                     or 'https://opencontext.' in cell)
    if looks_like_oc:
        uuid = URImanagement.get_uuid_from_oc_uri(cell)
        if uuid is not False:
            # Appears to be an Open Context URI; now verify the
            # entity actually exists in the database.
            try:
                oc_item = Manifest.objects.get(uuid=uuid)
            except Manifest.DoesNotExist:
                oc_item = False
    return oc_item
def check_opencontext_uri(self, cell):
    """ looks for a valid opencontext uri in a cell """
    # Returns the Manifest object for the URI's UUID, or False when the
    # cell is not an Open Context URI or the entity is not in the DB.
    oc_item = False
    if 'http://opencontext.' in cell\
       or 'https://opencontext.' in cell:
        uuid = URImanagement.get_uuid_from_oc_uri(cell)
        if uuid is not False:
            # appears to be an Open Context URI
            # now check we actually have that entity in the database
            try:
                oc_item = Manifest.objects.get(uuid=uuid)
            except Manifest.DoesNotExist:
                oc_item = False
    return oc_item
def add_json_ld_link_annotations(self, json_ld):
    """ adds linked data annotations (typically referencing URIs from
        outside Open Context) """
    if not self.link_annotations:
        return json_ld
    if not len(self.link_annotations):
        return json_ld
    parts_json_ld = PartsJsonLD()
    parts_json_ld.proj_context_json_ld = self.proj_context_json_ld
    parts_json_ld.manifest_obj_dict = self.manifest_obj_dict
    for la in self.link_annotations:
        tcheck = URImanagement.get_uuid_from_oc_uri(la.object_uri, True)
        if not tcheck:
            # this item is NOT from open context
            item_type = False
        else:
            # an Open Context item
            item_type = tcheck['item_type']
        if item_type == 'persons':
            # add a stable ID to person items, but only if they are ORCID IDs
            parts_json_ld.stable_id_predicate = ItemKeys.PREDICATES_FOAF_PRIMARYTOPICOF
            parts_json_ld.stable_id_prefix_limit = StableIdentifer.ID_TYPE_PREFIXES['orcid']
        # this shortens URIs in item-context declared namespaces
        # to make a compact URI (prefixed), as the act_pred
        act_pred = URImanagement.prefix_common_uri(la.predicate_uri)
        if act_pred not in self.dc_author_preds \
           and act_pred not in self.dc_inherit_preds \
           and act_pred not in self.dc_metadata_preds:
            # the act_pred is not a dublin core predicate, so we're OK to add it
            # now, not later.
            if not biological_taxonomy_validation(act_pred, la.object_uri):
                # We have an act_pred and object_uri combination
                # that is not valid. So skip.
                continue
            json_ld = parts_json_ld.addto_predicate_list(json_ld,
                                                         act_pred,
                                                         la.object_uri,
                                                         item_type)
        else:
            # we've got dublin core assertions, cache these in the dict_object
            # dc_assertions so they get added LAST, after other assertions
            self.dc_assertions = parts_json_ld.addto_predicate_list(self.dc_assertions,
                                                                    act_pred,
                                                                    la.object_uri,
                                                                    item_type)
    return json_ld
def add_json_ld_link_annotations(self, json_ld):
    """ adds linked data annotations (typically referencing URIs from
        outside Open Context) """
    if not self.link_annotations or not len(self.link_annotations):
        # No link annotations, so skip out.
        return json_ld
    # We have link annotations.
    parts_json_ld = PartsJsonLD()
    parts_json_ld.proj_context_json_ld = self.proj_context_json_ld
    parts_json_ld.manifest_obj_dict = self.manifest_obj_dict
    for la in self.link_annotations:
        tcheck = URImanagement.get_uuid_from_oc_uri(la.object_uri, True)
        if not tcheck:
            # this item is NOT from open context
            item_type = False
        else:
            # an Open Context item
            item_type = tcheck['item_type']
        if item_type == 'persons':
            # add a stable ID to person items, but only if they are ORCID IDs
            parts_json_ld.stable_id_predicate = ItemKeys.PREDICATES_FOAF_PRIMARYTOPICOF
            parts_json_ld.stable_id_prefix_limit = StableIdentifer.ID_TYPE_PREFIXES['orcid']
        # this shortens URIs in item-context declared namespaces
        # to make a compact URI (prefixed), as the act_pred
        act_pred = URImanagement.prefix_common_uri(la.predicate_uri)
        if act_pred not in self.dc_author_preds \
           and act_pred not in self.dc_inherit_preds \
           and act_pred not in self.dc_metadata_preds:
            # the act_pred is not a dublin core predicate, so we're OK to add it
            # now, not later.
            json_ld = parts_json_ld.addto_predicate_list(
                json_ld,
                act_pred,
                la.object_uri,
                item_type
            )
        else:
            # we've got dublin core assertions, cache these in the dict_object
            # dc_assertions so they get added LAST, after other assertions
            self.dc_assertions = parts_json_ld.addto_predicate_list(
                self.dc_assertions,
                act_pred,
                la.object_uri,
                item_type
            )
    return json_ld
def get_solr_record_uuid_type(self, solr_rec):
    """Get item uuid, label, and type from a solr_rec.

    Returns a dict with 'uuid', 'label', and 'item_type' keys (each
    False when unavailable), or False when solr_rec is not a dict.
    """
    output = False
    if isinstance(solr_rec, dict):
        output = {'uuid': False, 'label': False, 'item_type': False}
        if 'uuid' in solr_rec:
            output['uuid'] = solr_rec['uuid']
        if 'slug_type_uri_label' in solr_rec:
            id_parts = self.parse_solr_value_parts(solr_rec['slug_type_uri_label'])
            if id_parts is not False:
                uri = self.make_url_from_val_string(id_parts['uri'], True)
                item_type_output = URImanagement.get_uuid_from_oc_uri(uri, True)
                # Guard: get_uuid_from_oc_uri returns False for
                # non-Open-Context URIs; the unguarded original would
                # raise a TypeError subscripting that False.
                if item_type_output:
                    output['item_type'] = item_type_output['item_type']
                output['label'] = id_parts['label']
    return output
def get_solr_record_uuid_type(self, solr_rec):
    """ get item uuid, label, and type from a solr_rec """
    # Returns a dict with 'uuid', 'label', and 'item_type' keys, or
    # False when solr_rec is not a dict.
    output = False
    if isinstance(solr_rec, dict):
        output = {'uuid': False, 'label': False, 'item_type': False}
        if 'uuid' in solr_rec:
            output['uuid'] = solr_rec['uuid']
        if 'slug_type_uri_label' in solr_rec:
            id_parts = self.parse_solr_value_parts(solr_rec['slug_type_uri_label'])
            if id_parts is not False:
                uri = self.make_url_from_val_string(id_parts['uri'], True)
                # NOTE(review): subscripting assumes get_uuid_from_oc_uri
                # returns a dict; it would raise if uri is not an Open
                # Context URI -- confirm that invariant upstream.
                item_type_output = URImanagement.get_uuid_from_oc_uri(uri, True)
                output['item_type'] = item_type_output['item_type']
                output['label'] = id_parts['label']
    return output
def process_id(self, identifier):
    # check for identifier
    # Builds a solr filter query matching the identifier as a
    # persistent URI (DOI/ARK/ORCID, http or https), as a naked
    # DOI/ORCID with the resolver prefix stripped, or as an Open
    # Context UUID or URI.
    query_dict = {'fq': [], 'facet.field': []}
    fq_terms = []
    id_list = [identifier]
    id_list = self.make_http_https_options(id_list)
    for act_id in id_list:
        escape_id = self.escape_solr_arg(act_id)
        fq_terms.append('persistent_uri:' + escape_id)
        fq_terms.append('uuid:' + escape_id)
    # now make URIs in case we have a naked identifier
    prefix_removes = [
        'doi:',
        'orcid:',
        'http://dx.doi.org/',
        'https://dx.doi.org/',
        'http://doi.org/',
        'https://doi.org/'
    ]
    for prefix in prefix_removes:
        # strip ID prefixes, case insensitive
        re_gone = re.compile(re.escape(prefix), re.IGNORECASE)
        identifier = re_gone.sub('', identifier)
    uris = [
        'http://dx.doi.org/' + identifier,  # DOI (old)
        'http://doi.org/' + identifier,  # DOI (new)
        'http://n2t.net/' + identifier,  # ARK (CDL / Merritt)
        'http://orcid.org/' + identifier  # Orcid (people)
    ]
    # now make https http varients of the URIs
    uris = self.make_http_https_options(uris)
    for uri in uris:
        # now make a DOI URI in case this is just a naked DOI
        escaped_uri = self.escape_solr_arg(uri)
        fq_terms.append('persistent_uri:' + escaped_uri)
    tcheck = URImanagement.get_uuid_from_oc_uri(identifier, True)
    if tcheck is not False:
        # the identifier is an Open Context URI; also match its UUID
        uuid = tcheck['uuid']
        fq_terms.append('uuid:' + uuid)
    fq_final = ' OR '.join(fq_terms)
    fq_final = '(' + fq_final + ')'
    query_dict['fq'].append(fq_final)
    # print(fq_final)
    return query_dict
def get_item_json_ld(self, item):
    """Get the JSON-LD for an item dict that carries a 'uri' key.

    Returns the item's JSON-LD when the URI resolves to a manifest
    entry, otherwise False.
    """
    output = False
    if 'uri' not in item:
        return output
    tcheck = URImanagement.get_uuid_from_oc_uri(item['uri'], True)
    if tcheck is False:
        item_type = False
    else:
        uuid = tcheck['uuid']
        item_type = tcheck['item_type']
        ocitem = OCitem()
        ocitem.get_item(uuid)
        # only return JSON-LD when the item exists in the manifest
        if ocitem.manifest is not False:
            output = ocitem.json_ld
        else:
            output = False
    return output
def get_item_json_ld(self, item):
    """ gets metadata and uris """
    # Returns the JSON-LD for an item dict with a 'uri' key when the
    # URI resolves to a manifest entry; otherwise returns False.
    output = False
    if 'uri' in item:
        tcheck = URImanagement.get_uuid_from_oc_uri(item['uri'], True)
        if tcheck is False:
            # not an Open Context URI
            item_type = False
        else:
            uuid = tcheck['uuid']
            item_type = tcheck['item_type']
            ocitem = OCitem()
            ocitem.get_item(uuid)
            if ocitem.manifest is not False:
                output = ocitem.json_ld
            else:
                output = False
    return output
def load_csv(self, filename, after=0, add_path=False):
    """Load a CSV dump from Merritt and record ARK stable identifiers.

    Each ARK row past the `after` offset gets a StableIdentifer record
    (when new) and an update of the manifest item's archived timestamp.

    :param str filename: CSV file name, or a full path
    :param int after: skip ARK rows until the running count reaches this
    :param bool add_path: when True, resolve filename inside the default
        static directory
    """
    if add_path:
        filename_path = os.path.join(settings.STATIC_ROOT,
                                     self.DEFAULT_DIRECTORY,
                                     filename)
    else:
        filename_path = filename
    i = 0
    # context manager so the file handle is closed (was a bare open())
    with open(filename_path) as csv_file:
        data = csv.reader(csv_file)
        for row in data:
            manifest = False
            if 'ark:/' not in row[0]:
                continue
            # BUG FIX: was `i += 0`, so the counter never advanced and
            # the `after` offset only worked when after <= 0.
            i += 1
            if i < after:
                continue
            uuid = URImanagement.get_uuid_from_oc_uri(row[1])
            if uuid is False:
                continue
            try:
                manifest = Manifest.objects.get(uuid=uuid,
                                                archived__isnull=True)
            except Manifest.DoesNotExist:
                manifest = False
            if manifest is False:
                continue
            ok_new = True
            try:
                sid = StableIdentifer()
                sid.stable_id = row[0].replace('ark:/', '')
                sid.stable_type = 'ark'
                sid.uuid = manifest.uuid
                sid.project_uuid = manifest.project_uuid
                sid.item_type = manifest.item_type
                sid.save()
            except Exception:
                # likely a duplicate stable id; don't count it as new
                ok_new = False
            # note when the item was last archived
            # NOTE(review): `archived_save()` looks unusual — confirm the
            # Manifest model really defines it (vs. plain `save()`).
            try:
                manifest.archived = self.validate_date(row[3])
                manifest.archived_save()
            except Exception:
                manifest.archived = time.strftime('%Y-%m-%d %H:%M:%S')
                manifest.archived_save()
            if ok_new:
                self.id_recorded += 1
            print('Saved ids: ' + str(self.id_recorded))
def load_csv(self, filename, after=0, add_path=False):
    """Load a CSV dump from Merritt and record ARK stable identifiers.

    Each ARK row past the `after` offset gets a StableIdentifer record
    (when new) and an update of the manifest item's archived timestamp.

    :param str filename: CSV file name, or a full path
    :param int after: skip ARK rows until the running count reaches this
    :param bool add_path: when True, resolve filename inside the default
        static directory
    """
    if add_path:
        filename_path = os.path.join(settings.STATIC_ROOT,
                                     self.DEFAULT_DIRECTORY,
                                     filename)
    else:
        filename_path = filename
    i = 0
    # context manager so the file handle is closed (was a bare open())
    with open(filename_path) as csv_file:
        data = csv.reader(csv_file)
        for row in data:
            manifest = False
            if 'ark:/' not in row[0]:
                continue
            # BUG FIX: was `i += 0`, so the counter never advanced and
            # the `after` offset only worked when after <= 0.
            i += 1
            if i < after:
                continue
            uuid = URImanagement.get_uuid_from_oc_uri(row[1])
            if uuid is False:
                continue
            try:
                manifest = Manifest.objects.get(
                    uuid=uuid,
                    archived__isnull=True)
            except Manifest.DoesNotExist:
                manifest = False
            if manifest is False:
                continue
            ok_new = True
            try:
                sid = StableIdentifer()
                sid.stable_id = row[0].replace('ark:/', '')
                sid.stable_type = 'ark'
                sid.uuid = manifest.uuid
                sid.project_uuid = manifest.project_uuid
                sid.item_type = manifest.item_type
                sid.save()
            except Exception:
                # likely a duplicate stable id; don't count it as new
                ok_new = False
            # note when the item was last archived
            # NOTE(review): `archived_save()` looks unusual — confirm the
            # Manifest model really defines it (vs. plain `save()`).
            try:
                manifest.archived = self.validate_date(row[3])
                manifest.archived_save()
            except Exception:
                manifest.archived = time.strftime(
                    '%Y-%m-%d %H:%M:%S')
                manifest.archived_save()
            if ok_new:
                self.id_recorded += 1
            print('Saved ids: ' + str(self.id_recorded))
def get_predicate_uuids(self, pred_keys, item_dict):
    """Collect unique Open Context uuids for item ids found under the
    given predicate key (or list of keys) of an item dict."""
    found_uuids = []
    if not isinstance(pred_keys, list):
        pred_keys = [pred_keys]
    if isinstance(item_dict, dict):
        for key in pred_keys:
            key_items = item_dict.get(key, [])
            if isinstance(key_items, list):
                for key_item in key_items:
                    if isinstance(key_item, dict) and 'id' in key_item:
                        act_uuid = URImanagement.get_uuid_from_oc_uri(
                            key_item['id'])
                        # keep only new, truthy uuids
                        if act_uuid and act_uuid not in found_uuids:
                            found_uuids.append(act_uuid)
    return found_uuids
def get_item_json_ld(self, item):
    """Fetch an item's JSON-LD over HTTP; False on any failure."""
    output = False
    if 'uri' not in item:
        return output
    id_check = URImanagement.get_uuid_from_oc_uri(item['uri'], True)
    if id_check is False:
        item_type = False
        return output
    item_type = id_check['item_type']
    request_url = '/'.join([self.base_url, item_type, id_check['uuid']])
    try:
        resp = requests.get(request_url,
                            headers={'Accept': 'application/json'},
                            timeout=60)
        resp.raise_for_status()
        output = resp.json()
    except:
        # any network / HTTP / JSON failure yields False
        output = False
    return output
def get_item_basics(self, solr_rec):
    """ get basic metadata for an item

    Populates self.uuid, self.uri, self.href, self.item_type,
    self.label, self.updated and self.published from a solr record
    dict. Returns True only when 'slug_type_uri_label' parsed.
    """
    output = False
    if isinstance(solr_rec, dict):
        if 'uuid' in solr_rec:
            self.uuid = solr_rec['uuid']
        if 'slug_type_uri_label' in solr_rec:
            # the solr value packs slug, type, uri and label together
            id_parts = self.parse_solr_value_parts(solr_rec['slug_type_uri_label'])
            if id_parts is not False:
                output = True
                # canonical uri vs. deployment-local href
                self.uri = self.make_url_from_val_string(id_parts['uri'], True)
                self.href = self.make_url_from_val_string(id_parts['uri'], False)
                # NOTE(review): assumes self.uri parses as an Open Context
                # URI; a False return here would raise — confirm upstream.
                item_type_output = URImanagement.get_uuid_from_oc_uri(self.uri, True)
                self.item_type = item_type_output['item_type']
                self.label = id_parts['label']
        if 'updated' in solr_rec:
            self.updated = solr_rec['updated']
        if 'published' in solr_rec:
            self.published = solr_rec['published']
    return output
def set_record_basic_metadata(self, solr_doc):
    """Sets the record's general metadata attributes

    Reads uuid, slug/type/uri/label from the solr doc and populates
    self.uuid, self.href, self.uri, self.item_type, self.label and
    self.slug. Returns None early if the entity string won't parse.
    """
    self.uuid = solr_doc.get('uuid')
    item_dict = utilities.parse_solr_encoded_entity_str(
        solr_doc.get('slug_type_uri_label', ''),
        solr_slug_format=False,
    )
    if not item_dict:
        return None
    # Add the item local url for this deployment
    self.href = make_url_from_partial_url(
        item_dict.get('uri', ''),
        base_url=self.base_url,
    )
    # Add the item "cannonical" uri
    self.uri = make_url_from_partial_url(item_dict.get('uri', ''), base_url=settings.CANONICAL_HOST)
    # NOTE(review): assumes self.uri always parses as an Open Context
    # URI; a False/None return here would raise on subscript — confirm.
    item_type_output = URImanagement.get_uuid_from_oc_uri(self.uri, True)
    self.item_type = item_type_output['item_type']
    self.label = item_dict.get('label')
    self.slug = item_dict.get('slug')
def get_predicate_uuids(self, pred_keys, item_dict):
    """Gather uuids of Open Context items referenced under the given
    predicate key(s) of an item dict."""
    uuids = []
    if not isinstance(pred_keys, list):
        pred_keys = [pred_keys]
    if not isinstance(item_dict, dict):
        return uuids
    for pred_key in pred_keys:
        if pred_key not in item_dict:
            continue
        items = item_dict[pred_key]
        if not isinstance(items, list):
            continue
        for item in items:
            # only dict entries with an 'id' can yield a uuid
            if not isinstance(item, dict):
                continue
            if 'id' not in item:
                continue
            uuid = URImanagement.get_uuid_from_oc_uri(item['id'])
            if isinstance(uuid, str) and uuid not in uuids:
                uuids.append(uuid)
    return uuids
def get_item_json_ld(self, item):
    """ gets metadata and uris

    Resolves item['uri'] to an Open Context uuid + item type, then
    requests the item's JSON representation over HTTP.
    Returns the parsed JSON, or False on any failure.
    """
    output = False
    if 'uri' in item:
        tcheck = URImanagement.get_uuid_from_oc_uri(item['uri'], True)
        if tcheck is False:
            item_type = False
        else:
            uuid = tcheck['uuid']
            item_type = tcheck['item_type']
            url = self.base_url + '/' + item_type + '/' + uuid
            header = {'Accept': 'application/json'}
            try:
                r = requests.get(url, headers=header, timeout=60)
                r.raise_for_status()
                output = r.json()
            # NOTE(review): bare except also swallows KeyboardInterrupt;
            # consider narrowing to requests.RequestException/ValueError.
            except:
                output = False
    return output
def add_ids(self, ids):
    """ Adds ids to the database

    For each record (dict with 'stable_id', 'id' and 'archived' keys),
    saves a StableIdentifer for the matching manifest item and updates
    the manifest's archived timestamp.
    Returns the running count of newly recorded ids.
    """
    if isinstance(ids, list):
        for id_rec in ids:
            id_and_type = self.parse_stable_id(id_rec['stable_id'])
            manifest = False
            uuid = URImanagement.get_uuid_from_oc_uri(id_rec['id'])
            if uuid is not False and id_and_type is not False:
                try:
                    manifest = Manifest.objects.get(uuid=uuid)
                except Manifest.DoesNotExist:
                    manifest = False
            if manifest is not False:
                # we found the archived item in the manifest
                # save the stable identifier in the database
                ok_new = True
                try:
                    sid = StableIdentifer()
                    sid.stable_id = id_and_type['id']
                    sid.stable_type = id_and_type['type']
                    sid.uuid = manifest.uuid
                    sid.project_uuid = manifest.project_uuid
                    sid.item_type = manifest.item_type
                    sid.save()
                # NOTE(review): bare except hides the failure reason;
                # presumably guards against duplicate ids — confirm.
                except:
                    ok_new = False
                # note when the item was last archived
                try:
                    manifest.archived = self.validate_date(
                        id_rec['archived'])
                    manifest.archived_save()
                except:
                    # fall back to "now" when the date won't validate
                    manifest.archived = time.strftime('%Y-%m-%d %H:%M:%S')
                    manifest.archived_save()
                if ok_new:
                    self.id_recorded += 1
    return self.id_recorded
def add_ids(self, ids):
    """Store stable identifiers for archived manifest items.

    Returns the running count of newly recorded ids.
    """
    if not isinstance(ids, list):
        return self.id_recorded
    for id_rec in ids:
        id_and_type = self.parse_stable_id(id_rec['stable_id'])
        manifest = False
        uuid = URImanagement.get_uuid_from_oc_uri(id_rec['id'])
        if uuid is not False and id_and_type is not False:
            try:
                manifest = Manifest.objects.get(uuid=uuid)
            except Manifest.DoesNotExist:
                manifest = False
        if manifest is False:
            continue
        # the archived item exists in the manifest, so save its
        # stable identifier in the database
        ok_new = True
        try:
            sid = StableIdentifer()
            sid.stable_id = id_and_type['id']
            sid.stable_type = id_and_type['type']
            sid.uuid = manifest.uuid
            sid.project_uuid = manifest.project_uuid
            sid.item_type = manifest.item_type
            sid.save()
        except:
            ok_new = False
        # note when the item was last archived
        try:
            manifest.archived = self.validate_date(id_rec['archived'])
            manifest.archived_save()
        except:
            manifest.archived = time.strftime('%Y-%m-%d %H:%M:%S')
            manifest.archived_save()
        if ok_new:
            self.id_recorded += 1
    return self.id_recorded
def dereference(self, identifier, link_entity_slug=False):
    """ Dereferences an entity identified by an identifier, checks if a URI,
        if, not a URI, then looks in the OC manifest for the item

    Populates entity attributes (uri, slug, label, item_type, ...) from
    either the LinkEntity table (for URIs / link-entity slugs) or the
    Manifest table (for uuids / manifest slugs).
    Returns True when something was found, False otherwise.
    """
    output = False
    try_manifest = True
    # expand namespace prefixes (e.g. 'skos:...') to full URIs first
    identifier = URImanagement.convert_prefix_to_full_uri(identifier)
    if (link_entity_slug or (len(identifier) > 8)):
        if (link_entity_slug
            or (identifier[:7] == 'http://'
                or identifier[:8] == 'https://')):
            # looks like a URI (or the caller says it is a link
            # entity slug), so try the LinkEntity table first
            try:
                try_manifest = False
                ld_entity = LinkEntity.objects.get(
                    Q(uri=identifier) | Q(slug=identifier))
            except LinkEntity.DoesNotExist:
                ld_entity = False
            if (ld_entity is not False):
                output = True
                self.uri = ld_entity.uri
                self.slug = ld_entity.slug
                self.label = ld_entity.label
                self.item_type = 'uri'
                self.alt_label = ld_entity.alt_label
                self.entity_type = ld_entity.ent_type
                self.vocab_uri = ld_entity.vocab_uri
                self.ld_object_ok = True
                # look up the label of the entity's vocabulary
                try:
                    vocab_entity = LinkEntity.objects.get(
                        uri=self.vocab_uri)
                except LinkEntity.DoesNotExist:
                    vocab_entity = False
                if (vocab_entity is not False):
                    self.vocabulary = vocab_entity.label
                if self.get_icon:
                    # icons can be annotated on the full uri, the raw
                    # identifier, or the prefixed form of the uri
                    prefix_uri = URImanagement.prefix_common_uri(
                        ld_entity.uri)
                    icon_anno = LinkAnnotation.objects\
                        .filter(Q(subject=ld_entity.uri)
                                | Q(subject=identifier)
                                | Q(subject=prefix_uri),
                                predicate_uri='oc-gen:hasIcon')[:1]
                    if len(icon_anno) > 0:
                        self.icon = icon_anno[0].object_uri
            else:
                try_manifest = True
                # couldn't find the item in the linked entities table
                identifier = URImanagement.get_uuid_from_oc_uri(identifier)
    if (try_manifest):
        # fall back to the Open Context manifest (uuid or slug match)
        try:
            manifest_item = Manifest.objects.get(
                Q(uuid=identifier) | Q(slug=identifier))
        except Manifest.DoesNotExist:
            manifest_item = False
        if (manifest_item is not False):
            output = True
            self.uri = URImanagement.make_oc_uri(manifest_item.uuid,
                                                 manifest_item.item_type)
            self.uuid = manifest_item.uuid
            self.slug = manifest_item.slug
            self.label = manifest_item.label
            self.item_type = manifest_item.item_type
            self.class_uri = manifest_item.class_uri
            self.project_uuid = manifest_item.project_uuid
            if (manifest_item.item_type == 'media'
                    and self.get_thumbnail):
                # a media item. get information about its thumbnail.
                try:
                    thumb_obj = Mediafile.objects.get(
                        uuid=manifest_item.uuid,
                        file_type='oc-gen:thumbnail')
                except Mediafile.DoesNotExist:
                    thumb_obj = False
                if thumb_obj is not False:
                    self.thumbnail_media = thumb_obj
                    self.thumbnail_uri = thumb_obj.file_uri
            elif (manifest_item.item_type == 'types'):
                # types get their descriptive content attached
                tl = TypeLookup()
                tl.get_octype_without_manifest(identifier)
                self.content = tl.content
            elif (manifest_item.item_type == 'predicates'):
                # predicates carry a data_type
                try:
                    oc_pred = Predicate.objects.get(
                        uuid=manifest_item.uuid)
                except Predicate.DoesNotExist:
                    oc_pred = False
                if (oc_pred is not False):
                    self.data_type = oc_pred.data_type
            elif (manifest_item.item_type == 'subjects'
                    and self.get_context):
                # subjects optionally carry their spatial context path
                try:
                    subj = Subject.objects.get(uuid=manifest_item.uuid)
                except Subject.DoesNotExist:
                    subj = False
                if subj is not False:
                    self.context = subj.context
    return output
def _process_predicate_values(self, predicate_slug, predicate_type):
    """Index the values of one predicate into solr fields.

    :param str predicate_slug: slug of the predicate being indexed
    :param str predicate_type: the predicate's data type ('@id',
        'xsd:integer', 'xsd:double', 'xsd:boolean', 'xsd:date',
        or 'xsd:string')
    :raises Exception: for an unrecognized predicate_type
    """
    # First generate the solr field name
    solr_field_name = self._convert_slug_to_solr(
        predicate_slug
        + self._get_predicate_type_string(predicate_type,
                                          prefix="___pred_")
    )
    # Then get the predicate values
    if solr_field_name not in self.fields:
        self.fields[solr_field_name] = []
    if self.oc_item.item_type == "media" \
            or self.oc_item.item_type == "documents":
        # we want to make joins easier for these types of items
        make_join_ids = True
    else:
        make_join_ids = False
    predicate_key = "oc-pred:" + predicate_slug
    for obs_list in self.oc_item.json_ld["oc-gen:has-obs"]:
        if predicate_key in obs_list:
            predicate_values = obs_list[predicate_key]
            for value in predicate_values:
                if predicate_type == "@id":
                    if make_join_ids and "subjects" in value["id"]:
                        # case where we want to make a join field to link
                        # associated subjects items with media or document
                        # items allows join relationships between
                        # 'join___pred_id' and 'uuid' solr fields.
                        if "join___pred_id" not in self.fields:
                            self.fields["join___pred_id"] = []
                        # get subjects UUID from the URI
                        sub_uuid = URImanagement.get_uuid_from_oc_uri(
                            value["id"])
                        # append to the solr field for joins
                        self.fields["join___pred_id"].append(sub_uuid)
                    if predicate_slug != "link":
                        # walk up the value's hierarchy so every parent
                        # is indexed, narrowing the field name each level
                        active_solr_field = solr_field_name
                        parents = LinkRecursion()\
                            .get_jsonldish_entity_parents(value["id"])
                        all_obj_solr_field = "obj_all___" + active_solr_field
                        if all_obj_solr_field not in self.fields:
                            self.fields[all_obj_solr_field] = []
                        for parent in parents:
                            if active_solr_field not in self.fields:
                                self.fields[active_solr_field] = []
                            active_solr_value = self._concat_solr_string_value(
                                parent["slug"],
                                self._get_predicate_type_string(
                                    parent["type"]),
                                parent["id"],
                                parent["label"],
                            )
                            self.fields["text"] += " " + parent["label"] + " "
                            self.fields[active_solr_field].append(
                                active_solr_value)
                            # so all items in the hiearchy are present in the
                            # and can be queried, even if you don't know the parent
                            self.fields[all_obj_solr_field].append(
                                active_solr_value)
                            active_solr_field = self._convert_slug_to_solr(
                                parent["slug"]) + "___" + solr_field_name
                    else:
                        # case of a linking relation, don't bother looking
                        # up hierarchies or recording as a solr field, but
                        # check for image, other media, and document counts
                        if "media" in value["id"] \
                                and "image" in value["type"]:
                            self.fields["image_media_count"] += 1
                        elif "media" in value["id"] \
                                and "image" not in value["type"]:
                            # other types of media
                            self.fields["other_binary_media_count"] += 1
                        elif "documents" in value["id"]:
                            self.fields["document_count"] += 1
                        self.fields["text"] += value["label"] + " "
                elif predicate_type in ["xsd:integer",
                                        "xsd:double",
                                        "xsd:boolean"]:
                    self.fields[solr_field_name].append(value)
                elif predicate_type == "xsd:date":
                    # solr dates need a full timestamp
                    self.fields[solr_field_name].append(value + "T00:00:00Z")
                elif predicate_type == "xsd:string":
                    self.fields["text"] += value["xsd:string"] + " \n"
                    self.fields[solr_field_name].append(value["xsd:string"])
                else:
                    raise Exception("Error: Could not get predicate value")
            self.fields["text"] += " \n"
def get_identifier_query_dict(raw_identifier):
    """Make a query dict for identifiers

    :param str raw_identifier: raw client-supplied identifier (possibly
        an OR-delimited list of identifiers)
    :return: dict with an 'fq' list of solr filter queries, or None
        when raw_identifier is empty
    """
    if not raw_identifier:
        return None
    query_dict = {'fq': []}
    fq_terms = []
    values_list = utilities.infer_multiple_or_hierarchy_paths(
        raw_identifier,
        or_delim=configs.REQUEST_OR_OPERATOR,
        hierarchy_delim=None)
    id_list = []
    for value in values_list:
        if not value:
            continue
        id_list += utilities.make_uri_equivalence_list(value)
    for act_id in id_list:
        # The act_id maybe a persistent URI, escape it and
        # query the persistent_uri string.
        escape_id = utilities.escape_solr_arg(act_id)
        fq_terms.append('persistent_uri:{}'.format(escape_id))
        if ':' in act_id:
            # Skip below, because the act_id has a
            # character that's not in uuids or slugs.
            continue
        # The act_id maybe a UUID.
        fq_terms.append('uuid:{}'.format(act_id))
        # The act_id maybe a slug, so do a prefix query
        # for document slug_type_uri_label.
        fq_terms.append('slug_type_uri_label:{}'.format(
            utilities.fq_slug_value_format(act_id)))
    # Now make URIs in case we have a naked identifier
    prefix_removes = [
        'doi:',
        'orcid:',
        'http://dx.doi.org/',
        'https://dx.doi.org/',
        'http://doi.org/',
        'https://doi.org/'
    ]
    for value in values_list:
        if not value:
            continue
        # BUG FIX: strip prefixes cumulatively. The old code assigned
        # `identifier = re_gone.sub('', value)` on every pass, so only
        # the LAST prefix in prefix_removes was ever removed from the
        # final identifier (e.g. 'doi:10.1234' stayed unchanged).
        identifier = value
        for prefix in prefix_removes:
            # strip ID prefixes, case insensitive
            re_gone = re.compile(re.escape(prefix), re.IGNORECASE)
            identifier = re_gone.sub('', identifier)
        if (identifier.startswith('http://')
                or identifier.startswith('https://')):
            # still a URI after stripping; not a naked identifier
            continue
        # Only loop through URI templaces for N2T if
        # we have an ARK identifier.
        if identifier.startswith('ark:'):
            uri_templates = configs.N2T_URI_TEMPLATES
        else:
            uri_templates = configs.PERSISTENT_URI_TEMPLATES
        for uri_template in uri_templates:
            escaped_uri = utilities.escape_solr_arg(
                uri_template.format(id=identifier))
            fq_term = 'persistent_uri:{}'.format(escaped_uri)
            if fq_term in fq_terms:
                # We already have this, so skip.
                continue
            fq_terms.append(fq_term)
        # Now see if there's a UUID in the identifier.
        oc_check = URImanagement.get_uuid_from_oc_uri(value, True)
        if oc_check:
            # We have an identifier we can interperate as an
            # Open Context URI. So extract the uuid part.
            fq_term = 'uuid:{}'.format(oc_check['uuid'])
            if fq_term not in fq_terms:
                fq_terms.append(fq_term)
    # Join the various identifier queries as OR terms.
    query_dict['fq'].append(
        utilities.join_solr_query_terms(fq_terms, operator='OR'))
    return query_dict
def make_datacite_metadata_xml(self, parent_node, json_ld):
    """ makes metadata for the datacite specification, which is
        also used in the oai_datacite specification

    Builds a DataCite resource element under parent_node from the
    item's JSON-LD: identifier (DOI > ARK > URL), titles, creators,
    contributors, dates, publisher, resourceType, formats, subjects,
    and RelatedIdentifier elements.
    """
    tcheck = URImanagement.get_uuid_from_oc_uri(json_ld['id'], True)
    if tcheck is False:
        item_type = False
    else:
        item_type = tcheck['item_type']
    # NOTE(review): 'resoure' looks like a typo for the DataCite root
    # element 'resource' — confirm against consumers before changing.
    resource_xml = etree.SubElement(parent_node,
                                    'resoure',
                                    nsmap={None: self.DATACITE_RESOURCE['ns']},
                                    attrib={'{' + self.XSI_NS + '}schemaLocation': self.DATACITE_RESOURCE['schemaLocation']})
    identifiers = {}
    if 'id' in json_ld:
        identifiers['URL'] = json_ld['id']
    if 'owl:sameAs' in json_ld:
        if isinstance(json_ld['owl:sameAs'], list):
            # collect stable identifiers (DOI / ARK) from sameAs links
            for ld_item in json_ld['owl:sameAs']:
                if 'doi' in ld_item['id']:
                    identifiers['DOI'] = ld_item['id'].replace('http://dx.doi.org/', '')
                if 'ark' in ld_item['id']:
                    identifiers['ARK'] = ld_item['id'].replace('http://n2t.net/', '')
    # prefer DOI, then ARK, then plain URL as the primary identifier
    if 'DOI' in identifiers:
        act_xml = etree.SubElement(resource_xml,
                                   'identifier',
                                   attrib={'identifierType': 'DOI'})
        act_xml.text = identifiers['DOI']
    elif 'ARK' in identifiers:
        act_xml = etree.SubElement(resource_xml,
                                   'identifier',
                                   attrib={'identifierType': 'ARK'})
        act_xml.text = identifiers['ARK']
    elif 'URL' in identifiers:
        act_xml = etree.SubElement(resource_xml,
                                   'identifier',
                                   attrib={'identifierType': 'URL'})
        act_xml.text = identifiers['URL']
    act_node = etree.SubElement(resource_xml, 'titles')
    dc_title = None
    if 'dc-terms:title' in json_ld:
        act_xml = etree.SubElement(act_node, 'title')
        act_xml.text = json_ld['dc-terms:title']
        dc_title = json_ld['dc-terms:title']
    if 'label' in json_ld:
        # only add the label as a second title when it differs
        if dc_title != json_ld['label']:
            act_xml = etree.SubElement(act_node, 'title')
            act_xml.text = json_ld['label']
    if 'dc-terms:creator' in json_ld:
        if isinstance(json_ld['dc-terms:creator'], list):
            act_node = etree.SubElement(resource_xml, 'creators')
            for ld_item in json_ld['dc-terms:creator']:
                act_xml = etree.SubElement(act_node, 'creator')
                if 'label' in ld_item:
                    act_xml.text = ld_item['label']
    if 'dc-terms:contributor' in json_ld:
        if isinstance(json_ld['dc-terms:contributor'], list):
            act_node = etree.SubElement(resource_xml, 'contributors')
            for ld_item in json_ld['dc-terms:contributor']:
                act_xml = etree.SubElement(act_node, 'contributor')
                if 'label' in ld_item:
                    act_xml.text = ld_item['label']
    act_node = etree.SubElement(resource_xml, 'dates')
    # default the issue date to today when not in the JSON-LD
    create_date = time.strftime('%Y-%m-%d')
    if 'dc-terms:issued' in json_ld:
        create_date = json_ld['dc-terms:issued']
    date_xml = etree.SubElement(act_node,
                                'date',
                                attrib={'dateType': 'Available'})
    date_xml.text = create_date
    if 'dc-terms:modified' in json_ld:
        mod_date = json_ld['dc-terms:modified']
        date_xml = etree.SubElement(act_node,
                                    'date',
                                    attrib={'dateType': 'Updated'})
        date_xml.text = mod_date
    act_node = etree.SubElement(resource_xml, 'publisher')
    act_node.text = self.publisher_name
    act_node = etree.SubElement(resource_xml, 'publicationYear')
    act_node.text = create_date[:4]  # the year, first 4 characters
    # now add the Datacite resource type
    if item_type in self.DATACITE_RESOURCE_TYPES:
        act_rt = self.DATACITE_RESOURCE_TYPES[item_type]
    else:
        act_rt = self.DATACITE_RESOURCE_TYPES['other']
    rt_xml = etree.SubElement(resource_xml,
                              'resourceType',
                              attrib={'resourceTypeGeneral': act_rt['ResourceTypeGeneral']})
    rt_xml.text = act_rt['oc']
    # now add relevant mime-types
    if item_type in self.DC_FORMATS:
        # NOTE(review): this aliases the class-level list; the append
        # below mutates self.DC_FORMATS[item_type] across calls —
        # probably should copy the list first.
        format_list = self.DC_FORMATS[item_type]
        if item_type == 'media':
            if 'oc-gen:has-files' in json_ld:
                if isinstance(json_ld['oc-gen:has-files'], list):
                    for act_f in json_ld['oc-gen:has-files']:
                        if 'type' in act_f and 'dc-terms:hasFormat' in act_f:
                            if act_f['type'] == 'oc-gen:fullfile':
                                mime_uri = act_f['dc-terms:hasFormat']
                                format_list.append(mime_uri.replace('http://purl.org/NET/mediatypes/', ''))
    else:
        format_list = self.DC_FORMATS['other']
    act_node = etree.SubElement(resource_xml, 'formats')
    for mime in format_list:
        act_xml = etree.SubElement(act_node, 'format')
        act_xml.text = mime
    subjects_list = []
    if 'category' in json_ld:
        cat = json_ld['category'][0]
        cat_label = self.get_category_label(cat, json_ld)
        if cat_label is not False:
            subjects_list.append(cat_label)
    if 'dc-terms:subject' in json_ld:
        if isinstance(json_ld['dc-terms:subject'], list):
            for subj in json_ld['dc-terms:subject']:
                if 'label' in subj:
                    subjects_list.append(subj['label'])
    if len(subjects_list) > 0:
        act_node = etree.SubElement(resource_xml, 'subjects')
        for subject in subjects_list:
            act_xml = etree.SubElement(act_node, 'subject')
            act_xml.text = subject
    if 'dc-terms:isPartOf' in json_ld:
        if isinstance(json_ld['dc-terms:isPartOf'], list):
            for rel in json_ld['dc-terms:isPartOf']:
                if 'id' in rel:
                    related = rel['id']
                    act_xml = etree.SubElement(resource_xml,
                                               'RelatedIdentifier',
                                               attrib={'relatedIdentifierType': 'URL',
                                                       'relationType': 'IsPartOf'})
                    act_xml.text = related
def process_equivalent_linked_data(self):
    """ Types are useful for entity reconciliation
        this checks for linked data associated with a type

    Indexes equivalent linked-data entities as skos:closeMatch solr
    fields, and skos:related entities as skos:related fields. Related
    oc-pred entities are indexed via their predicate uuid.
    """
    for equiv_uri in self.LD_EQUIVALENT_PREDICATES:
        if equiv_uri in self.oc_item.json_ld \
           and 'foaf' not in equiv_uri:
            # for now, default to a close match
            fname = 'skos_closematch___pred_id'
            allname = 'obj_all___skos_closematch___pred_id'
            if fname not in self.fields:
                self.fields[fname] = []
            if self.ROOT_LINK_DATA_SOLR not in self.fields:
                self.fields[self.ROOT_LINK_DATA_SOLR] = []
            # register the closeMatch predicate itself at the root
            item = self._concat_solr_string_value(
                'skos-closematch',
                'id',
                'http://www.w3.org/2004/02/skos/core#closeMatch',
                'Close Match')
            self.fields[self.ROOT_LINK_DATA_SOLR].append(item)
            if allname not in self.fields:
                self.fields[allname] = []
            for entity in self.oc_item.json_ld[equiv_uri]:
                # only index web URIs
                if 'http://' in entity['id'] \
                   or 'https://' in entity['id']:
                    self.fields['text'] += entity['label'] + '\n'
                    self.fields['text'] += entity['id'] + '\n'
                    item = self._concat_solr_string_value(
                        entity['slug'],
                        'id',
                        entity['id'],
                        entity['label'])
                    self.fields[fname].append(item)
                    self.fields[allname].append(item)
                    self.process_object_uri(entity['id'])
    if 'skos:related' in self.oc_item.json_ld:
        fname = 'skos_related___pred_id'
        allname = 'obj_all___skos_related___pred_id'
        if fname not in self.fields:
            self.fields[fname] = []
        if self.ROOT_LINK_DATA_SOLR not in self.fields:
            self.fields[self.ROOT_LINK_DATA_SOLR] = []
        # register the related predicate itself at the root
        item = self._concat_solr_string_value(
            'skos-related',
            'id',
            'http://www.w3.org/2004/02/skos/core#related',
            'Related')
        self.fields[self.ROOT_LINK_DATA_SOLR].append(item)
        if allname not in self.fields:
            self.fields[allname] = []
        for entity in self.oc_item.json_ld['skos:related']:
            if 'http://' in entity['id'] \
               or 'https://' in entity['id']:
                # a web URI related entity
                self.fields['text'] += entity['label'] + '\n'
                self.fields['text'] += entity['id'] + '\n'
                item = self._concat_solr_string_value(
                    entity['slug'],
                    'id',
                    entity['id'],
                    entity['label'])
                self.fields[fname].append(item)
                self.fields[allname].append(item)
                self.process_object_uri(entity['id'])
            elif 'oc-pred:' in entity['id'] \
                    and 'owl:sameAs' in entity:
                # a related Open Context predicate; index it by the
                # uuid extracted from its sameAs URI
                pred_uuid = URImanagement.get_uuid_from_oc_uri(
                    entity['owl:sameAs']
                )
                self.fields['text'] += entity['label'] + '\n'
                self.fields['text'] += entity['id'] + '\n'
                item = self._concat_solr_string_value(
                    entity['slug'],
                    'id',
                    '/predicates/' + pred_uuid,
                    entity['label'])
                self.fields[fname].append(item)
                self.fields[allname].append(item)
def make_datacite_metadata_xml(self, parent_node, json_ld):
    """ makes metadata for the datacite specification, which is
        also used in the oai_datacite specification

    Builds a DataCite resource element under parent_node from the
    item's JSON-LD: identifier (DOI > ARK > URL), titles, creators,
    contributors, dates, publisher, resourceType, formats, subjects,
    and RelatedIdentifier elements.
    """
    tcheck = URImanagement.get_uuid_from_oc_uri(json_ld['id'], True)
    if tcheck is False:
        item_type = False
    else:
        item_type = tcheck['item_type']
    # NOTE(review): 'resoure' looks like a typo for the DataCite root
    # element 'resource' — confirm against consumers before changing.
    resource_xml = etree.SubElement(
        parent_node,
        'resoure',
        nsmap={None: self.DATACITE_RESOURCE['ns']},
        attrib={
            '{' + self.XSI_NS + '}schemaLocation':
            self.DATACITE_RESOURCE['schemaLocation']
        })
    identifiers = {}
    if 'id' in json_ld:
        identifiers['URL'] = json_ld['id']
    if 'owl:sameAs' in json_ld:
        if isinstance(json_ld['owl:sameAs'], list):
            # collect stable identifiers (DOI / ARK) from sameAs links
            for ld_item in json_ld['owl:sameAs']:
                if 'doi' in ld_item['id']:
                    identifiers['DOI'] = ld_item['id'].replace(
                        'http://dx.doi.org/', '')
                if 'ark' in ld_item['id']:
                    identifiers['ARK'] = ld_item['id'].replace(
                        'http://n2t.net/', '')
    # prefer DOI, then ARK, then plain URL as the primary identifier
    if 'DOI' in identifiers:
        act_xml = etree.SubElement(resource_xml,
                                   'identifier',
                                   attrib={'identifierType': 'DOI'})
        act_xml.text = identifiers['DOI']
    elif 'ARK' in identifiers:
        act_xml = etree.SubElement(resource_xml,
                                   'identifier',
                                   attrib={'identifierType': 'ARK'})
        act_xml.text = identifiers['ARK']
    elif 'URL' in identifiers:
        act_xml = etree.SubElement(resource_xml,
                                   'identifier',
                                   attrib={'identifierType': 'URL'})
        act_xml.text = identifiers['URL']
    act_node = etree.SubElement(resource_xml, 'titles')
    dc_title = None
    if 'dc-terms:title' in json_ld:
        act_xml = etree.SubElement(act_node, 'title')
        act_xml.text = json_ld['dc-terms:title']
        dc_title = json_ld['dc-terms:title']
    if 'label' in json_ld:
        # only add the label as a second title when it differs
        if dc_title != json_ld['label']:
            act_xml = etree.SubElement(act_node, 'title')
            act_xml.text = json_ld['label']
    if 'dc-terms:creator' in json_ld:
        if isinstance(json_ld['dc-terms:creator'], list):
            act_node = etree.SubElement(resource_xml, 'creators')
            for ld_item in json_ld['dc-terms:creator']:
                act_xml = etree.SubElement(act_node, 'creator')
                if 'label' in ld_item:
                    act_xml.text = ld_item['label']
    if 'dc-terms:contributor' in json_ld:
        if isinstance(json_ld['dc-terms:contributor'], list):
            act_node = etree.SubElement(resource_xml, 'contributors')
            for ld_item in json_ld['dc-terms:contributor']:
                act_xml = etree.SubElement(act_node, 'contributor')
                if 'label' in ld_item:
                    act_xml.text = ld_item['label']
    act_node = etree.SubElement(resource_xml, 'dates')
    # default the issue date to today when not in the JSON-LD
    create_date = time.strftime('%Y-%m-%d')
    if 'dc-terms:issued' in json_ld:
        create_date = json_ld['dc-terms:issued']
    date_xml = etree.SubElement(act_node,
                                'date',
                                attrib={'dateType': 'Available'})
    date_xml.text = create_date
    if 'dc-terms:modified' in json_ld:
        mod_date = json_ld['dc-terms:modified']
        date_xml = etree.SubElement(act_node,
                                    'date',
                                    attrib={'dateType': 'Updated'})
        date_xml.text = mod_date
    act_node = etree.SubElement(resource_xml, 'publisher')
    act_node.text = self.publisher_name
    act_node = etree.SubElement(resource_xml, 'publicationYear')
    act_node.text = create_date[:4]  # the year, first 4 characters
    # now add the Datacite resource type
    if item_type in self.DATACITE_RESOURCE_TYPES:
        act_rt = self.DATACITE_RESOURCE_TYPES[item_type]
    else:
        act_rt = self.DATACITE_RESOURCE_TYPES['other']
    rt_xml = etree.SubElement(
        resource_xml,
        'resourceType',
        attrib={'resourceTypeGeneral': act_rt['ResourceTypeGeneral']})
    rt_xml.text = act_rt['oc']
    # now add relevant mime-types
    if item_type in self.DC_FORMATS:
        # NOTE(review): this aliases the class-level list; the append
        # below mutates self.DC_FORMATS[item_type] across calls —
        # probably should copy the list first.
        format_list = self.DC_FORMATS[item_type]
        if item_type == 'media':
            if 'oc-gen:has-files' in json_ld:
                if isinstance(json_ld['oc-gen:has-files'], list):
                    for act_f in json_ld['oc-gen:has-files']:
                        if 'type' in act_f and 'dc-terms:hasFormat' in act_f:
                            if act_f['type'] == 'oc-gen:fullfile':
                                mime_uri = act_f['dc-terms:hasFormat']
                                format_list.append(
                                    mime_uri.replace(
                                        'http://purl.org/NET/mediatypes/',
                                        ''))
    else:
        format_list = self.DC_FORMATS['other']
    act_node = etree.SubElement(resource_xml, 'formats')
    for mime in format_list:
        act_xml = etree.SubElement(act_node, 'format')
        act_xml.text = mime
    subjects_list = []
    if 'category' in json_ld:
        cat = json_ld['category'][0]
        cat_label = self.get_category_label(cat, json_ld)
        if cat_label is not False:
            subjects_list.append(cat_label)
    if 'dc-terms:subject' in json_ld:
        if isinstance(json_ld['dc-terms:subject'], list):
            for subj in json_ld['dc-terms:subject']:
                if 'label' in subj:
                    subjects_list.append(subj['label'])
    if len(subjects_list) > 0:
        act_node = etree.SubElement(resource_xml, 'subjects')
        for subject in subjects_list:
            act_xml = etree.SubElement(act_node, 'subject')
            act_xml.text = subject
    if 'dc-terms:isPartOf' in json_ld:
        if isinstance(json_ld['dc-terms:isPartOf'], list):
            for rel in json_ld['dc-terms:isPartOf']:
                if 'id' in rel:
                    related = rel['id']
                    act_xml = etree.SubElement(
                        resource_xml,
                        'RelatedIdentifier',
                        attrib={
                            'relatedIdentifierType': 'URL',
                            'relationType': 'IsPartOf'
                        })
                    act_xml.text = related
def make_dc_metadata_xml(self, parent_node, json_ld):
    """ makes metadata in the dublin core format

    Builds an oai_dc:dc element under parent_node from the item's
    JSON-LD: title, date, creators, contributors, identifiers, type,
    publisher, formats and subjects.
    """
    act_format = self.get_metadata_format_attributes('oai_dc')
    if act_format is not False:
        tcheck = URImanagement.get_uuid_from_oc_uri(json_ld['id'], True)
        if tcheck is False:
            item_type = False
        else:
            item_type = tcheck['item_type']
        dc = 'http://purl.org/dc/elements/1.1/'
        ns = {'dc': dc, 'oai_dc': act_format['ns'], 'xsi': self.XSI_NS}
        format_xml = etree.SubElement(
            parent_node,
            '{' + act_format['ns'] + '}dc',
            nsmap=ns,
            attrib={'{' + self.XSI_NS + '}schemaLocation': act_format['schemaLocation']})
        title_xml = etree.SubElement(format_xml, '{' + dc + '}title')
        if 'dc-terms:title' in json_ld:
            title_xml.text = json_ld['dc-terms:title']
        elif 'label' in json_ld:
            title_xml.text = json_ld['label']
        if 'dc-terms:issued' in json_ld:
            dt_date = json_ld['dc-terms:issued']
            date_xml = etree.SubElement(format_xml, '{' + dc + '}date')
            date_xml.text = dt_date
        if 'dc-terms:creator' in json_ld:
            if isinstance(json_ld['dc-terms:creator'], list):
                for ld_item in json_ld['dc-terms:creator']:
                    act_xml = etree.SubElement(format_xml, '{' + dc + '}creator')
                    if 'label' in ld_item:
                        act_xml.text = ld_item['label']
        if 'dc-terms:contributor' in json_ld:
            if isinstance(json_ld['dc-terms:contributor'], list):
                for ld_item in json_ld['dc-terms:contributor']:
                    act_xml = etree.SubElement(format_xml, '{' + dc + '}contributor')
                    if 'label' in ld_item:
                        act_xml.text = ld_item['label']
        if 'owl:sameAs' in json_ld:
            if isinstance(json_ld['owl:sameAs'], list):
                for ld_item in json_ld['owl:sameAs']:
                    act_xml = etree.SubElement(format_xml, '{' + dc + '}identifier')
                    act_xml.text = ld_item['id']
        if 'id' in json_ld:
            act_xml = etree.SubElement(format_xml, '{' + dc + '}identifier')
            act_xml.text = json_ld['id']
        if item_type in self.DATACITE_RESOURCE_TYPES:
            act_rt = self.DATACITE_RESOURCE_TYPES[item_type]
        else:
            act_rt = self.DATACITE_RESOURCE_TYPES['other']
        rt_xml = etree.SubElement(format_xml, '{' + dc + '}type')
        rt_xml.text = act_rt['ResourceTypeGeneral']
        publisher = etree.SubElement(format_xml, '{' + dc + '}publisher')
        publisher.text = self.publisher_name
        if item_type in self.DC_FORMATS:
            # BUG FIX: copy the class-level list. The old code aliased
            # self.DC_FORMATS[item_type] and then appended to it, so
            # media mime-types accumulated in the shared class constant
            # across calls.
            format_list = list(self.DC_FORMATS[item_type])
            if item_type == 'media':
                if 'oc-gen:has-files' in json_ld:
                    if isinstance(json_ld['oc-gen:has-files'], list):
                        for act_f in json_ld['oc-gen:has-files']:
                            if 'type' in act_f and 'dc-terms:hasFormat' in act_f:
                                if act_f['type'] == 'oc-gen:fullfile':
                                    mime_uri = act_f['dc-terms:hasFormat']
                                    format_list.append(mime_uri.replace('http://purl.org/NET/mediatypes/', ''))
        else:
            format_list = self.DC_FORMATS['other']
        for mime in format_list:
            act_xml = etree.SubElement(format_xml, '{' + dc + '}format')
            act_xml.text = mime
        subjects_list = []
        if 'category' in json_ld:
            cat = json_ld['category'][0]
            cat_label = self.get_category_label(cat, json_ld)
            if cat_label is not False:
                subjects_list.append(cat_label)
        if 'dc-terms:subject' in json_ld:
            if isinstance(json_ld['dc-terms:subject'], list):
                for subj in json_ld['dc-terms:subject']:
                    if 'label' in subj:
                        subjects_list.append(subj['label'])
        if len(subjects_list) > 0:
            for subject in subjects_list:
                act_xml = etree.SubElement(format_xml, '{' + dc + '}subject')
                act_xml.text = subject
def get_description_tree(self, entity_obj, depth=1, first_time=True,
                         item_type=False, class_uri=False):
    """ Gets a hierarchy (tree) of descriptive predicates and types
        below a given entity.

    :param entity_obj: dereferenced Entity object acting as the tree root
    :param depth: number of child levels to descend
    :param first_time: True on the outermost call; the root tree dict is
        then wrapped in a list, while recursive calls return the bare dict
    :param item_type: optional item_type filter (projects branch only)
    :param class_uri: optional class URI filter (projects branch only)
    :return: list with one tree dict on the first call, a single tree
        dict on recursive calls, or [] when the entity is not handled
    """
    lr = LinkRecursion()
    if entity_obj.item_type == 'projects':
        tree = self.make_containment_item(entity_obj)
        if item_type is not False and class_uri is False:
            # returns the classes associated with an item_type for a project
            tree['label'] = tree['label'] + ', ' + item_type
            tree['children'] = self.get_proj_type_classes_items(
                entity_obj.uuid, item_type)
        elif item_type is not False and class_uri is not False:
            # returns the predicates associated with an item_type and class_uri
            tree['children'] = self.get_proj_type_class_preds(
                entity_obj.uuid, item_type, class_uri, True)
        else:
            # project root, returns the item_types for the project
            tree['children'] = self.get_proj_types(entity_obj.uuid)
        if first_time:
            output = []
            output.append(tree)
        else:
            output = tree
    elif entity_obj.item_type == 'predicates':
        tree = self.make_containment_item(entity_obj)
        tree['children'] = []
        child_list = lr.get_entity_children(entity_obj.uuid, False)
        if len(child_list) > 0:
            # the predicate has child predicates; add each as a child node
            for child_uuid in child_list:
                child_ent = Entity()
                found = child_ent.dereference(child_uuid)
                if found:
                    if depth > 1:
                        child = self.get_containment_children(
                            child_ent, depth - 1, False)
                    else:
                        child = self.make_containment_item(child_ent)
                    tree['children'].append(child)
        elif entity_obj.data_type == 'id':
            # no child predicates; an id-type predicate gets its
            # top-ranked types as children instead
            top_types = lr.get_pred_top_rank_types(entity_obj.uuid)
            for top_type in top_types:
                uri = top_type['id']
                uuid = URImanagement.get_uuid_from_oc_uri(uri)
                item = False
                if depth > 1:
                    child_ent = Entity()
                    found = child_ent.dereference(uuid)
                    if found:
                        item = self.get_description_tree(
                            child_ent, depth - 1, False)
                    # NOTE(review): when dereference fails here, item
                    # stays False yet is still appended below — confirm
                    # consumers tolerate False children
                else:
                    # no more depth to descend; make a shallow item dict
                    item = LastUpdatedOrderedDict()
                    item['id'] = uuid
                    item['label'] = top_type['label']
                    item['class_uri'] = 'type'
                    item['class_label'] = 'type'
                tree['children'].append(item)
            tree['children'] = self.sort_children_by_label(
                tree['children'])
        else:
            pass
        if first_time:
            output = []
            output.append(tree)
        else:
            output = tree
    elif entity_obj.item_type == 'types':
        tree = self.make_containment_item(entity_obj)
        tree['children'] = []
        act_children = lr.get_entity_children(entity_obj.uuid, False)
        for child_uuid in act_children:
            # skip self-references so the tree cannot loop on itself
            if child_uuid != entity_obj.uuid:
                child_ent = Entity()
                found = child_ent.dereference(child_uuid)
                if found:
                    if depth > 1:
                        child = self.get_description_tree(
                            child_ent, depth - 1, False)
                    else:
                        child = self.make_containment_item(child_ent)
                    child['class_uri'] = 'type'
                    child['class_label'] = 'type'
                    tree['children'].append(child)
        if len(tree['children']) == 0:
            # childless type: drop the empty 'children' key entirely
            tree.pop('children', None)
        else:
            tree['children'] = self.sort_children_by_label(
                tree['children'])
        if first_time:
            output = []
            output.append(tree)
        else:
            output = tree
    else:
        output = []
    return output
def dereference(self, identifier, link_entity_slug=False):
    """ Dereferences an entity identified by an identifier; checks if
        the identifier is a URI, and if not a URI, looks in the OC
        manifest for the item.

    :param identifier: URI, UUID, or slug for the entity
    :param link_entity_slug: when True, skip the length / URI-prefix
        checks and try the LinkEntity table (by slug) directly
    :return: True when the entity was found (self attributes are then
        populated), False otherwise
    """
    output = False
    try_manifest = True
    identifier = URImanagement.convert_prefix_to_full_uri(identifier)
    if(link_entity_slug or (len(identifier) > 8)):
        if(link_entity_slug or (identifier[:7] == 'http://' or identifier[:8] == 'https://')):
            # looks like a full URI (or an explicit linked-entity slug);
            # try the linked entities table before the manifest
            try:
                try_manifest = False
                ld_entity = LinkEntity.objects.get(Q(uri=identifier) | Q(slug=identifier))
            except LinkEntity.DoesNotExist:
                ld_entity = False
            if(ld_entity is not False):
                output = True
                self.uri = ld_entity.uri
                self.slug = ld_entity.slug
                self.label = ld_entity.label
                self.item_type = 'uri'
                self.alt_label = ld_entity.alt_label
                self.entity_type = ld_entity.ent_type
                self.vocab_uri = ld_entity.vocab_uri
                self.ld_object_ok = True
                # look up the label of the vocabulary this entity belongs to
                try:
                    vocab_entity = LinkEntity.objects.get(uri=self.vocab_uri)
                except LinkEntity.DoesNotExist:
                    vocab_entity = False
                if(vocab_entity is not False):
                    self.vocabulary = vocab_entity.label
                if self.get_icon:
                    # also fetch an icon annotation, if one exists for
                    # the full URI, the raw identifier, or its prefixed form
                    prefix_uri = URImanagement.prefix_common_uri(ld_entity.uri)
                    icon_anno = LinkAnnotation.objects\
                                              .filter(Q(subject=ld_entity.uri)
                                                      | Q(subject=identifier)
                                                      | Q(subject=prefix_uri),
                                                      predicate_uri='oc-gen:hasIcon')[:1]
                    if len(icon_anno) > 0:
                        self.icon = icon_anno[0].object_uri
            else:
                try_manifest = True
                # couldn't find the item in the linked entities table
                identifier = URImanagement.get_uuid_from_oc_uri(identifier)
    if(try_manifest):
        try:
            manifest_item = Manifest.objects.get(Q(uuid=identifier) | Q(slug=identifier))
        except Manifest.DoesNotExist:
            manifest_item = False
        if(manifest_item is not False):
            output = True
            self.uri = URImanagement.make_oc_uri(manifest_item.uuid, manifest_item.item_type)
            self.uuid = manifest_item.uuid
            self.slug = manifest_item.slug
            self.label = manifest_item.label
            self.item_type = manifest_item.item_type
            self.class_uri = manifest_item.class_uri
            self.project_uuid = manifest_item.project_uuid
            if(manifest_item.item_type == 'media' and self.get_thumbnail):
                # a media item. get information about its thumbnail.
                try:
                    thumb_obj = Mediafile.objects.get(uuid=manifest_item.uuid, file_type='oc-gen:thumbnail')
                except Mediafile.DoesNotExist:
                    thumb_obj = False
                if thumb_obj is not False:
                    self.thumbnail_media = thumb_obj
                    self.thumbnail_uri = thumb_obj.file_uri
            elif(manifest_item.item_type == 'types'):
                # types get their full content via a helper lookup
                tl = TypeLookup()
                tl.get_octype_without_manifest(identifier)
                self.content = tl.content
            elif(manifest_item.item_type == 'predicates'):
                # predicates also carry a data_type
                try:
                    oc_pred = Predicate.objects.get(uuid=manifest_item.uuid)
                except Predicate.DoesNotExist:
                    oc_pred = False
                if(oc_pred is not False):
                    self.data_type = oc_pred.data_type
            elif(manifest_item.item_type == 'subjects' and self.get_context):
                # subjects optionally get their containment context path
                try:
                    subj = Subject.objects.get(uuid=manifest_item.uuid)
                except Subject.DoesNotExist:
                    subj = False
                if subj is not False:
                    self.context = subj.context
    return output
def _process_predicate_values(self, predicate_slug, predicate_type):
    """ Adds an item's values for one predicate to the solr document's
        fields dict, handling id, numeric, date, and string types.

    :param predicate_slug: slug of the predicate being processed
    :param predicate_type: the predicate's data type (e.g. '@id',
        'xsd:integer', 'xsd:date', 'xsd:string')
    :raises Exception: when the predicate_type is not a known type
    """
    # First generate the solr field name
    solr_field_name = self._convert_slug_to_solr(
        predicate_slug + self._get_predicate_type_string(
            predicate_type, prefix='___pred_')
    )
    # Then get the predicate values
    if solr_field_name not in self.fields:
        self.fields[solr_field_name] = []
    if self.oc_item.item_type == 'media' \
            or self.oc_item.item_type == 'documents':
        # we want to make joins easier for these types of items
        make_join_ids = True
    else:
        make_join_ids = False
    predicate_key = 'oc-pred:' + predicate_slug
    for obs_list in self.oc_item.json_ld['oc-gen:has-obs']:
        if predicate_key in obs_list:
            predicate_values = obs_list[predicate_key]
            for value in predicate_values:
                if predicate_type == '@id':
                    if make_join_ids and 'subjects' in value['id']:
                        # case where we want to make a join field to link
                        # associated subjects items with media or document
                        # items allows join relationships between
                        # 'join___pred_id' and 'uuid' solr fields.
                        if 'join___pred_id' not in self.fields:
                            self.fields['join___pred_id'] = []
                        # get subjects UUID from the URI
                        sub_uuid = URImanagement.get_uuid_from_oc_uri(
                            value['id']
                        )
                        # append to the solr field for joins
                        self.fields['join___pred_id'].append(sub_uuid)
                    if predicate_slug != 'link':
                        # walk up the hierarchy of the value so every
                        # ancestor is indexed too
                        active_solr_field = solr_field_name
                        parents = LinkRecursion(
                        ).get_jsonldish_entity_parents(
                            value['id']
                        )
                        all_obj_solr_field = 'obj_all___' + active_solr_field
                        if all_obj_solr_field not in self.fields:
                            self.fields[all_obj_solr_field] = []
                        for parent in parents:
                            if active_solr_field not in self.fields:
                                self.fields[active_solr_field] = []
                            active_solr_value = \
                                self._concat_solr_string_value(
                                    parent['slug'],
                                    self._get_predicate_type_string(
                                        parent['type']),
                                    parent['id'],
                                    parent['label']
                                )
                            self.fields['text'] += ' ' + \
                                parent['label'] + ' '
                            self.fields[active_solr_field].append(
                                active_solr_value
                            )
                            # so all items in the hierarchy are present in
                            # the obj_all field and can be queried, even if
                            # you don't know the parent
                            self.fields[all_obj_solr_field].append(
                                active_solr_value
                            )
                            # next iteration indexes under a field name
                            # scoped by this parent's slug
                            active_solr_field = self._convert_slug_to_solr(
                                parent['slug']) + '___' + solr_field_name
                    else:
                        # case of a linking relation, don't bother looking
                        # up hierarchies or recording as a solr field, but
                        # check for image, other media, and document counts
                        if 'media' in value['id'] \
                                and 'image' in value['type']:
                            self.fields['image_media_count'] += 1
                        elif 'media' in value['id'] \
                                and 'image' not in value['type']:
                            # other types of media
                            self.fields['other_binary_media_count'] += 1
                        elif 'documents' in value['id']:
                            self.fields['document_count'] += 1
                        self.fields['text'] += value['label'] + ' '
                elif predicate_type in [
                        'xsd:integer', 'xsd:double', 'xsd:boolean'
                        ]:
                    self.fields[solr_field_name].append(value)
                elif predicate_type == 'xsd:date':
                    # normalize bare dates to full solr datetime strings
                    self.fields[solr_field_name].append(value + 'T00:00:00Z')
                elif predicate_type == 'xsd:string':
                    self.fields['text'] += value['xsd:string'] + ' \n'
                    self.fields[solr_field_name].append(
                        value['xsd:string'])
                else:
                    raise Exception("Error: Could not get predicate value")
                self.fields['text'] += ' \n'
def dereference(self, identifier, link_entity_slug=False):
    """ Dereferences an entity identified by an identifier; checks if
        the identifier is a URI, and if not a URI, looks in the OC
        manifest for the item.

    :param identifier: URI, UUID, or slug string for the entity
    :param link_entity_slug: when True, skip the length / URI-prefix
        checks and try the LinkEntity table (by slug) directly
    :return: True when the entity was found (self attributes are then
        populated), False otherwise
    """
    output = False
    if isinstance(identifier, str):
        # only try to dereference if the identifier is a string.
        try_manifest = True
        identifier = URImanagement.convert_prefix_to_full_uri(identifier)
        if (settings.CANONICAL_HOST + '/tables/') in identifier:
            # table URIs carry their UUID after the /tables/ path segment
            identifier = identifier.replace((settings.CANONICAL_HOST + '/tables/'), '')
        if link_entity_slug or (len(identifier) > 8):
            if link_entity_slug or (identifier[:7] == 'http://' or identifier[:8] == 'https://'):
                # looks like a full URI (or an explicit linked-entity slug);
                # try the linked entities table (with URI variants) first
                ent_equivs = EntityEquivalents()
                uris = ent_equivs.make_uri_variants(identifier)
                ld_entities = LinkEntity.objects.filter(Q(uri__in=uris) | Q(slug=identifier))[:1]
                if len(ld_entities) > 0:
                    ld_entity = ld_entities[0]
                else:
                    ld_entity = False
                if ld_entity is not False:
                    output = True
                    self.uri = ld_entity.uri
                    self.slug = ld_entity.slug
                    self.label = ld_entity.label
                    self.item_type = 'uri'
                    self.alt_label = ld_entity.alt_label
                    self.entity_type = ld_entity.ent_type
                    self.vocab_uri = ld_entity.vocab_uri
                    self.ld_object_ok = True
                    # look up the vocabulary's label, tolerating
                    # http:// vs https:// scheme differences
                    try:
                        if 'https://' in self.vocab_uri:
                            alt_vocab_uri = self.vocab_uri.replace('https://', 'http://')
                        else:
                            alt_vocab_uri = self.vocab_uri.replace('http://', 'https://')
                        vocab_entity = LinkEntity.objects.get(Q(uri=self.vocab_uri) | Q(uri=alt_vocab_uri))
                    except LinkEntity.DoesNotExist:
                        vocab_entity = False
                    if vocab_entity is not False:
                        self.vocabulary = vocab_entity.label
                    if self.get_icon:
                        # also fetch an icon annotation, if one exists
                        prefix_uri = URImanagement.prefix_common_uri(ld_entity.uri)
                        icon_anno = LinkAnnotation.objects\
                                                  .filter(Q(subject=ld_entity.uri)
                                                          | Q(subject=identifier)
                                                          | Q(subject=prefix_uri),
                                                          predicate_uri='oc-gen:hasIcon')[:1]
                        if len(icon_anno) > 0:
                            self.icon = icon_anno[0].object_uri
                else:
                    try_manifest = True
                    # couldn't find the item in the linked entities table
                    identifier = URImanagement.get_uuid_from_oc_uri(identifier)
        if try_manifest:
            try:
                manifest_item = Manifest.objects.get(Q(uuid=identifier) | Q(slug=identifier))
            except Manifest.DoesNotExist:
                manifest_item = False
            if manifest_item is not False:
                output = True
                self.uri = URImanagement.make_oc_uri(manifest_item.uuid, manifest_item.item_type)
                self.uuid = manifest_item.uuid
                self.slug = manifest_item.slug
                self.label = manifest_item.label
                self.item_type = manifest_item.item_type
                self.class_uri = manifest_item.class_uri
                self.project_uuid = manifest_item.project_uuid
                if manifest_item.item_type == 'media' and self.get_thumbnail:
                    # a media item. get information about its thumbnail.
                    try:
                        thumb_obj = Mediafile.objects.get(uuid=manifest_item.uuid, file_type='oc-gen:thumbnail')
                    except Mediafile.DoesNotExist:
                        thumb_obj = False
                    if thumb_obj is not False:
                        self.thumbnail_media = thumb_obj
                        self.thumbnail_uri = thumb_obj.file_uri
                elif manifest_item.item_type in ['persons', 'projects', 'tables'] \
                        or self.get_stable_ids:
                    # get stable identifiers for persons or projects by default
                    # NOTE(review): because 'projects' matches this branch,
                    # the later elif item_type == 'projects' branch appears
                    # unreachable for projects items — confirm intent
                    stable_ids = StableIdentifer.objects.filter(uuid=manifest_item.uuid)
                    if len(stable_ids) > 0:
                        self.stable_id_uris = []
                        doi_uris = []
                        orcid_uris = []
                        other_uris = []
                        for stable_id in stable_ids:
                            if stable_id.stable_type in StableIdentifer.ID_TYPE_PREFIXES:
                                prefix = StableIdentifer.ID_TYPE_PREFIXES[stable_id.stable_type]
                            else:
                                prefix = ''
                            stable_uri = prefix + stable_id.stable_id
                            if stable_id.stable_type == 'orcid':
                                orcid_uris.append(stable_uri)
                            elif stable_id.stable_type == 'doi':
                                doi_uris.append(stable_uri)
                            else:
                                other_uris.append(stable_uri)
                        # now list URIs in order of importance, with ORCIDs and DOIs
                        # first, followed by other stable URI types (Arks or something else)
                        self.stable_id_uris = orcid_uris + doi_uris + other_uris
                elif manifest_item.item_type == 'types':
                    # types get their full content via a helper lookup
                    tl = TypeLookup()
                    tl.get_octype_without_manifest(identifier)
                    self.content = tl.content
                elif manifest_item.item_type == 'predicates':
                    # predicates also carry data_type, sort, and a slug URI
                    try:
                        oc_pred = Predicate.objects.get(uuid=manifest_item.uuid)
                    except Predicate.DoesNotExist:
                        oc_pred = False
                    if oc_pred is not False:
                        self.data_type = oc_pred.data_type
                        self.sort = oc_pred.sort
                        self.slug_uri = 'oc-pred:' + str(self.slug)
                elif manifest_item.item_type == 'projects':
                    # get a manifest object for the parent of a project, if it exists
                    # NOTE(review): this builds raw SQL for .extra() by string
                    # concatenation; self.uuid comes from the manifest row so
                    # exposure looks low, but confirm it cannot carry quotes
                    ch_tab = '"oc_projects" AS "child"'
                    filters = 'child.project_uuid=oc_manifest.uuid '\
                              ' AND child.uuid=\'' + self.uuid + '\' ' \
                              ' AND child.project_uuid != \'' + self.uuid + '\' '
                    par_rows = Manifest.objects\
                                       .filter(item_type='projects')\
                                       .exclude(uuid=self.uuid)\
                                       .extra(tables=[ch_tab], where=[filters])[:1]
                    if len(par_rows) > 0:
                        self.par_proj_man_obj = par_rows[0]
                elif manifest_item.item_type == 'subjects' and self.get_context:
                    # subjects optionally get their containment context path
                    try:
                        subj = Subject.objects.get(uuid=manifest_item.uuid)
                    except Subject.DoesNotExist:
                        subj = False
                    if subj is not False:
                        self.context = subj.context
    return output
def parse_json_record(self, json_rec):
    """ Populates this record's attributes from a GeoJSON feature dict.

    Reads the feature's 'properties' dict (or json_rec itself when no
    'properties' key exists) and copies the known keys onto self,
    normalizing BCE/CE years, snippet markup, and date strings.
    """
    props = json_rec['properties'] if 'properties' in json_rec else json_rec
    if not isinstance(props, dict):
        # nothing usable to parse
        return

    def split_bce_ce(year):
        # negative years are BCE: return (rounded positive int, 'BCE');
        # otherwise (rounded int, False) meaning CE with no suffix
        if year < 0:
            return int(round(year * -1, 0)), 'BCE'
        return int(round(year, 0)), False

    if 'id' in props:
        self.id = props['id'].replace('#', '')
    if 'label' in props:
        self.label = props['label']
    if 'href' in props:
        self.href = props['href']
    if 'uri' in props:
        # resolve the URI into an item_type + uuid pair
        item_type_output = URImanagement.get_uuid_from_oc_uri(props['uri'], True)
        if isinstance(item_type_output, dict):
            self.item_type = item_type_output['item_type']
            self.uuid = item_type_output['uuid']
    if 'project label' in props:
        self.project = props['project label']
    if 'context label' in props:
        self.context = props['context label']
    if 'early bce/ce' in props:
        self.early_bce_ce, self.early_suffix = split_bce_ce(props['early bce/ce'])
    if 'late bce/ce' in props:
        self.late_bce_ce, self.late_suffix = split_bce_ce(props['late bce/ce'])
    if 'item category' in props:
        self.category = props['item category']
    if 'snippet' in props:
        # protect the search-highlight <em> tags with placeholders,
        # strip all other markup, then restore them as <mark> tags
        marked = props['snippet']
        marked = marked.replace('<em>', '[[[[mark]]]]')
        marked = marked.replace('</em>', '[[[[/mark]]]]')
        marked = strip_tags(marked)
        for stray in ('</', '<', '>'):
            marked = marked.replace(stray, '')
        marked = marked.replace('[[[[mark]]]]', '<mark>')
        marked = marked.replace('[[[[/mark]]]]', '</mark>')
        self.snippet = marked
    if 'thumbnail' in props:
        self.thumbnail = props['thumbnail']
    if 'published' in props:
        self.published = QueryMaker().make_human_readable_date(props['published'])
    if 'updated' in props:
        self.updated = QueryMaker().make_human_readable_date(props['updated'])
def make_dc_metadata_xml(self, parent_node, json_ld):
    """ Makes Dublin Core (oai_dc) metadata XML for an item.

    Appends an <oai_dc:dc> element under parent_node populated with
    title, date, creator, contributor, identifier, type, publisher,
    format, and subject elements taken from the item's json_ld dict.

    :param parent_node: lxml element the oai_dc:dc element is added to
    :param json_ld: dict of the item's JSON-LD representation
    """
    act_format = self.get_metadata_format_attributes('oai_dc')
    if act_format is not False:
        tcheck = URImanagement.get_uuid_from_oc_uri(json_ld['id'], True)
        if tcheck is False:
            item_type = False
        else:
            item_type = tcheck['item_type']
        dc = 'http://purl.org/dc/elements/1.1/'
        ns = {'dc': dc,
              'oai_dc': act_format['ns'],
              'xsi': self.XSI_NS}
        format_xml = etree.SubElement(
            parent_node,
            '{' + act_format['ns'] + '}dc',
            nsmap=ns,
            attrib={
                '{' + self.XSI_NS + '}schemaLocation': act_format['schemaLocation']
            })
        title_xml = etree.SubElement(format_xml, '{' + dc + '}title')
        if 'dc-terms:title' in json_ld:
            title_xml.text = json_ld['dc-terms:title']
        elif 'label' in json_ld:
            title_xml.text = json_ld['label']
        if 'dc-terms:issued' in json_ld:
            dt_date = json_ld['dc-terms:issued']
            date_xml = etree.SubElement(format_xml, '{' + dc + '}date')
            date_xml.text = dt_date
        if 'dc-terms:creator' in json_ld:
            if isinstance(json_ld['dc-terms:creator'], list):
                for ld_item in json_ld['dc-terms:creator']:
                    act_xml = etree.SubElement(format_xml, '{' + dc + '}creator')
                    if 'label' in ld_item:
                        act_xml.text = ld_item['label']
        if 'dc-terms:contributor' in json_ld:
            if isinstance(json_ld['dc-terms:contributor'], list):
                for ld_item in json_ld['dc-terms:contributor']:
                    act_xml = etree.SubElement(format_xml, '{' + dc + '}contributor')
                    if 'label' in ld_item:
                        act_xml.text = ld_item['label']
        if 'owl:sameAs' in json_ld:
            if isinstance(json_ld['owl:sameAs'], list):
                for ld_item in json_ld['owl:sameAs']:
                    act_xml = etree.SubElement(format_xml, '{' + dc + '}identifier')
                    act_xml.text = ld_item['id']
        if 'id' in json_ld:
            act_xml = etree.SubElement(format_xml, '{' + dc + '}identifier')
            act_xml.text = json_ld['id']
        if item_type in self.DATACITE_RESOURCE_TYPES:
            act_rt = self.DATACITE_RESOURCE_TYPES[item_type]
        else:
            act_rt = self.DATACITE_RESOURCE_TYPES['other']
        rt_xml = etree.SubElement(format_xml, '{' + dc + '}type')
        rt_xml.text = act_rt['ResourceTypeGeneral']
        publisher = etree.SubElement(format_xml, '{' + dc + '}publisher')
        publisher.text = self.publisher_name
        if item_type in self.DC_FORMATS:
            # BUGFIX: copy the class-level list; the original aliased
            # self.DC_FORMATS[item_type], so the media-mime appends below
            # mutated the shared DC_FORMATS dict and formats accumulated
            # across successive calls.
            format_list = list(self.DC_FORMATS[item_type])
            if item_type == 'media':
                if 'oc-gen:has-files' in json_ld:
                    if isinstance(json_ld['oc-gen:has-files'], list):
                        for act_f in json_ld['oc-gen:has-files']:
                            if 'type' in act_f and 'dc-terms:hasFormat' in act_f:
                                if act_f['type'] == 'oc-gen:fullfile':
                                    # record the full file's mime type,
                                    # stripped of its vocabulary prefix
                                    mime_uri = act_f['dc-terms:hasFormat']
                                    format_list.append(
                                        mime_uri.replace(
                                            'http://purl.org/NET/mediatypes/',
                                            ''))
        else:
            format_list = list(self.DC_FORMATS['other'])
        for mime in format_list:
            act_xml = etree.SubElement(format_xml, '{' + dc + '}format')
            act_xml.text = mime
        subjects_list = []
        if 'category' in json_ld:
            # only the first category is used as a subject
            cat = json_ld['category'][0]
            cat_label = self.get_category_label(cat, json_ld)
            if cat_label is not False:
                subjects_list.append(cat_label)
        if 'dc-terms:subject' in json_ld:
            if isinstance(json_ld['dc-terms:subject'], list):
                for subj in json_ld['dc-terms:subject']:
                    if 'label' in subj:
                        subjects_list.append(subj['label'])
        if len(subjects_list) > 0:
            for subject in subjects_list:
                act_xml = etree.SubElement(format_xml, '{' + dc + '}subject')
                act_xml.text = subject
def get_description_tree(self, entity_obj, depth=1, first_time=True,
                         item_type=False, class_uri=False):
    """ Gets a hierarchy (tree) of descriptive predicates and types
        below a given entity.

    :param entity_obj: dereferenced Entity object acting as the tree root
    :param depth: number of child levels to descend
    :param first_time: True on the outermost call; the root tree dict is
        then wrapped in a list, while recursive calls return the bare dict
    :param item_type: optional item_type filter (projects branch only)
    :param class_uri: optional class URI filter (projects branch only)
    :return: list with one tree dict on the first call, a single tree
        dict on recursive calls, or [] when the entity is not handled
    """
    lr = LinkRecursion()
    if entity_obj.item_type == 'projects':
        tree = self.make_containment_item(entity_obj)
        if item_type is not False and class_uri is False:
            # returns the classes associated with an item_type for a project
            tree['label'] = tree['label'] + ', ' + item_type
            tree['children'] = self.get_proj_type_classes_items(entity_obj.uuid, item_type)
        elif item_type is not False and class_uri is not False:
            # returns the predicates associated with an item_type and class_uri
            tree['children'] = self.get_proj_type_class_preds(entity_obj.uuid, item_type, class_uri, True)
        else:
            # project root, returns the item_types for the project
            tree['children'] = self.get_proj_types(entity_obj.uuid)
        if first_time:
            output = []
            output.append(tree)
        else:
            output = tree
    elif entity_obj.item_type == 'predicates':
        tree = self.make_containment_item(entity_obj)
        tree['children'] = []
        child_list = lr.get_entity_children(entity_obj.uuid, False)
        if len(child_list) > 0:
            # the predicate has child predicates; add each as a child node
            for child_uuid in child_list:
                child_ent = Entity()
                found = child_ent.dereference(child_uuid)
                if found:
                    if depth > 1:
                        child = self.get_containment_children(child_ent, depth - 1, False)
                    else:
                        child = self.make_containment_item(child_ent)
                    tree['children'].append(child)
        elif entity_obj.data_type == 'id':
            # no child predicates; an id-type predicate gets its
            # top-ranked types as children instead
            top_types = lr.get_pred_top_rank_types(entity_obj.uuid)
            for top_type in top_types:
                uri = top_type['id']
                uuid = URImanagement.get_uuid_from_oc_uri(uri)
                item = False
                if depth > 1:
                    child_ent = Entity()
                    found = child_ent.dereference(uuid)
                    if found:
                        item = self.get_description_tree(child_ent, depth - 1, False)
                    # NOTE(review): when dereference fails here, item
                    # stays False yet is still appended below — confirm
                    # consumers tolerate False children
                else:
                    # no more depth to descend; make a shallow item dict
                    item = LastUpdatedOrderedDict()
                    item['id'] = uuid
                    item['label'] = top_type['label']
                    item['class_uri'] = 'type'
                    item['class_label'] = 'type'
                tree['children'].append(item)
            tree['children'] = self.sort_children_by_label(tree['children'])
        else:
            pass
        if first_time:
            output = []
            output.append(tree)
        else:
            output = tree
    elif entity_obj.item_type == 'types':
        tree = self.make_containment_item(entity_obj)
        tree['children'] = []
        act_children = lr.get_entity_children(entity_obj.uuid, False)
        for child_uuid in act_children:
            # skip self-references so the tree cannot loop on itself
            if child_uuid != entity_obj.uuid:
                child_ent = Entity()
                found = child_ent.dereference(child_uuid)
                if found:
                    if depth > 1:
                        child = self.get_description_tree(child_ent, depth - 1, False)
                    else:
                        child = self.make_containment_item(child_ent)
                    child['class_uri'] = 'type'
                    child['class_label'] = 'type'
                    tree['children'].append(child)
        if len(tree['children']) == 0:
            # childless type: drop the empty 'children' key entirely
            tree.pop('children', None)
        else:
            tree['children'] = self.sort_children_by_label(tree['children'])
        if first_time:
            output = []
            output.append(tree)
        else:
            output = tree
    else:
        output = []
    return output