def get_identifier_list_variants(self, id_list):
    """ makes different variants of identifiers for a list of identifiers """
    output_list = []
    if not isinstance(id_list, list):
        id_list = [str(id_list)]
    for identifier in id_list:
        output_list.append(identifier)
        if (identifier.startswith('http://')
                or identifier.startswith('https://')):
            oc_uuid = URImanagement.get_uuid_from_oc_uri(identifier)
            if oc_uuid:
                output_list.append(oc_uuid)
            prefix_id = URImanagement.prefix_common_uri(identifier)
            if prefix_id:
                output_list.append(prefix_id)
        elif ':' in identifier:
            full_uri = URImanagement.convert_prefix_to_full_uri(identifier)
            output_list.append(full_uri)
        else:
            # probably an Open Context uuid or a slug
            m_cache = MemoryCache()
            ent = m_cache.get_entity(identifier)
            if ent:
                full_uri = ent.uri
                output_list.append(full_uri)
                prefix_uri = URImanagement.prefix_common_uri(full_uri)
                if prefix_uri != full_uri:
                    output_list.append(prefix_uri)
    return output_list
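# Hedged usage sketch (not from the source): illustrates the kinds of variants
# get_identifier_list_variants() is meant to produce for one HTTP identifier.
# The URI pattern, uuid extraction, and 'oc-subj:' prefix below are made-up
# examples; the real behavior depends on URImanagement and MemoryCache lookups.

def sketch_identifier_variants(identifier):
    """Simplified stand-in showing uri -> uuid -> prefixed-uri expansion."""
    variants = [identifier]
    oc_prefix = 'http://opencontext.org/subjects/'  # assumed URI pattern
    if identifier.startswith(oc_prefix):
        uuid = identifier[len(oc_prefix):]           # crude uuid extraction
        variants.append(uuid)
        variants.append('oc-subj:' + uuid)           # hypothetical prefixed form
    return variants

print(sketch_identifier_variants('http://opencontext.org/subjects/ABC-123'))
# -> ['http://opencontext.org/subjects/ABC-123', 'ABC-123', 'oc-subj:ABC-123']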
def __init__(self, request_dict_json=False):
    self.uuid = False
    self.uri = False  # canonical uri for the item
    self.href = False  # link to the item in the current deployment
    self.cite_uri = False  # stable / persistent uri
    self.label = False
    self.item_type = False
    self.updated = False
    self.published = False
    self.project_href = False  # link to the project in current deployment
    self.project_uri = False  # canonical uri for the project
    self.project_label = False
    self.context_href = False  # link to parent context in current deployment
    self.context_uri = False  # link to parent context canonical uri
    self.context_label = False
    self.category = False
    self.latitude = False
    self.longitude = False
    self.geojson = False
    self.early_date = False
    self.late_date = False
    self.human_remains_flagged = False  # flagged as relating to human remains
    self.thumbnail_href = False
    self.thumbnail_uri = False
    self.thumbnail_scr = False
    self.preview_scr = False
    self.fullfile_scr = False
    self.snippet = False
    self.cite_uri = False  # stable identifier as an HTTP uri
    self.other_attributes = False  # other attributes to the record
    # flatten a list of attribute values to a single value
    self.flatten_rec_attributes = False
    # A list of (non-standard) attributes to include in a record
    self.rec_attributes = []
    self.attribute_hierarchies = {}
    self.base_url = settings.CANONICAL_HOST
    rp = RootPath()
    self.base_url = rp.get_baseurl()
    self.m_cache = MemoryCache()  # memory caching object
    # supplemental caching object, specific for searching
    self.s_cache = SearchGenerationCache()
    self.request_dict_json = request_dict_json
    if request_dict_json is not False:
        self.request_dict = json.loads(request_dict_json)
    else:
        self.request_dict = False
    self.add_attribute_uris = False
    if self.request_dict and self.request_dict.get('add-attribute-uris'):
        self.add_attribute_uris = True
    self.highlighting = False
    self.recursive_count = 0
    self.min_date = False
    self.max_date = False
    self.thumbnail_data = {}
    self.media_file_data = {}
    self.string_attrib_data = {}
def add_entity_item_to_act_filter(
    self,
    lookup_val,
    act_filter,
    is_spatial_context=False,
    look_up_mapping_dict=None,
):
    """Looks up an entity item to add to an act_filter"""
    lookup_val = str(lookup_val)
    if lookup_val.startswith(configs.RELATED_ENTITY_ID_PREFIX):
        # Strip off the related property prefix. Note that this
        # is a related property.
        lookup_val = lookup_val[len(configs.RELATED_ENTITY_ID_PREFIX):]
        act_filter['oc-api:related-property'] = True
    # Map the lookup_val with the mapping dict, if one is given.
    if look_up_mapping_dict:
        lookup_val = look_up_mapping_dict.get(lookup_val, lookup_val)
    m_cache = MemoryCache()
    items = []
    if configs.REQUEST_OR_OPERATOR in lookup_val:
        lookup_list = lookup_val.split(configs.REQUEST_OR_OPERATOR)
    else:
        lookup_list = [lookup_val]
    for act_val in lookup_list:
        if is_spatial_context:
            item = m_cache.get_entity_by_context(act_val)
        else:
            item = m_cache.get_entity(act_val)
        if not item:
            continue
        items.append(item)
    if not len(items):
        # We didn't find any item entities, so return
        # the lookup list as the label.
        act_filter['label'] = ' OR '.join(lookup_list)
        return act_filter, None
    # Use all the item labels to make a label.
    item_labels = [item.label for item in items]
    act_filter['label'] = ' OR '.join(item_labels)
    if len(items) == 1:
        # We only have 1 item, so define it with a
        # URI and slug.
        act_filter['rdfs:isDefinedBy'] = items[0].uri
        act_filter['oc-api:filter-slug'] = items[0].slug
    return act_filter, item
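# Hedged sketch (assumptions flagged): mimics how add_entity_item_to_act_filter()
# joins entity labels with ' OR ' when the lookup value contains the OR operator.
# The '||' operator, the slugs, and the label map below are illustrative only;
# the real operator comes from configs.REQUEST_OR_OPERATOR and labels come from
# MemoryCache entity lookups.

FAKE_LABELS = {'dog-bone': 'Dog bone', 'cat-bone': 'Cat bone'}  # hypothetical

def sketch_or_filter_label(lookup_val, or_operator='||'):
    act_filter = {}
    if or_operator in lookup_val:
        lookup_list = lookup_val.split(or_operator)
    else:
        lookup_list = [lookup_val]
    labels = [FAKE_LABELS.get(v, v) for v in lookup_list]
    act_filter['label'] = ' OR '.join(labels)
    return act_filter

print(sketch_or_filter_label('dog-bone||cat-bone'))
# -> {'label': 'Dog bone OR Cat bone'}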
def get_cache_earliest_date(self):
    """ Gets and caches the earliest date as a date_time object """
    mc = MemoryCache()
    cache_key = mc.make_memory_cache_key('early_date', 'manifest')
    early_date = mc.get_cache_object(cache_key)
    if early_date is None:
        sum_man = Manifest.objects\
            .filter(published__gt='2001-01-01')\
            .aggregate(Min('published'))
        early_date = sum_man['published__min']
        mc.save_cache_object(cache_key, early_date)
    return early_date
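# Hedged sketch of the cache get-or-set pattern this method (and many below)
# repeats: build a key, try the cache, fall back to an expensive lookup on a
# miss, and save the result. A plain dict stands in for MemoryCache here, and
# the names and example value are illustrative, not from the source.

_FAKE_CACHE = {}

def get_or_set_cached(prefix, identifier, compute):
    """Return a cached value, computing and caching it on a cache miss."""
    cache_key = '{}-{}'.format(prefix, identifier)
    value = _FAKE_CACHE.get(cache_key)
    if value is None:
        value = compute()            # e.g. a Django aggregate query
        _FAKE_CACHE[cache_key] = value
    return value

earliest = get_or_set_cached('early_date', 'manifest', lambda: '2006-06-01')
print(earliest)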
def __init__(self, proj_context_json_ld=None):
    self.m_cache = MemoryCache()
    self.context = None
    self.graph = None
    self.fail_on_missing_entities = False
    if not isinstance(proj_context_json_ld, dict):
        return None
    if '@context' in proj_context_json_ld:
        self.context = proj_context_json_ld['@context']
    if '@graph' in proj_context_json_ld:
        self.graph = self.GLOBAL_VOCAB_GRAPH + proj_context_json_ld['@graph']
    else:
        self.graph = self.GLOBAL_VOCAB_GRAPH
    logger.info('Read project graph size: {}'.format(len(self.graph)))
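# Hedged sketch: shows the list concatenation the constructor above performs
# when a project context document has an '@graph'. The vocabulary entry and
# project graph below are invented; the real GLOBAL_VOCAB_GRAPH is defined on
# the class in the source.

GLOBAL_VOCAB_GRAPH = [{'@id': 'oc-gen:label', 'label': 'Label'}]  # assumed shape

proj_context_json_ld = {
    '@context': {'oc-pred': 'http://opencontext.org/predicates/'},
    '@graph': [{'@id': 'oc-pred:example', 'label': 'Example predicate'}],
}

graph = GLOBAL_VOCAB_GRAPH + proj_context_json_ld.get('@graph', [])
print('Read project graph size: {}'.format(len(graph)))  # -> 2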
def __init__(self, request_dict=False):
    rp = RootPath()
    self.base_url = rp.get_baseurl()
    self.base_search_link = '/search/'
    self.base_request = request_dict
    self.base_request_json = False
    self.base_r_full_path = False
    self.spatial_context = False
    self.testing = settings.DEBUG
    self.hierarchy_delim = '---'
    self.partial_param_val_match = False
    self.remove_start_param = True
    self.m_cache = MemoryCache()  # memory caching object
    self.SOLR_FIELD_PARAM_MAPPINGS = self.BASE_SOLR_FIELD_PARAM_MAPPINGS
    for param_key, solr_field in DCterms.DC_META_FIELDS.items():
        self.SOLR_FIELD_PARAM_MAPPINGS[solr_field] = param_key
def _get_cache_contexts_dict(self, uuids):
    """Make a dictionary that associates uuids to context paths"""
    m_cache = MemoryCache()
    uuids_for_qs = []
    uuid_context_dict = {}
    for uuid in uuids:
        cache_key = m_cache.make_cache_key(prefix='context-path', identifier=uuid)
        context_path = m_cache.get_cache_object(cache_key)
        if context_path is None:
            uuids_for_qs.append(uuid)
        else:
            uuid_context_dict[uuid] = context_path
    if not len(uuids_for_qs):
        # Found them all from the cache!
        # Return without touching the database.
        return uuid_context_dict
    # Look up the remaining context paths with a database query,
    # caching each result for later requests.
    subject_qs = Subject.objects.filter(uuid__in=uuids_for_qs)
    for sub_obj in subject_qs:
        cache_key = m_cache.make_cache_key(
            prefix='context-path', identifier=str(sub_obj.uuid))
        m_cache.save_cache_object(cache_key, sub_obj.context)
        uuid_context_dict[sub_obj.uuid] = sub_obj.context
    return uuid_context_dict
def _make_cache_geospace_obj_dict(self, uuids):
    """Make a dict of geospace objects keyed by uuid"""
    m_cache = MemoryCache()
    uuids_for_qs = []
    uuid_geo_dict = {}
    for uuid in uuids:
        cache_key = m_cache.make_cache_key(prefix='geospace-obj', identifier=uuid)
        geo_obj = m_cache.get_cache_object(cache_key)
        if geo_obj is None:
            uuids_for_qs.append(uuid)
        else:
            uuid_geo_dict[uuid] = geo_obj
    if not len(uuids_for_qs):
        # Found them all from the cache!
        # Return without touching the database.
        return uuid_geo_dict
    # Look up the remaining geospace objects with a database
    # query. We order by uuid, then by reverse feature_id, so
    # that the lowest feature id is the one that actually gets
    # cached.
    geospace_qs = Geospace.objects.filter(
        uuid__in=uuids_for_qs,
    ).exclude(
        ftype__in=['Point', 'point']
    ).order_by('uuid', '-feature_id')
    for geo_obj in geospace_qs:
        cache_key = m_cache.make_cache_key(
            prefix='geospace-obj', identifier=str(geo_obj.uuid))
        m_cache.save_cache_object(cache_key, geo_obj)
        uuid_geo_dict[geo_obj.uuid] = geo_obj
    return uuid_geo_dict
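# Hedged sketch of the two-pass lookup both helpers above use: check a per-uuid
# cache first, collect the misses, then resolve all misses in one bulk query and
# cache each result. The dict cache and fetch function are stand-ins, not the
# project's MemoryCache or the Django ORM.

_CACHE = {}

def bulk_lookup(uuids, fetch_many):
    """fetch_many(missing_uuids) -> {uuid: value}; only cache misses hit the backend."""
    found = {}
    missing = []
    for uuid in uuids:
        if uuid in _CACHE:
            found[uuid] = _CACHE[uuid]
        else:
            missing.append(uuid)
    if missing:
        for uuid, value in fetch_many(missing).items():
            _CACHE[uuid] = value
            found[uuid] = value
    return found

print(bulk_lookup(['a', 'b'], lambda missing: {u: u.upper() for u in missing}))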
def __init__(self):
    self.m_cache = MemoryCache()
    self.parent_entities = None
    self.child_entities = None
    # cache prefix for the json-ldish-parents
    self.jsonldish_p_prefix = 'json-ldish-parents-{}'
    # cache prefix for list of parents
    self.p_prefix = 'lr-parents'
    # cache prefix for children of an item
    self.children_prefix = 'lr-children-{}'
    # cache prefix for full tree of child items
    self.child_tree_prefix = 'lr-child-tree-{}'
def __init__(self, request_dict_json=False): self.uuid = False self.uri = False # cannonical uri for the item self.href = False # link to the item in the current deployment self.cite_uri = False # stable / persistent uri self.label = False self.item_type = False self.updated = False self.published = False self.project_href = False # link to the project in current deployment self.project_uri = False # cannonical uri for the project self.project_label = False self.context_href = False # link to parent context in current deployment self.context_uri = False # link to parent context cannonical uri self.context_label = False self.category = False self.latitude = False self.longitude = False self.geojson = False self.early_date = False self.late_date = False self.human_remains_flagged = False # flagged as relating to human remains self.thumbnail_href = False self.thumbnail_uri = False self.thumbnail_scr = False self.preview_scr = False self.fullfile_scr = False self.snippet = False self.cite_uri = False # stable identifier as an HTTP uri self.other_attributes = False # other attributes to the record # flatten list of an attribute values to single value self.flatten_rec_attributes = False # A list of (non-standard) attributes to include in a record self.rec_attributes = [] self.attribute_hierarchies = {} self.base_url = settings.CANONICAL_HOST rp = RootPath() self.base_url = rp.get_baseurl() self.m_cache = MemoryCache() # memory caching object self.s_cache = SearchGenerationCache() # supplemental caching object, specific for searching self.request_dict_json = request_dict_json if request_dict_json is not False: self.request_dict = json.loads(request_dict_json) else: self.request_dict = False self.highlighting = False self.recursive_count = 0 self.min_date = False self.max_date = False self.thumbnail_data = {} self.media_file_data = {} self.string_attrib_data = {}
def __init__(self, response_dict_json=False):
    rp = RootPath()
    self.base_url = rp.get_baseurl()
    self.uuids = []
    self.uris = []
    self.m_cache = MemoryCache()  # memory caching object
    # supplemental caching object, specific for searching
    self.s_cache = SearchGenerationCache()
    self.response_dict_json = response_dict_json
    self.highlighting = False
    # make values of these fields "flat", not a list
    self.flatten_rec_fields = True
    self.total_found = False
    self.rec_start = False
    self.min_date = False
    self.max_date = False
    # flatten a list of attribute values to a single value
    self.flatten_rec_attributes = False
    # A list of (non-standard) attributes to include in a record
    self.rec_attributes = []
    self.do_media_thumbs = True  # get thumbnails for records
    self.get_all_media = False  # get links to all media files for an item
class SearchGenerationCache():
    """ Methods for using the Redis cache to streamline
        making JSON-LD search results
    """

    def __init__(self, cannonical_uris=False):
        self.m_cache = MemoryCache()

    def get_dtypes(self, entity_uri):
        """ Returns an entity's data type """
        cache_key = self.m_cache.make_cache_key('data-types', entity_uri)
        dtypes = self.m_cache.get_cache_object(cache_key)
        if dtypes is None:
            dtypes = self._get_dtypes_db(entity_uri)
            if dtypes:
                self.m_cache.save_cache_object(cache_key, dtypes)
        return dtypes

    def _get_dtypes_db(self, entity_uri):
        """ Returns an entity's data type from the database """
        # haven't found it in the cache, so look in the database
        lequiv = LinkEquivalence()
        return lequiv.get_data_types_from_object(entity_uri)
def get_valid_context_slugs(paths_list):
    '''Takes a list of context paths and returns a list of
    slugs for valid paths, ignoring invalid paths.

    :param list paths_list: List of spatial context path strings.
    '''
    m_cache = MemoryCache()
    paths_list = list(paths_list)
    url_fixes = []
    for context in paths_list:
        for url_issue, rep in {'+': ' ', '%20': ' '}.items():
            if not url_issue in context:
                continue
            url_fix_context = context.replace(url_issue, rep)
            if url_fix_context in paths_list:
                # Skip, we already have this context in the paths_list.
                continue
            url_fixes.append(url_fix_context)
    # Add the url_fixes list, which has the substitutions that may be
    # needed to replace problematic URL encoding and successfully
    # look up items.
    paths_list += url_fixes
    valid_context_slugs = []
    for context in list(paths_list):
        # Verify that the contexts are valid;
        # find and save the entity to memory.
        entity = m_cache.get_entity_by_context(context)
        if not entity:
            # Skip, we couldn't find an entity for this context path.
            continue
        if entity.slug in valid_context_slugs:
            # Skip, we already have this entity slug in our valid list.
            continue
        valid_context_slugs.append(entity.slug)
    return valid_context_slugs
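# Hedged illustration of the URL-encoding fix-ups above: '+' and '%20' in a
# context path both get a space-substituted variant added, so later entity
# lookups can succeed. The example path is invented.

paths_list = ['Italy/Poggio+Civitate']
url_fixes = []
for context in paths_list:
    for url_issue, rep in {'+': ' ', '%20': ' '}.items():
        if url_issue not in context:
            continue
        fixed = context.replace(url_issue, rep)
        if fixed not in paths_list:
            url_fixes.append(fixed)
paths_list += url_fixes
print(paths_list)  # -> ['Italy/Poggio+Civitate', 'Italy/Poggio Civitate']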
def get_project_date_range(self, project_uuid):
    """ Gets a project date range """
    mem = MemoryCache()
    key = mem.make_cache_key('proj-chrono', project_uuid)
    date_range = mem.get_cache_object(key)
    if not isinstance(date_range, dict):
        date_range = self.get_project_date_range_db(project_uuid)
        mem.save_cache_object(key, date_range)
    return date_range
def get_project_geo_meta(self, project_uuid):
    """ Gets a geo_meta object for a project """
    mem = MemoryCache()
    key = mem.make_cache_key('proj-geo', project_uuid)
    geo_meta = mem.get_cache_object(key)
    if geo_meta is None:
        geo_meta = self.get_project_geo_meta_db(project_uuid)
        mem.save_cache_object(key, geo_meta)
    return geo_meta
def __init__(self, request_dict=False):
    rp = RootPath()
    self.base_url = rp.get_baseurl()
    self.base_search_link = '/search/'
    self.base_request = request_dict
    self.base_request_json = False
    self.base_r_full_path = False
    self.spatial_context = False
    self.testing = settings.DEBUG
    self.hierarchy_delim = '---'
    self.partial_param_val_match = False
    self.remove_start_param = True
    self.mem_cache_obj = MemoryCache()  # memory caching object
    self.SOLR_FIELD_PARAM_MAPPINGS = self.BASE_SOLR_FIELD_PARAM_MAPPINGS
    for param_key, solr_field in DCterms.DC_META_FIELDS.items():
        self.SOLR_FIELD_PARAM_MAPPINGS[solr_field] = param_key
def get_all_uuids_related_to_gazetteers(self, all_gaz_annos=None): """ gets ALL subject entities related to gazetteer entities """ mc = MemoryCache() cache_id = mc.make_cache_key('gaz', 'uuids_all_gaz') uuids_all_gaz = mc.get_cache_object(cache_id) if uuids_all_gaz is None: if all_gaz_annos is None: all_gaz_annos = self.get_all_related_to_gazetteers() uuids_all_gaz = { 'subjects': {}, 'documents': {}, 'media': {}, 'projects': {}, 'types': {} } for gaz_anno in all_gaz_annos: hash_id = gaz_anno.hash_id gaz_ent_uri = gaz_anno.object_uri key = gaz_anno.subject_type if hash_id not in uuids_all_gaz[key]: gaz_ref = { 'uuid': gaz_anno.subject, 'item_type': gaz_anno.subject_type, 'gaz_ent_uri': gaz_ent_uri } if key == 'subjects': # get subjects specific information for the gaz_ref gaz_ref = self.subjects_specific_gaz_ref( gaz_anno.subject, gaz_ent_uri) uuids_all_gaz[key][hash_id] = gaz_ref # Gazeteer linked types describe other items that we want to annotate # Look up the items described by a type so we can add to the # gazetteer described items if gaz_anno.subject_type == 'types': rel_asserts = Assertion.objects\ .filter(subject_type__in=self.OC_OA_TARGET_TYPES, object_uuid=gaz_anno.subject) for rel_assert in rel_asserts: key = rel_assert.subject_type if hash_id not in uuids_all_gaz[key]: gaz_ref = { 'uuid': rel_assert.uuid, 'item_type': rel_assert.subject_type, 'gaz_ent_uri': gaz_ent_uri } if key == 'subjects': # get subjects specific information gaz_ref = self.subjects_specific_gaz_ref( rel_assert.uuid, gaz_ent_uri) uuids_all_gaz[key][hash_id] = gaz_ref # save this hard work to the cache mc.save_cache_object(cache_id, uuids_all_gaz) return uuids_all_gaz
def get_used_gazetteer_entities(self):
    """ Gets entities in gazetteer vocabularies that are actually
        being used.

        NOTE: This checks the memory cache first!
    """
    mc = MemoryCache()
    cache_id = mc.make_cache_key('gaz', 'used_gazetteer_ents')
    act_gaz_list = mc.get_cache_object(cache_id)
    if act_gaz_list is None:
        # the cache was empty, so get this from the database
        act_gaz_list = self.get_used_gazetteer_entities_db()
        mc.save_cache_object(cache_id, act_gaz_list)
    return act_gaz_list
def get_geo_overlays(self):
    """Gets geo overlays for an item identified by uuid."""
    m_cache = MemoryCache()
    cache_key = m_cache.make_cache_key('geo-layers', self.uuid)
    geo_overlays = m_cache.get_cache_object(cache_key)
    if geo_overlays is not None:
        self.geo_overlays = geo_overlays
        return self.geo_overlays
    else:
        geo_overlays = self.get_geo_overlays_db()
        m_cache.save_cache_object(cache_key, geo_overlays)
        return self.geo_overlays
def get_all_related_to_gazetteers(self):
    """ Gets ALL subject entities related to gazetteer entities """
    mc = MemoryCache()
    cache_id = mc.make_cache_key('gaz', 'all_gaz_annos')
    all_gaz_annos = mc.get_cache_object(cache_id)
    if all_gaz_annos is None:
        subject_types = self.OC_OA_TARGET_TYPES
        subject_types.append('types')
        act_gaz_list = self.get_used_gazetteer_entities()
        all_gaz_annos = LinkAnnotation.objects\
            .filter(subject_type__in=subject_types,
                    object_uri__in=act_gaz_list)
        mc.save_cache_object(cache_id, all_gaz_annos)
    return all_gaz_annos
def get_cache_earliest_date(self):
    """ Gets and caches the earliest date as a date_time object """
    mc = MemoryCache()
    cache_key = mc.make_cache_key('early_date', 'manifest')
    early_date = mc.get_cache_object(cache_key)
    if early_date is None:
        sum_man = Manifest.objects\
            .filter(published__gt='2001-01-01')\
            .aggregate(Min('published'))
        early_date = sum_man['published__min']
        mc.save_cache_object(cache_key, early_date)
    return early_date
def __init__(self, response_dict_json=False):
    rp = RootPath()
    self.base_url = rp.get_baseurl()
    self.uuids = []
    self.uris = []
    self.mem_cache_obj = MemoryCache()  # memory caching object
    self.response_dict_json = response_dict_json
    self.highlighting = False
    # make values of these fields "flat", not a list
    self.flatten_rec_fields = True
    self.total_found = False
    self.rec_start = False
    self.min_date = False
    self.max_date = False
    # flatten a list of attribute values to a single value
    self.flatten_rec_attributes = False
    # A list of (non-standard) attributes to include in a record
    self.rec_attributes = []
    self.do_media_thumbs = True  # get thumbnails for records
    self.get_all_media = False  # get links to all media files for an item
def __init__(self, response_dict_json):
    rp = RootPath()
    self.base_url = rp.get_baseurl()
    self.m_cache = MemoryCache()  # memory caching object
    self.response_dict_json = response_dict_json
    self.response_dict = json.loads(response_dict_json)
    self.highlighting = False
    # make values of these fields "flat", not a list
    self.flatten_rec_fields = True
    self.geojson_recs = []
    self.non_geo_recs = []
    self.total_found = False
    self.rec_start = False
    self.min_date = False
    self.max_date = False
    # flatten a list of attribute values to a single value
    self.flatten_rec_attributes = False
    # A list of (non-standard) attributes to include in a record
    self.rec_attributes = []
    # get complex (Polygons, etc.) geospatial data from the database
    self.do_complex_geo = False
    self.do_media_thumbs = True  # get thumbnails for records
    self.get_all_media = False  # get links to all media files for an item
def get_jsonldish_parents(self, uuid, add_original=True):
    """Gets parent projects for a project.

    Returns a list of dictionary objects similar to JSON-LD
    expectations. This is useful for faceted search.
    """
    m_cache = MemoryCache()
    cache_key = m_cache.make_cache_key(
        'proj-par-jsonldish_{}'.format(add_original),
        uuid
    )
    output = m_cache.get_cache_object(cache_key)
    if output is None:
        output = self._db_get_jsonldish_parents(
            uuid,
            add_original=add_original
        )
        m_cache.save_cache_object(cache_key, output)
    return output
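# Hedged sketch: the cache key above folds the add_original flag into the key
# prefix, so the two variants of the parent list are cached separately. The key
# format below just mirrors the string formatting shown; the real
# make_cache_key() may combine or hash its arguments differently.

def sketch_parent_cache_key(uuid, add_original=True):
    prefix = 'proj-par-jsonldish_{}'.format(add_original)
    return '{}-{}'.format(prefix, uuid)

print(sketch_parent_cache_key('A5DDBEA2', add_original=True))
print(sketch_parent_cache_key('A5DDBEA2', add_original=False))
# Two distinct keys, so changing add_original never returns a stale variant.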
def get_containment_parent_slug(slug):
    '''Takes a slug and returns the slug of its parent.
    Returns 'root' if a slug has no parent.

    :param str slug: Slug identifying a subjects item.
    '''
    m_cache = MemoryCache()
    cache_key = m_cache.make_cache_key('contain-par-slug', slug)
    parent_slug = m_cache.get_cache_object(cache_key)
    if parent_slug is None:
        contain_obj = Containment()
        # Because it seems to introduce memory errors, turn off
        # caching for this class instance.
        contain_obj.use_cache = False
        parent_slug = contain_obj.get_parent_slug_by_slug(slug)
        m_cache.save_cache_object(cache_key, parent_slug)
    if parent_slug:
        return parent_slug
    return 'root'
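# Hedged usage sketch: get_containment_parent_slug() treats a missing or empty
# parent as 'root', which callers can use as a sentinel for top-level items.
# The lookup table here is a stand-in for the Containment query; the slugs are
# hypothetical.

_PARENTS = {'poggio-civitate': 'italy'}  # hypothetical slug hierarchy

def sketch_parent_slug(slug):
    parent_slug = _PARENTS.get(slug)
    if parent_slug:
        return parent_slug
    return 'root'

print(sketch_parent_slug('poggio-civitate'))  # -> 'italy'
print(sketch_parent_slug('italy'))            # -> 'root'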
class RecordProperties(): """ Methods to make properties for individual record items useful for making geospatial feature records or lists of items without geospatial data """ ATTRIBUTE_DELIM = '; ' # delimiter for multiple attributes def __init__(self, request_dict_json=False): self.uuid = False self.uri = False # cannonical uri for the item self.href = False # link to the item in the current deployment self.cite_uri = False # stable / persistent uri self.label = False self.item_type = False self.updated = False self.published = False self.project_href = False # link to the project in current deployment self.project_uri = False # cannonical uri for the project self.project_label = False self.context_href = False # link to parent context in current deployment self.context_uri = False # link to parent context cannonical uri self.context_label = False self.category = False self.latitude = False self.longitude = False self.geojson = False self.early_date = False self.late_date = False self.human_remains_flagged = False # flagged as relating to human remains self.thumbnail_href = False self.thumbnail_uri = False self.thumbnail_scr = False self.preview_scr = False self.fullfile_scr = False self.snippet = False self.cite_uri = False # stable identifier as an HTTP uri self.other_attributes = False # other attributes to the record # flatten list of an attribute values to single value self.flatten_rec_attributes = False # A list of (non-standard) attributes to include in a record self.rec_attributes = [] self.attribute_hierarchies = {} self.base_url = settings.CANONICAL_HOST rp = RootPath() self.base_url = rp.get_baseurl() self.m_cache = MemoryCache() # memory caching object self.s_cache = SearchGenerationCache() # supplemental caching object, specific for searching self.request_dict_json = request_dict_json if request_dict_json is not False: self.request_dict = json.loads(request_dict_json) else: self.request_dict = False self.highlighting = False self.recursive_count = 0 self.min_date = False self.max_date = False self.thumbnail_data = {} self.media_file_data = {} self.string_attrib_data = {} def parse_solr_record(self, solr_rec): """ Parses a solr rec object """ if isinstance(solr_rec, dict): self.get_item_basics(solr_rec) self.get_citation_uri(solr_rec) self.get_lat_lon(solr_rec) self.get_category(solr_rec) self.get_project(solr_rec) self.get_context(solr_rec) self.get_time(solr_rec) # get time information, limiting date ranges to query constaints self.get_thumbnail(solr_rec) self.get_media_files(solr_rec) self.get_snippet(solr_rec) # get snippet of highlighted text self.get_attributes(solr_rec) # get non-standard attributes self.get_string_attributes(solr_rec) # get non-standard string attributes def get_item_basics(self, solr_rec): """ get basic metadata for an item """ output = False if isinstance(solr_rec, dict): if 'uuid' in solr_rec: self.uuid = solr_rec['uuid'] if 'slug_type_uri_label' in solr_rec: id_parts = self.parse_solr_value_parts(solr_rec['slug_type_uri_label']) if id_parts is not False: output = True self.uri = self.make_url_from_val_string(id_parts['uri'], True) self.href = self.make_url_from_val_string(id_parts['uri'], False) item_type_output = URImanagement.get_uuid_from_oc_uri(self.uri, True) self.item_type = item_type_output['item_type'] self.label = id_parts['label'] if 'updated' in solr_rec: self.updated = solr_rec['updated'] if 'published' in solr_rec: self.published = solr_rec['published'] if 'human_remains' in solr_rec: # is the record flagged as related to human 
remains ?human_remains if solr_rec['human_remains'] > 0: self.human_remains_flagged = True return output def get_snippet(self, solr_rec): """ get a text highlighting snippet """ if isinstance(self.highlighting, dict): if self.uuid is False: if 'uuid' in solr_rec: self.uuid = solr_rec['uuid'] if self.uuid in self.highlighting: if 'text' in self.highlighting[self.uuid]: text_list = self.highlighting[self.uuid]['text'] self.snippet = ' '.join(text_list) # some processing to remove fagments of HTML markup. self.snippet = self.snippet.replace('<em>', '[[[[mark]]]]') self.snippet = self.snippet.replace('</em>', '[[[[/mark]]]]') try: self.snippet = '<div>' + self.snippet + '</div>' self.snippet = lxml.html.fromstring(self.snippet).text_content() self.snippet = strip_tags(self.snippet) except: self.snippet = strip_tags(self.snippet) self.snippet = self.snippet.replace('[[[[mark]]]]', '<em>') self.snippet = self.snippet.replace('[[[[/mark]]]]', '</em>') def get_citation_uri(self, solr_rec): """ gets the best citation / persistent uri for the item """ if 'persistent_uri' in solr_rec: for p_uri in solr_rec['persistent_uri']: self.cite_uri = p_uri if 'dx.doi.org' in p_uri: break # stop looking once we have a DOI, the best def get_lat_lon(self, solr_rec): """ gets latitute and longitude information """ if 'discovery_geolocation' in solr_rec: geo_strings = solr_rec['discovery_geolocation'] geo_coords_str = geo_strings.split(',') # NOT geojson ording, since solr uses lat/lon ordering self.latitude = float(geo_coords_str[0]) self.longitude = float(geo_coords_str[1]) def get_category(self, solr_rec): """ Gets the most specific category for the item """ self.recursive_count = 0 cat_hierarchy = self.get_category_hierarchy(solr_rec) if len(cat_hierarchy) > 0: self.category = cat_hierarchy[-1]['label'] def get_context(self, solr_rec): """ Get the most specific context parent for the record """ self.recursive_count = 0 contexts = self.extract_hierarchy(solr_rec, SolrDocument.ROOT_CONTEXT_SOLR, '___context', []) if len(contexts) > 0: self.context_label = self.make_context_path_label(contexts) self.context_uri = self. make_context_link(contexts, True) self.context_href = self. 
make_context_link(contexts, False) def get_project(self, solr_rec): """ Get the most specific project for the record """ self.recursive_count = 0 projects = self.extract_hierarchy(solr_rec, SolrDocument.ROOT_PROJECT_SOLR, '___project', []) if len(projects) > 0: self.project_label = projects[-1]['label'] self.project_uri = self.make_url_from_val_string(projects[-1]['uri'], True) self.project_href = self.make_url_from_val_string(projects[-1]['uri'], False) def get_time(self, solr_rec): """ parses time information """ early_list = False late_list = False if 'form_use_life_chrono_earliest' in solr_rec: early_list = solr_rec['form_use_life_chrono_earliest'] if 'form_use_life_chrono_latest' in solr_rec: late_list = solr_rec['form_use_life_chrono_latest'] if isinstance(early_list, list): date_list = early_list else: date_list = [] if isinstance(late_list, list): date_list += late_list if len(date_list) > 0: min_max = self.get_list_min_max(date_list) self.early_date = min(min_max) self.late_date = max(min_max) def get_list_min_max(self, date_list): """ Returns the minimum and maximum dates from a date list, constrained by preset min and max dates """ min_date = False max_date = False # print(str(date_list)) if isinstance(date_list, list): date_list.sort() for date in date_list: if self.min_date is not False: if date >= self.min_date \ and min_date is False: min_date = date if self.max_date is not False: if date <= self.max_date: max_date = date if min_date is False: min_date = self.min_date if max_date is False: max_date = self.max_date return [min_date, max_date] def get_thumbnail(self, solr_rec): """ get media record and thumbnai if it exists """ if 'uuid' in solr_rec: uuid = solr_rec['uuid'] if uuid in self.thumbnail_data: if self.thumbnail_data[uuid] is not False: self.thumbnail_href = self.thumbnail_data[uuid]['href'] self.thumbnail_uri = self.thumbnail_data[uuid]['uri'] self.thumbnail_scr = self.thumbnail_data[uuid]['scr'] rp = RootPath() self.thumbnail_scr = rp.convert_to_https(self.thumbnail_scr) else: # did not precache thumbnail data, get an indivitual record self.get_thumbnail_from_database(solr_rec) def get_media_files(self, solr_rec): """ get media record and thumbnai if it exists """ if 'uuid' in solr_rec: uuid = solr_rec['uuid'] if uuid in self.media_file_data: if self.media_file_data[uuid] is not False: rp = RootPath() for file_type, file_uri in self.media_file_data[uuid].items(): if file_type == 'oc-gen:thumbnail': self.thumbnail_scr = rp.convert_to_https(file_uri) elif file_type == 'oc-gen:preview': self.preview_scr = rp.convert_to_https(file_uri) elif file_type == 'oc-gen:fullfile': self.fullfile_scr = rp.convert_to_https(file_uri) def get_thumbnail_from_database(self, solr_rec): """ get media record and thumbnail, if it exists """ if 'uuid' in solr_rec: uuid = solr_rec['uuid'] thumb = [] if self.item_type != 'media': media_item = Assertion.objects\ .filter(uuid=uuid, object_type='media')[:1] if len(media_item) > 0: muuid = media_item[0].object_uuid thumb = Mediafile.objects\ .filter(uuid=muuid, file_type='oc-gen:thumbnail')[:1] else: # do this for media items muuid = uuid thumb = Mediafile.objects\ .filter(uuid=uuid, file_type='oc-gen:thumbnail')[:1] if len(thumb) > 0: self.thumbnail_href = self.base_url + '/media/' + muuid self.thumbnail_uri = settings.CANONICAL_HOST + '/media/' + muuid self.thumbnail_scr = thumb[0].file_uri def get_category_hierarchy(self, solr_rec): """ gets the most specific category informtation about an item """ cat_hierarchy = [] if 'item_type' in 
solr_rec: item_type = solr_rec['item_type'][0] root_cat_field = 'oc_gen_' + item_type + '___pred_id' cat_hierarchy = self.extract_hierarchy(solr_rec, root_cat_field, '___pred', []) return cat_hierarchy """ The following seciton of code processes non-default attributes for records """ def get_attributes(self, solr_rec): """ gets attributes for a record, based on the predicates requested in the search and optional predicates passed by a client with a GET request with parameter 'attributes' """ qm = QueryMaker() solr_field_entities = {} for attribute in self.rec_attributes: entity = self.m_cache.get_entity(attribute) if entity: prop_slug = entity.slug # check to make sure we have the entity data type for linked fields if entity.data_type is False and entity.item_type == 'uri': dtypes = self.s_cache.get_dtypes(entity.uri) if isinstance(dtypes, list): # set te data type and the act-field # print('Found for ' + prop_slug + ' ' + dtypes[0]) entity.data_type = dtypes[0] field_parts = qm.make_prop_solr_field_parts(entity) solr_field = field_parts['prefix'] + '___pred_' + field_parts['suffix'] # print('Found: ' + solr_field) # extract children of the solr_field so we know if # we have the most specific attributes, then we can get # values for the most specific attributes self.extract_attribute_children(solr_rec, solr_field) self.clean_attribute_hiearchies() if isinstance(self.attribute_hierarchies, dict): self.other_attributes = [] for field_slug_key, values in self.attribute_hierarchies.items(): entity = self.m_cache.get_entity(field_slug_key) if entity: attribute_dict = LastUpdatedOrderedDict() attribute_dict['property'] = entity.label attribute_dict['values_list'] = [] attribute_dict['value'] = '' string_val = False delim = '' for val in values: if isinstance(val, str): string_val = True parsed_val = self.parse_solr_value_parts(val) attribute_dict["values_list"].append(parsed_val['label']) attribute_dict['value'] += delim + str(parsed_val['label']) else: attribute_dict["values_list"].append(val) attribute_dict['value'] += delim + str(val) delim = self.ATTRIBUTE_DELIM if len(values) == 1 \ and string_val is False: attribute_dict['value'] = values[0] self.other_attributes.append(attribute_dict) def get_string_attributes(self, solr_rec): """ gets string attributes for a solr rec, from a previous database query needed because solr does not cache string field data """ if isinstance(self.string_attrib_data, dict): # now add predicate attributes for string predicates, from the database if 'uuid' in solr_rec and 'data' in self.string_attrib_data: uuid = solr_rec['uuid'] if uuid in self.string_attrib_data['data']: item_data = self.string_attrib_data['data'][uuid] for pred_uuid, values_list in item_data.items(): act_attribute = self.string_attrib_data['pred_ents'][pred_uuid] act_attribute['values_list'] = values_list act_attribute['value'] = self.ATTRIBUTE_DELIM.join(values_list) self.other_attributes.append(act_attribute) def prevent_attribute_key_collision(self, item_prop_dict, prop_key): """ checks to make sure there's no collision between the prop_key and the dict that it will be added to """ i = 2 output_prop_key = prop_key while output_prop_key in item_prop_dict: output_prop_key = prop_key + '[' + str(i) + ']' i += 1 return output_prop_key def clean_attribute_hiearchies(self): """ some post-processing to make sure we have clean attribute hierarchies """ if isinstance(self.attribute_hierarchies, dict): # print('check: ' + str(self.attribute_hierarchies)) temp_attribute_hierarchies = 
self.attribute_hierarchies clean_attribute_hiearchies = {} for solr_field_key, field_char in self.attribute_hierarchies.items(): if field_char['most-specific']: par_field_ex = solr_field_key.split('___') # last two parts make the suffix, a pred-slug[-2] and a field type [-1] pred_suffix = '___' + par_field_ex[-2] + '___' + par_field_ex[-1] specific_ok = True for val in field_char['values']: if isinstance(val, str): # print('check:' + solr_field_key + ' val: ' + val) parsed_val = self.parse_solr_value_parts(val) check_field = parsed_val['slug'].replace('-', '_') check_field += '___pred_' + parsed_val['data_type'] if check_field in temp_attribute_hierarchies: # note a field is NOT at the most specific level specific_ok = False else: # now check a version with the predicate as part of # the solr field check_field = parsed_val['slug'].replace('-', '_') check_field += pred_suffix if check_field in temp_attribute_hierarchies: # note a field is NOT at the most specific level specific_ok = False if specific_ok: # ok to add # print('checked OK: ' + solr_field_key) clean_attribute_hiearchies[solr_field_key] = field_char # now that we got rid of problem fields, lets sort these for consistent # rendering self.attribute_hierarchies = LastUpdatedOrderedDict() keys = LastUpdatedOrderedDict() # order of key types, we want id fields, followed by numeric then date key_types = ['___pred_id', '___pred_numeric', '___pred_date'] for key_type in key_types: keys[key_type] = [] for solr_field_key, field_char in clean_attribute_hiearchies.items(): if key_type in solr_field_key: keys[key_type].append(solr_field_key) # sort alphabetically. Slugs useful, since they will cluster predicates # from similar vocabularies keys[key_type].sort() for key in keys[key_type]: field_char = clean_attribute_hiearchies[key] field_ex = key.split('___') # the penultimate part is the predicate field_slug = field_ex[-2].replace('_', '-') if field_slug not in self.attribute_hierarchies: self.attribute_hierarchies[field_slug] = [] for val in field_char['values']: if val not in self.attribute_hierarchies[field_slug]: self.attribute_hierarchies[field_slug].append(val) def extract_attribute_children(self, solr_rec, solr_field_key): """ extracts ALL children from the hiearchy of a solr_field_key """ is_field = False if solr_field_key not in self.attribute_hierarchies: # so we don't look at the same thing twice! 
if solr_field_key in solr_rec: is_field = True field_char = {'most-specific': False, 'values': []} if '___pred_numeric' in solr_field_key \ or '___pred_numeric' in solr_field_key: field_char['most-specific'] = True field_char['values'] = solr_rec[solr_field_key] elif '___pred_id' in solr_field_key: # make a suffix for the par_field_ex = solr_field_key.split('___') # last two parts make the suffix, a pred-slug[-2] and a field type [-1] pred_suffix = '___' + par_field_ex[-2] + '___' + par_field_ex[-1] childless_children = [] for child_val in solr_rec[solr_field_key]: # print('Child: ' + solr_field_key + ': ' + child_val) parsed_path_item = self.parse_solr_value_parts(child_val) new_field_prefix = parsed_path_item['slug'].replace('-', '_') new_field_key = new_field_prefix + '___pred_' + parsed_path_item['data_type'] if parsed_path_item['data_type'] == 'id': child_is_field = self.extract_attribute_children(solr_rec, new_field_key) if child_is_field is False: # now check an alternative combining the child # slug with the predicate of the parent new_field_key = new_field_prefix + pred_suffix # print('check: ' + new_field_key) child_is_field = self.extract_attribute_children(solr_rec, new_field_key) if child_is_field is False: childless_children.append(child_val) if len(childless_children) > 0: field_char['most-specific'] = True field_char['values'] = childless_children else: pass self.attribute_hierarchies[solr_field_key] = field_char return is_field def extract_hierarchy(self, solr_rec, facet_field_key, facet_suffix, hierarchy=[], pred_field=False): """ extracts a hierarchy from a solr_record. The output is a list starting with the most general parent of the hiearchy, then going to the most specific This is a recursive function and default / starts with the root of the hiearchy as the facet_field_key This only follows a single path (not multiple paths) """ alt_facet_field_key = facet_field_key if pred_field is not False: # do this to allow search of hiarchy in a named # predicate field f_parts = facet_field_key.split('___') if len(f_parts) == 2: alt_f_parts = [f_parts[0], pred_field.replace('-', '_'), f_parts[1]] alt_facet_field_key = '___'.join(alt_f_parts) # print('Check: ' + facet_field_key + ', ' + alt_facet_field_key) if (facet_field_key in solr_rec or alt_facet_field_key in solr_rec)\ and self.recursive_count < 20: self.recursive_count += 1 if facet_field_key in solr_rec: path_item_val = solr_rec[facet_field_key][0] else: path_item_val = solr_rec[alt_facet_field_key][0] parsed_path_item = self.parse_solr_value_parts(path_item_val) if isinstance(parsed_path_item, dict): hierarchy.append(parsed_path_item) new_facet_field = parsed_path_item['slug'].replace('-', '_') new_facet_field += facet_suffix + '_' + parsed_path_item['data_type'] # print('New hierarchy field: ' + new_facet_field) hierarchy = self.extract_hierarchy(solr_rec, new_facet_field, facet_suffix, hierarchy) return hierarchy def make_context_path_label(self, contexts): """ Makes a '/' delimited context path for easy human readability """ context_path = False if len(contexts) > 0: context_labels = [] for context in contexts: context_labels.append(context['label']) context_path = '/'.join(context_labels) return context_path def make_context_link(self, contexts, cannonical=False): """ makes a URI for a context """ context_uri = False if len(contexts) > 0: context_uri = self.make_url_from_val_string(contexts[-1]['uri'], cannonical) return context_uri def make_url_from_val_string(self, partial_url, use_cannonical=True): """ parses a 
solr value if it has '___' delimiters, to get the URI part string. if it's already a URI part, it makes a URL """ if use_cannonical: base_url = settings.CANONICAL_HOST else: base_url = self.base_url solr_parts = self.parse_solr_value_parts(partial_url) if isinstance(solr_parts, dict): partial_url = solr_parts['uri'] if 'http://' not in partial_url \ and 'https://' not in partial_url: url = base_url + partial_url else: url = partial_url return url def add_record_fields(self): """ adds fields to include in the GeoJSON properties """ if 'rec-field' in self.response_dict: raw_rec_fields = self.response_dict['rec-field'][0] if ',' in raw_rec_fields: self.record_fields = raw_rec_fields.split(',') else: self.record_fields = [raw_rec_fields] else: self.record_fields = [] return self.record_fields def parse_solr_value_parts(self, solr_value): """ parses a solr_value string into slug, solr-data-type, uri, and label parts """ output = False if isinstance(solr_value, str): if '___' in solr_value: solr_ex = solr_value.split('___') if len(solr_ex) == 4: output = {} output['slug'] = solr_ex[0] output['data_type'] = solr_ex[1] output['uri'] = solr_ex[2] output['label'] = solr_ex[3] else: output = solr_value else: output = solr_value return output def get_solr_record_uuid_type(self, solr_rec): """ get item uuid, label, and type from a solr_rec """ output = False if isinstance(solr_rec, dict): output = {'uuid': False, 'label': False, 'item_type': False} if 'uuid' in solr_rec: output['uuid'] = solr_rec['uuid'] if 'slug_type_uri_label' in solr_rec: id_parts = self.parse_solr_value_parts(solr_rec['slug_type_uri_label']) if id_parts is not False: uri = self.make_url_from_val_string(id_parts['uri'], True) item_type_output = URImanagement.get_uuid_from_oc_uri(uri, True) output['item_type'] = item_type_output['item_type'] output['label'] = id_parts['label'] return output def get_key_val(self, key, dict_obj): """ returns the value associated with a key, if the key exists else, none """ output = None if isinstance(dict_obj, dict): if key in dict_obj: output = dict_obj[key] return output
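# Hedged sketch of the '___'-delimited solr value format parsed by
# parse_solr_value_parts() above: slug, data type, URI, and label, in that
# order. The example value is invented but follows the 4-part layout the
# method expects.

def sketch_parse_solr_value(solr_value):
    parts = solr_value.split('___')
    if len(parts) != 4:
        return solr_value
    return {'slug': parts[0], 'data_type': parts[1],
            'uri': parts[2], 'label': parts[3]}

print(sketch_parse_solr_value(
    'oc-gen-cat-animal-bone___id___/vocabularies/oc-general/animal-bone___Animal Bone'))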
def __init__(self):
    self.geojson_ld = False
    self.raw_related_labels = {}
    self.m_cache = MemoryCache()  # memory caching object
def projects_html_view(request, spatial_context=None): """ returns HTML representation of projects search """ mem_cache_obj = MemoryCache() mem_cache_obj.ping_redis_server() rp = RootPath() base_url = rp.get_baseurl() rd = RequestDict() request_dict_json = rd.make_request_dict_json(request, spatial_context) if rd.security_ok is False: template = loader.get_template('400.html') context = RequestContext(request, {'abusive': True}) return HttpResponse(template.render(context), status=400) elif rd.do_bot_limit: # redirect bot requests away from faceted search where # they can negatively impact performance cache_control(no_cache=True) return redirect('/projects-search/', permanent=False) else: # url and json_url neeed for view templating url = request.get_full_path() if 'http://' not in url \ and 'https://' not in url: url = base_url + url if '?' in url: json_url = url.replace('?', '.json?') else: json_url = url + '.json' # see if search results are cached. this is not done # with a view decorator, because we want to handle bots differently db_cache = DatabaseCache() cache_key = db_cache.make_cache_key('projects-search', request_dict_json) if rd.refresh_cache: # the request wanted to refresh the cache db_cache.remove_cache_object(cache_key) # get the search result JSON-LD, if it exists in cache json_ld = db_cache.get_cache_object(cache_key) if json_ld is None: # cached result is not found, so make it with a new search solr_s = SolrSearch() solr_s.is_bot = rd.is_bot # True if bot detected solr_s.do_bot_limit = rd.do_bot_limit # Toggle limits on facets for bots solr_s.mem_cache_obj = mem_cache_obj solr_s.do_context_paths = False solr_s.item_type_limit = 'projects' if solr_s.solr is not False: response = solr_s.search_solr(request_dict_json) mem_cache_obj = solr_s.mem_cache_obj # reused cached memory items m_json_ld = MakeJsonLd(request_dict_json) m_json_ld.base_search_link = '/projects-search/' # share entities already looked up. 
Saves database queries m_json_ld.mem_cache_obj = mem_cache_obj m_json_ld.request_full_path = request.get_full_path() m_json_ld.spatial_context = spatial_context json_ld = m_json_ld.convert_solr_json(response.raw_content) # now cache the resulting JSON-LD db_cache.save_cache_object(cache_key, json_ld) if json_ld is not None: req_neg = RequestNegotiation('text/html') req_neg.supported_types = ['application/json', 'application/ld+json', 'application/vnd.geo+json'] if 'HTTP_ACCEPT' in request.META: req_neg.check_request_support(request.META['HTTP_ACCEPT']) if 'json' in req_neg.use_response_type: # content negotiation requested JSON or JSON-LD recon_obj = Reconciliation() json_ld = recon_obj.process(request.GET, json_ld) return HttpResponse(json.dumps(json_ld, ensure_ascii=False, indent=4), content_type=req_neg.use_response_type + "; charset=utf8") else: # now make the JSON-LD into an object suitable for HTML templating st = SearchTemplate(json_ld) st.process_json_ld() p_aug = ProjectAugment(json_ld) p_aug.process_json_ld() template = loader.get_template('search/view.html') context = RequestContext(request, {'st': st, 'item_type': 'projects', 'base_search_link': m_json_ld.base_search_link, 'p_aug': p_aug, 'url': url, 'json_url': json_url, 'base_url': base_url}) if req_neg.supported: return HttpResponse(template.render(context)) else: # client wanted a mimetype we don't support return HttpResponse(req_neg.error_message, content_type=req_neg.use_response_type + "; charset=utf8", status=415) else: cache_control(no_cache=True) template = loader.get_template('500.html') context = RequestContext(request, {'error': 'Solr Connection Problem'}) return HttpResponse(template.render(context), status=503)
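# Hedged sketch of the view-level caching flow above: build a cache key from
# the normalized request, optionally clear it on a refresh request, and only
# run the (expensive) solr search on a cache miss. The dict cache and search
# callable are stand-ins for DatabaseCache and SolrSearch, and the key format
# is illustrative.

_RESULT_CACHE = {}

def sketch_cached_search(request_dict_json, refresh_cache, run_search):
    cache_key = 'projects-search:{}'.format(request_dict_json)
    if refresh_cache:
        _RESULT_CACHE.pop(cache_key, None)
    json_ld = _RESULT_CACHE.get(cache_key)
    if json_ld is None:
        json_ld = run_search(request_dict_json)
        _RESULT_CACHE[cache_key] = json_ld
    return json_ld

result = sketch_cached_search('{"type": "projects"}', False, lambda req: {'ok': True})
print(result)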
class FilterLinks(): BASE_SOLR_FIELD_PARAM_MAPPINGS = \ {'___project_id': 'proj', '___context_id': 'path', 'obj_all___biol_term_hastaxonomy___pred_id': 'reconcile', '___pred_': 'prop', 'item_type': 'type'} def __init__(self, request_dict=False): rp = RootPath() self.base_url = rp.get_baseurl() self.base_search_link = '/search/' self.base_request = request_dict self.base_request_json = False self.base_r_full_path = False self.spatial_context = False self.testing = settings.DEBUG self.hierarchy_delim = '---' self.partial_param_val_match = False self.remove_start_param = True self.mem_cache_obj = MemoryCache() # memory caching object self.SOLR_FIELD_PARAM_MAPPINGS = self.BASE_SOLR_FIELD_PARAM_MAPPINGS for param_key, solr_field in DCterms.DC_META_FIELDS.items(): self.SOLR_FIELD_PARAM_MAPPINGS[solr_field] = param_key def make_request_urls(self, new_rparams): """ makes request urls from the new request object """ output = {} output['html'] = self.make_request_url(new_rparams) output['json'] = self.make_request_url(new_rparams, '.json') output['atom'] = self.make_request_url(new_rparams, '.atom') return output def make_request_url(self, new_rparams, doc_format=''): """ makes request urls from the new request object default doc_format is '' (HTML) """ url = self.base_url + self.base_search_link if 'path' in new_rparams: if new_rparams['path'] is not None \ and new_rparams['path'] is not False: # context_path = iri_to_uri(new_rparams['path']) context_path = new_rparams['path'] context_path = context_path.replace(' ', '+') url += context_path url += doc_format param_sep = '?' for param, param_vals in new_rparams.items(): if param != 'path': for val in param_vals: quote_val = quote_plus(val) quote_val = quote_val.replace('%7BSearchTerm%7D', '{SearchTerm}') url += param_sep + param + '=' + quote_val param_sep = '&' return url def make_request_sub(self, old_request_dict, rem_param_key, rem_param_val, sub_param_val=None): """ makes a dictionary object for request parameters WITHOUT the current fparam_key and fparam_vals """ filter_request = LastUpdatedOrderedDict() for ch_param_key, ch_param_vals in old_request_dict.items(): if ch_param_key != rem_param_key: # a different parameter than the one in the filter, so add filter_request[ch_param_key] = ch_param_vals else: if rem_param_key != 'path' and len(ch_param_vals) > 0: filter_request[ch_param_key] = [] for ch_param_val in ch_param_vals: if rem_param_val != ch_param_val: # the filter value for this key is not the same # as the check value for this key, so add # to the filter request filter_request[ch_param_key].append(ch_param_val) else: if sub_param_val is not None: # put in the substitute value filter_request[ch_param_key].append(sub_param_val) return filter_request def add_to_request_by_solr_field(self, solr_facet_key, new_value): """ uses the solr_facet_key to determine the request parameter """ param = self.get_param_from_solr_facet_key(solr_facet_key) slugs = self.parse_slugs_in_solr_facet_key(solr_facet_key) if slugs is not False: add_to_value = self.hierarchy_delim.join(slugs) else: add_to_value = None #print('New param: ' + param + ' new val: ' + new_value + ' len:' + str(self.base_request)) new_rparams = self.add_to_request(param, new_value, add_to_value) return new_rparams def add_to_request(self, param, new_value, add_to_value=None): """ adds to the new request object a parameter and value """ if self.base_request_json is not False: # start of with JSON encoded base request parameters new_rparams = json.loads(self.base_request_json) elif 
self.base_r_full_path is not False: # start of with parsing a URL string new_rparams = self.make_base_params_from_url(self.base_r_full_path) elif self.base_request is not False: # start with a dictionary object of the base request # for some reason this often leads to memory errors new_rparams = self.base_request else: new_rparams = {} if 'start' in new_rparams and self.remove_start_param: # remove paging information when composing a new link new_rparams.pop('start', None) if param == 'path': found = self.mem_cache_obj.check_con_entity_found(new_value) if found: # convert the (slug) value into a context path entity = self.mem_cache_obj.get_con_entity(new_value) new_value = entity.context if param not in new_rparams: if param == 'path': new_rparams[param] = new_value else: new_rparams[param] = [new_value] else: if param == 'path': new_rparams['path'] = new_value else: if add_to_value is not None: new_list = [] old_found = False for old_val in new_rparams[param]: old_prefix = self.remove_solr_part(old_val) first_last_old_val = False if self.hierarchy_delim in old_val: old_val_ex = old_val.split(self.hierarchy_delim) if len(old_val_ex) > 2: first_last_old_val = old_val_ex[0] first_last_old_val += self.hierarchy_delim first_last_old_val += old_val_ex[-1] if old_val == add_to_value: old_found = True new_list_val = old_val + self.hierarchy_delim + new_value elif old_prefix == add_to_value: old_found = True new_list_val = old_prefix + self.hierarchy_delim + new_value elif first_last_old_val == add_to_value: old_found = True new_list_val = old_prefix + self.hierarchy_delim + new_value else: new_list_val = old_val new_list.append(new_list_val) if old_found is False: if self.partial_param_val_match: for old_val in new_rparams[param]: if add_to_value in old_val: old_found = True old_prefix = self.remove_solr_part(old_val) new_list_val = old_prefix + self.hierarchy_delim + new_value # add the new item new_list.append(new_list_val) # remove the old new_list.remove(old_val) new_rparams[param] = new_list if old_found is False: new_rparams[param].append(new_value) else: new_rparams[param].append(new_value) return new_rparams def remove_solr_part(self, old_val): """ removes part of a query parameter that is in solr query syntax, inside square brackets [] """ output = old_val splitter = self.hierarchy_delim + '[' if splitter in old_val: old_ex = old_val.split(splitter) output = old_ex[0] return output def make_base_params_from_url(self, request_url): """ makes the base parameters from the url """ rparams = {} url_o = urlparse(request_url) rparams = parse_qs(url_o.query) if self.spatial_context is False: self.spatial_context = self.get_context_from_path(url_o.path) rparams['path'] = self.spatial_context return rparams def get_context_from_path(self, path): """ geths the spatial context from a request path """ context = False if '.' 
in path: pathex = path.split('.') path = pathex[0] if '/' in path: pathex = path.split('/') print(str(pathex)) if len(pathex) > 2: # remove the part that's the first slash pathex.pop(0) # remove the part that's for the url of search pathex.pop(0) context = '/'.join(pathex) return context def get_param_from_solr_facet_key(self, solr_facet_key): """" returns the public parameter from the solr_facet_key """ output = solr_facet_key exact_match = False for solr_field_part_key, param in self.SOLR_FIELD_PARAM_MAPPINGS.items(): if solr_field_part_key == solr_facet_key: output = param exact_match = True break if exact_match is False: for solr_field_part_key, param in self.SOLR_FIELD_PARAM_MAPPINGS.items(): if solr_field_part_key in solr_facet_key: output = param break return output def parse_slugs_in_solr_facet_key(self, solr_facet_key): """ returns a list of slugs encoded in a solr_facet_key the solr field has these slugs in reverse order """ no_slug_field_list = [SolrDocument.ROOT_CONTEXT_SOLR, SolrDocument.ROOT_PROJECT_SOLR, SolrDocument.ROOT_LINK_DATA_SOLR, SolrDocument.ROOT_PREDICATE_SOLR] if solr_facet_key in no_slug_field_list: slugs = False else: raw_slugs = [] facet_key_list = solr_facet_key.split('___') list_len = len(facet_key_list) i = 0 for list_item in facet_key_list: i += 1 if i < list_len: # last item is the suffix for the field type # also replace '_' with '-' to get a slug raw_slugs.append(list_item.replace('_', '-')) slugs = raw_slugs[::-1] return slugs def prep_base_request_obj(self, request_dict): """ prepares a base request object from the old request object to use to create new requests """ self.base_request = request_dict return self.base_request def get_request_param(self, param, default, as_list=False): """ get a string or list to use in queries from either the request object or the internal_request object so we have flexibility in doing searches without having to go through HTTP """ output = False if self.request is not False: if as_list: output = self.request.GET.getlist(param) else: output = self.request.GET.get(param, default=default) elif self.internal_request is not False: if as_list: if param in self.internal_request: param_obj = self.internal_request[param] if isinstance(param_obj, list): output = param_obj else: output = [param_obj] else: if param in self.internal_request: output = self.internal_request[param] else: output = default else: output = False return output
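# Hedged sketch of the URL assembly in make_request_url() above: the context
# path is appended to the search link, then the remaining parameters are added
# with '?' for the first and '&' afterwards. The parameter names and values are
# invented; the real method also preserves '{SearchTerm}' templating.

from urllib.parse import quote_plus

def sketch_request_url(base_url, new_rparams, doc_format=''):
    url = base_url + '/search/'
    path = new_rparams.get('path')
    if path:
        url += path.replace(' ', '+')
    url += doc_format
    param_sep = '?'
    for param, vals in new_rparams.items():
        if param == 'path':
            continue
        for val in vals:
            url += param_sep + param + '=' + quote_plus(val)
            param_sep = '&'
    return url

print(sketch_request_url(
    'https://opencontext.org',
    {'path': 'Italy', 'prop': ['oc-gen-cat-animal-bone']},
    '.json'))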
def make_facet_dict_from_solr_field( self, solr_facet_field_key, facet_type, facet_labeling, range_data_type=None, ): """Makes the dict for a fact with id options.""" if configs.FACET_STANDARD_ROOT_FIELDS.get(solr_facet_field_key): # We have a standard "root" field. Return the facet # dict object for it. return configs.FACET_STANDARD_ROOT_FIELDS.get(solr_facet_field_key) solr_slug_parts = solr_facet_field_key.split( SolrDocument.SOLR_VALUE_DELIM) # Making this dict will require some database (usually from # the cache) because it is not a standard root solr field, # rather it is a solr field deeper in a hierarchy. m_cache = MemoryCache() # The solr field parts are in reverse hierarchy order solr_slug_parts.reverse() # Iterate through the parts, skipping the first item # which is the most general part (the field suffix). items = [] for solr_slug in solr_slug_parts[1:]: is_related = False slug = solr_slug.replace('_', '-') if slug.startswith(configs.RELATED_ENTITY_ID_PREFIX): is_related = True slug = slug[len(configs.RELATED_ENTITY_ID_PREFIX):] item = m_cache.get_entity(slug) if not item: continue # Add an "is_related" attribute item.is_related = is_related items.append(item) if not len(items): return None slugs_id = configs.REQUEST_PROP_HIERARCHY_DELIM.join( [item.slug for item in items]) facet = LastUpdatedOrderedDict() if range_data_type is None: id_prefix = 'facet' else: id_prefix = 'range-facet' if is_related: facet['id'] = '#{}-{}{}'.format(id_prefix, configs.RELATED_ENTITY_ID_PREFIX, slugs_id) else: facet['id'] = '#{}-{}'.format(id_prefix, slugs_id) labels = [item.label for item in items] if len(labels) == 1: labels.append(facet_labeling) # Put the last label in parentheses. labels[-1] = '({})'.format(labels[-1]) facet['label'] = ' '.join(labels) facet['rdfs:isDefinedBy'] = items[0].uri facet['slug'] = items[0].slug facet['type'] = facet_type if range_data_type: facet['data-type'] = range_data_type if items[0].is_related: facet['oc-api:related-property'] = True return facet
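# Hedged sketch of how the facet field key above encodes a slug hierarchy:
# parts are delimiter-separated, reversed so the hierarchy reads general to
# specific once the field suffix is skipped, and use '_' where slugs use '-'.
# The delimiter value, the related-entity prefix, and the example key are
# assumptions consistent with the surrounding code, not confirmed constants.

SOLR_VALUE_DELIM = '___'             # assumed delimiter value
RELATED_ENTITY_ID_PREFIX = 'rel--'   # assumed related-entity prefix

def sketch_facet_key_slugs(solr_facet_field_key):
    parts = solr_facet_field_key.split(SOLR_VALUE_DELIM)
    parts.reverse()
    slugs = []
    for solr_slug in parts[1:]:      # skip the general field suffix
        slug = solr_slug.replace('_', '-')
        is_related = slug.startswith(RELATED_ENTITY_ID_PREFIX)
        if is_related:
            slug = slug[len(RELATED_ENTITY_ID_PREFIX):]
        slugs.append((slug, is_related))
    return slugs

print(sketch_facet_key_slugs('obj_all___biol_term_hastaxonomy___pred_id'))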
def projects_json_view(request, spatial_context=None): """ API for searching Open Context, media only """ mem_cache_obj = MemoryCache() mem_cache_obj.ping_redis_server() rd = RequestDict() request_dict_json = rd.make_request_dict_json(request, spatial_context) if rd.security_ok is False: template = loader.get_template('400.html') context = RequestContext(request, {'abusive': True}) return HttpResponse(template.render(context), status=400) elif rd.do_bot_limit: # redirect bot requests away from faceted search where # they can negatively impact performance cache_control(no_cache=True) return redirect('/projects-search/', permanent=False) else: # see if search results are cached. this is not done # with a view decorator, because we want to handle bots differently db_cache = DatabaseCache() cache_key = db_cache.make_cache_key('projects-search', request_dict_json) if rd.refresh_cache: # the request wanted to refresh the cache db_cache.remove_cache_object(cache_key) # get the search result JSON-LD, if it exists in cache json_ld = db_cache.get_cache_object(cache_key) if json_ld is None: # cached result is not found, so make it with a new search solr_s = SolrSearch() solr_s.is_bot = rd.is_bot # True if bot detected solr_s.do_bot_limit = rd.do_bot_limit # Toggle limits on facets for bots solr_s.do_context_paths = False solr_s.item_type_limit = 'projects' if solr_s.solr is not False: response = solr_s.search_solr(request_dict_json) m_json_ld = MakeJsonLd(request_dict_json) m_json_ld.base_search_link = '/projects-search/' # share entities already looked up. Saves database queries m_json_ld.entities = solr_s.entities m_json_ld.request_full_path = request.get_full_path() m_json_ld.spatial_context = spatial_context json_ld = m_json_ld.convert_solr_json(response.raw_content) # now cache the resulting JSON-LD db_cache.save_cache_object(cache_key, json_ld) if json_ld is not None: req_neg = RequestNegotiation('application/json') req_neg.supported_types = ['application/ld+json', 'application/vnd.geo+json'] if 'HTTP_ACCEPT' in request.META: req_neg.check_request_support(request.META['HTTP_ACCEPT']) if req_neg.supported: # requester wanted a mimetype we DO support if 'callback' in request.GET: funct = request.GET['callback'] json_str = json.dumps(json_ld, ensure_ascii=False, indent=4) return HttpResponse(funct + '(' + json_str + ');', content_type='application/javascript' + "; charset=utf8") else: return HttpResponse(json.dumps(json_ld, ensure_ascii=False, indent=4), content_type=req_neg.use_response_type + "; charset=utf8") else: # client wanted a mimetype we don't support return HttpResponse(req_neg.error_message, status=415) else: cache_control(no_cache=True) template = loader.get_template('500.html') context = RequestContext(request, {'error': 'Solr Connection Problem'}) return HttpResponse(template.render(context), status=503)
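# --- Illustrative sketch (not part of the original module) -----------------
# Minimal version of the cache-first pattern used in projects_json_view()
# above: check a cache key derived from the request, run the expensive solr
# search only on a cache miss, then store the result. 'run_search' is a
# placeholder callable, not part of the project's real API.
def _example_cached_search(db_cache, cache_key, run_search):
    json_ld = db_cache.get_cache_object(cache_key)   # None on a cache miss
    if json_ld is None:
        json_ld = run_search()                       # expensive solr query
        db_cache.save_cache_object(cache_key, json_ld)
    return json_ld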
def __init__(self):
    self.m_cache = MemoryCache()
    self.request_full_path = ''
def get_entity(self, identifier):
    """ gets entities, but checks first if they are in memory """
    mc = MemoryCache()
    return mc.get_entity(identifier)
class ActiveFilters(): """ Methods to show search / query filters in use """ TEXT_SEARCH_TITLE = 'Current Text Search Filter' IGNORE_PARAMS = ['geodeep', 'chronodeep', 'sort', 'rows', 'start'] def __init__(self): self.m_cache = MemoryCache() # memory caching object self.base_search_link = '/search/' self.hierarchy_delim = '---' def add_filters_json(self, request_dict): """ adds JSON describing search filters """ fl = FilterLinks() fl.base_search_link = self.base_search_link filters = [] string_fields = [] # so we have an interface for string searches i = 0 for param_key, param_vals in request_dict.items(): if param_key == 'path': if param_vals: i += 1 f_entity = self.m_cache.get_entity(param_vals) label = http.urlunquote_plus(param_vals) act_filter = LastUpdatedOrderedDict() act_filter['id'] = '#filter-' + str(i) act_filter['oc-api:filter'] = 'Context' act_filter['label'] = label.replace('||', ' OR ') if f_entity: act_filter['rdfs:isDefinedBy'] = f_entity.uri # generate a request dict without the context filter rem_request = fl.make_request_sub(request_dict, param_key, param_vals) act_filter['oc-api:remove'] = fl.make_request_url(rem_request) act_filter['oc-api:remove-json'] = fl.make_request_url(rem_request, '.json') filters.append(act_filter) else: for param_val in param_vals: i += 1 remove_geodeep = False act_filter = LastUpdatedOrderedDict() act_filter['id'] = '#filter-' + str(i) if self.hierarchy_delim in param_val: all_vals = param_val.split(self.hierarchy_delim) else: all_vals = [param_val] if param_key == 'proj': # projects, only care about the last item in the parameter value act_filter['oc-api:filter'] = 'Project' label_dict = self.make_filter_label_dict(all_vals[-1]) act_filter['label'] = label_dict['label'] if len(label_dict['entities']) == 1: act_filter['rdfs:isDefinedBy'] = label_dict['entities'][0].uri elif param_key == 'prop': # prop, the first item is the filter-label # the last is the filter act_filter['label'] = False if len(all_vals) < 2: act_filter['oc-api:filter'] = 'Description' act_filter['oc-api:filter-slug'] = all_vals[0] else: filt_dict = self.make_filter_label_dict(all_vals[0]) act_filter['oc-api:filter'] = filt_dict['label'] if 'slug' in filt_dict: act_filter['oc-api:filter-slug'] = filt_dict['slug'] if filt_dict['data-type'] == 'string': act_filter['label'] = 'Search Term: \'' + all_vals[-1] + '\'' if act_filter['label'] is False: label_dict = self.make_filter_label_dict(all_vals[-1]) act_filter['label'] = label_dict['label'] elif param_key == 'type': act_filter['oc-api:filter'] = 'Open Context Type' if all_vals[0] in QueryMaker.TYPE_MAPPINGS: type_uri = QueryMaker.TYPE_MAPPINGS[all_vals[0]] label_dict = self.make_filter_label_dict(type_uri) act_filter['label'] = label_dict['label'] else: act_filter['label'] = all_vals[0] elif param_key == 'q': act_filter['oc-api:filter'] = self.TEXT_SEARCH_TITLE act_filter['label'] = 'Search Term: \'' + all_vals[0] + '\'' elif param_key == 'id': act_filter['oc-api:filter'] = 'Identifier Lookup' act_filter['label'] = 'Identifier: \'' + all_vals[0] + '\'' elif param_key == 'form-chronotile': act_filter['oc-api:filter'] = 'Time of formation, use, or life' chrono = ChronoTile() dates = chrono.decode_path_dates(all_vals[0]) if isinstance(dates, dict): act_filter['label'] = 'Time range: ' + str(dates['earliest_bce']) act_filter['label'] += ' to ' + str(dates['latest_bce']) elif param_key == 'form-start': act_filter['oc-api:filter'] = 'Earliest formation, use, or life date' try: val_date = int(float(all_vals[0])) except: val_date = 
False if val_date is False: act_filter['label'] = '[Invalid year]' elif val_date < 0: act_filter['label'] = str(val_date * -1) + ' BCE' else: act_filter['label'] = str(val_date) + ' CE' elif param_key == 'form-stop': act_filter['oc-api:filter'] = 'Latest formation, use, or life date' try: val_date = int(float(all_vals[0])) except: val_date = False if val_date is False: act_filter['label'] = '[Invalid year]' elif val_date < 0: act_filter['label'] = str(val_date * -1) + ' BCE' else: act_filter['label'] = str(val_date) + ' CE' elif param_key == 'disc-geotile': act_filter['oc-api:filter'] = 'Location of discovery or observation' act_filter['label'] = self.make_geotile_filter_label(all_vals[0]) remove_geodeep = True elif param_key == 'disc-bbox': act_filter['oc-api:filter'] = 'Location of discovery or observation' act_filter['label'] = self.make_bbox_filter_label(all_vals[0]) remove_geodeep = True elif param_key == 'images': act_filter['oc-api:filter'] = 'Has related media' act_filter['label'] = 'Linked to images' elif param_key == 'other-media': act_filter['oc-api:filter'] = 'Has related media' act_filter['label'] = 'Linked to media (other than images)' elif param_key == 'documents': act_filter['oc-api:filter'] = 'Has related media' act_filter['label'] = 'Linked to documents' elif param_key == 'dc-subject': act_filter['oc-api:filter'] = 'Has subject metadata' label_dict = self.make_filter_label_dict(all_vals[-1]) if len(label_dict['label']) > 0: act_filter['label'] = label_dict['label'] if 'tdar' == all_vals[-1] or 'tdar*' == all_vals[-1]: act_filter['label'] = 'tDAR defined metadata record(s)' if len(label_dict['entities']) == 1: act_filter['rdfs:isDefinedBy'] = label_dict['entities'][0].uri if label_dict['entities'][0].vocabulary is not False: act_filter['label'] += ' in ' + label_dict['entities'][0].vocabulary elif param_key == 'dc-spatial': act_filter['oc-api:filter'] = 'Has spatial metadata' label_dict = self.make_filter_label_dict(all_vals[-1]) if len(label_dict['label']) > 0: act_filter['label'] = label_dict['label'] if len(label_dict['entities']) == 1: act_filter['rdfs:isDefinedBy'] = label_dict['entities'][0].uri if label_dict['entities'][0].vocabulary is not False: act_filter['label'] += ' in ' + label_dict['entities'][0].vocabulary elif param_key == 'dc-coverage': act_filter['oc-api:filter'] = 'Has coverage / period metadata' label_dict = self.make_filter_label_dict(all_vals[-1]) if len(label_dict['label']) > 0: act_filter['label'] = label_dict['label'] if len(label_dict['entities']) == 1: act_filter['rdfs:isDefinedBy'] = label_dict['entities'][0].uri if label_dict['entities'][0].vocabulary is not False: act_filter['label'] += ' in ' + label_dict['entities'][0].vocabulary elif param_key == 'dc-temporal': act_filter['oc-api:filter'] = 'Has temporal coverage' label_dict = self.make_filter_label_dict(all_vals[-1]) if len(label_dict['label']) > 0: act_filter['label'] = label_dict['label'] if len(label_dict['entities']) == 1: if label_dict['entities'][0].entity_type == 'vocabulary': act_filter['label'] = 'Concepts defined by: ' + label_dict['label'] elif 'periodo' in all_vals[-1]: act_filter['label'] = 'PeriodO defined concepts' if len(label_dict['entities']) == 1: act_filter['rdfs:isDefinedBy'] = label_dict['entities'][0].uri if label_dict['entities'][0].vocabulary is not False\ and label_dict['entities'][0].vocabulary != label_dict['label']: act_filter['label'] += ' in ' + label_dict['entities'][0].vocabulary elif param_key == 'obj': act_filter['oc-api:filter'] = 'Links (in some 
manner) to object' label_dict = self.make_filter_label_dict(all_vals[-1]) if len(label_dict['label']) > 0: act_filter['label'] = label_dict['label'] if len(label_dict['entities']) == 1: act_filter['rdfs:isDefinedBy'] = label_dict['entities'][0].uri if label_dict['entities'][0].vocabulary is not False: act_filter['label'] += ' in ' + label_dict['entities'][0].vocabulary elif param_key == 'dc-isReferencedBy': act_filter['oc-api:filter'] = 'Is referenced by' label_dict = self.make_filter_label_dict(all_vals[-1]) if len(label_dict['label']) > 0: act_filter['label'] = label_dict['label'] if len(label_dict['entities']) == 1: act_filter['rdfs:isDefinedBy'] = label_dict['entities'][0].uri if label_dict['entities'][0].vocabulary is not False\ and label_dict['entities'][0].vocab_uri != label_dict['entities'][0].uri: act_filter['label'] += ' in ' + label_dict['entities'][0].vocabulary elif param_key == 'linked' and all_vals[-1] == 'dinaa-cross-ref': act_filter['oc-api:filter'] = 'Has cross references' act_filter['label'] = 'Links to, or with, DINAA curated site files' else: act_filter = False if act_filter is not False: rem_request = fl.make_request_sub(request_dict, param_key, param_val) if 'geodeep' in rem_request and remove_geodeep: rem_request.pop('geodeep', None) act_filter['oc-api:remove'] = fl.make_request_url(rem_request) act_filter['oc-api:remove-json'] = fl.make_request_url(rem_request, '.json') filters.append(act_filter) return filters def make_geotile_filter_label(self, raw_geotile): """ parses a raw bbox parameter value to make a filter label """ output_list = [] if '||' in raw_geotile: tile_list = raw_geotile.split('||') else: tile_list = [raw_geotile] for tile in tile_list: geotile = GlobalMercator() coordinates = geotile.quadtree_to_lat_lon(tile) if coordinates is not False: label = 'In the region bounded by: ' label += str(round(coordinates[0], 3)) label += ', ' + str(round(coordinates[1], 3)) label += ' (SW) and ' + str(round(coordinates[2], 3)) label += ', ' + str(round(coordinates[3], 3)) label += ' (NE)' output_list.append(label) else: output_list.append('[Ignored invalid geospatial tile]') output = '; or '.join(output_list) return output def make_bbox_filter_label(self, raw_disc_bbox): """ parses a raw bbox parameter value to make a filter label """ qm = QueryMaker() output_list = [] if '||' in raw_disc_bbox: bbox_list = raw_disc_bbox.split('||') else: bbox_list = [raw_disc_bbox] for bbox in bbox_list: if ',' in bbox: bbox_coors = bbox.split(',') bbox_valid = qm.validate_bbox_coordiantes(bbox_coors) if bbox_valid: label = 'In the bounding-box of: Latitude ' label += str(bbox_coors[1]) label += ', Longitude ' + str(bbox_coors[0]) label += ' (SW) and Latitude ' + str(bbox_coors[3]) label += ', Longitude ' + str(bbox_coors[2]) label += ' (NE)' output_list.append(label) else: output_list.append('[Ignored invalid bounding-box]') else: output_list.append('[Ignored invalid bounding-box]') output = '; or '.join(output_list) return output def make_filter_label_dict(self, act_val): """ returns a dictionary object with a label and set of entities (in cases of OR searchs) """ related_suffix = '' output = {'label': False, 'data-type': 'id', 'slug': False, 'entities': []} labels = [] if '||' in act_val: vals = act_val.split('||') else: vals = [act_val] for val in vals: qm = QueryMaker() db_val = qm.clean_related_slug(val) if val != db_val: related_suffix = ' (for related items)' f_entity = self.m_cache.get_entity(db_val) if f_entity: # get the solr field data type ent_solr_data_type = 
qm.get_solr_field_type(f_entity.data_type) if ent_solr_data_type is not False \ and ent_solr_data_type != 'id': output['data-type'] = ent_solr_data_type labels.append(f_entity.label) output['entities'].append(f_entity) else: labels.append(val) output['label'] = (' OR '.join(labels)) + related_suffix output['slug'] = '-or-'.join(vals) return output
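# --- Illustrative sketch (not part of the original module) -----------------
# Mimics only the string-joining behavior of make_filter_label_dict() above
# for an OR ('||') search value; the labels are stand-ins, since real labels
# and entities come from MemoryCache lookups that this sketch skips.
def _example_filter_label_dict():
    vals = 'slug-a||slug-b'.split('||')   # hypothetical OR search value
    labels = ['Label A', 'Label B']       # stand-ins for looked-up entity labels
    return {
        'label': ' OR '.join(labels),     # 'Label A OR Label B'
        'data-type': 'id',
        'slug': '-or-'.join(vals),        # 'slug-a-or-slug-b'
        'entities': [],                   # the real method appends entity objects
    }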
def __init__(self, cannonical_uris=False):
    self.m_cache = MemoryCache()
class QueryMaker(): # main item-types mapped to their slugs to get solr-facet field prefix TYPE_MAPPINGS = {'subjects': 'oc-gen-subjects', 'media': 'oc-gen-media', 'documents': 'oc-gen-documents', 'persons': 'oc-gen-persons', 'projects': 'oc-gen-projects', 'types': 'oc-gen-types', 'predicates': 'oc-gen-predicates'} TYPE_URIS = {'subjects': 'oc-gen:subjects', 'media': 'oc-gen:media', 'documents': 'oc-gen:documents', 'persons': 'oc-gen:persons', 'projects': 'oc-gen:projects', 'types': 'oc-gen:types', 'predicates': 'oc-gen:predicates'} def __init__(self): self.error = False self.histogram_groups = 10 self.mem_cache_obj = MemoryCache() # memory caching object def _get_context_paths(self, spatial_context): ''' Takes a context path and returns an iterator with the list of possible contexts. Parses the list of boolean '||' (OR) and returns a list of contexts. For example: >>> _get_context_paths('Turkey/Domuztepe/I||II||Stray') ['Turkey/Domuztepe/I', 'Turkey/Domuztepe/II', 'Turkey/Domuztepe/Stray'] ''' # Split the context path by '/' and then by '||' context_lists = (value.split('||') for value in spatial_context.split('/')) # Create a list of the various permutations context_tuple_list = list(itertools.product(*context_lists)) # Turn the lists back into URIs return ('/'.join(value) for value in context_tuple_list) def _get_context_depth(self, spatial_context): ''' Takes a context path and returns its depth as an interger. For example, the context '/Turkey/Domuztepe' would have a depth of 2. ''' # Remove a possible trailing slash before calculating the depth return len(spatial_context.rstrip('/').split('/')) def _get_valid_context_slugs(self, contexts): ''' Takes a list of contexts and, for valid contexts, returns a list of slugs ''' entity = Entity() valid_context_slugs = [] context_list = list(contexts) for context in context_list: # Verify that the contexts are valid # find and save the enity to memory # print('check: ' + context) found = self.mem_cache_obj.check_entity_found(context, True) # print('found: ' + str(found)) if found: entity = self.mem_cache_obj.get_entity(context, True) valid_context_slugs.append(entity.slug) return valid_context_slugs def _get_parent_slug(self, slug): ''' Takes a slug and returns the slug of its parent. Returns 'root' if a slug has no parent. ''' cache_key = self.mem_cache_obj.make_memory_cache_key('par-slug', slug) parent_slug = self.mem_cache_obj.get_cache_object(cache_key) if parent_slug is None: contain_obj = Containment() contain_obj.use_cache = False # because it seems to introduce memory errors parent_slug = contain_obj.get_parent_slug_by_slug(slug) self.mem_cache_obj.save_cache_object(cache_key, parent_slug) if parent_slug: return parent_slug else: return 'root' def _prepare_filter_query(self, parent_child_slug): # TODO docstring parent_child_set = parent_child_slug.split('___') return parent_child_set[0].replace('-', '_') + '___context_id_fq:' + \ parent_child_set[1] def expand_hierarchy_options(self, path_param_val, hier_delim='---', or_delim='||'): """ Exapands a hiearchic path string into a list of listed hierachically ordered items. This method also makes a new hiearchic ordered list if there is an 'or_delim'. 
""" if isinstance(path_param_val, list): inital_path_list = path_param_val else: inital_path_list = [path_param_val] path_list = [] for path_string in inital_path_list: raw_path_list = (value.split(or_delim) for value in path_string.split(hier_delim)) # Create a list of the various permutations path_tuple_list = list(itertools.product(*raw_path_list)) for item in path_tuple_list: path_list.append(list(item)) return path_list def get_solr_field_type(self, data_type, prefix=''): ''' Defines whether our dynamic solr fields names for predicates end with ___pred_id, ___pred_numeric, etc. ''' if data_type in ['@id', 'id', False]: return prefix + 'id' elif data_type in ['xsd:integer', 'xsd:double', 'xsd:boolean']: return prefix + 'numeric' elif data_type == 'xsd:string': return prefix + 'string' elif data_type == 'xsd:date': return prefix + 'date' else: raise Exception("Error: Unknown predicate type") def make_prop_solr_field_parts(self, entity): """ Makes a solr field for a property """ output = {} output['prefix'] = entity.slug.replace('-', '_') output['suffix'] = self.get_solr_field_type(entity.data_type) return output def process_proj(self, proj_path): # TODO docstring query_dict = {'fq': [], 'facet.field': []} fq_terms = [] project_path_lists = self.expand_hierarchy_options(proj_path) for proj_path_list in project_path_lists: i = 0 path_list_len = len(proj_path_list) fq_field = SolrDocument.ROOT_PROJECT_SOLR fq_path_terms = [] for proj_slug in proj_path_list: found = self.mem_cache_obj.check_entity_found(proj_slug, False) if found: entity = self.mem_cache_obj.get_entity(proj_slug, False) # fq_path_term = fq_field + ':' + self.make_solr_value_from_entity(entity) # the below is a bit of a hack. We should have a query field # as with ___pred_ to query just the slug. 
But this works for now proj_slug = entity.slug fq_path_term = fq_field + ':' + proj_slug + '*' else: fq_path_term = fq_field + ':' + proj_slug fq_path_terms.append(fq_path_term) fq_field = proj_slug.replace('-', '_') + '___project_id' i += 1 if i >= path_list_len and fq_field not in query_dict['facet.field']: query_dict['facet.field'].append(fq_field) final_path_term = ' AND '.join(fq_path_terms) final_path_term = '(' + final_path_term + ')' fq_terms.append(final_path_term) fq_final = ' OR '.join(fq_terms) fq_final = '(' + fq_final + ')' query_dict['fq'].append(fq_final) return query_dict def process_ld_object(self, objects): # TODO docstring query_dict = {'fq': []} fq_terms = [] if not isinstance(objects, list): objects = [objects] for raw_obj in objects: if '||' in raw_obj: or_objects = raw_obj.split('||') else: or_objects = [raw_obj] fq_or_terms = [] for obj in or_objects: # find and save the entity to memory found = self.mem_cache_obj.check_entity_found(obj, False) if found: entity = self.mem_cache_obj.get_entity(obj, False) fq_term = 'object_uri:' + self.escape_solr_arg(entity.uri) fq_term += ' OR text:"' + self.escape_solr_arg(entity.uri) + '"' else: fq_term = 'object_uri:' + obj fq_or_terms.append(fq_term) fq_all_ors = ' OR '.join(fq_or_terms) fq_all_ors = '(' + fq_all_ors + ')' fq_terms.append(fq_all_ors) fq_final = ' AND '.join(fq_terms) fq_final = '(' + fq_final + ')' query_dict['fq'].append(fq_final) return query_dict def process_dc_term(self, dc_param, dc_terms, add_facet=False): # TODO docstring query_dict = {'fq': [], 'facet.field': []} fq_terms = [] if dc_param in DCterms.DC_META_FIELDS: fq_field = DCterms.DC_META_FIELDS[dc_param] if fq_field not in query_dict['facet.field'] and add_facet: query_dict['facet.field'].append(fq_field) add_to_fq = False for raw_dc_term in dc_terms: if '||' in raw_dc_term: use_dc_terms = raw_dc_term.split('||') else: use_dc_terms = [raw_dc_term] fq_path_terms = [] for dc_term in use_dc_terms: if len(dc_term) > 0: add_to_fq = True # check if entity exists, and or store in memory found = self.mem_cache_obj.check_entity_found(dc_term, False) if found: # fq_path_term = fq_field + ':' + self.make_solr_value_from_entity(entity) # the below is a bit of a hack. We should have a query field # as with ___pred_ to query just the slug. 
But this works for now entity = self.mem_cache_obj.get_entity(dc_term, False) fq_path_term = fq_field + '_fq:' + entity.slug if dc_param == 'dc-temporal' \ and entity.entity_type == 'vocabulary' \ and 'periodo' in entity.slug: # it's a temporal vocabulary from periodo # so search for specific periods contained in # the vocabulary fq_path_term = '(' + fq_path_term +\ ' OR ' + fq_path_term + '*)' else: if dc_term[-1] != '*': dc_term += '*' fq_path_term = fq_field + ':' + dc_term fq_path_terms.append(fq_path_term) final_path_term = ' AND '.join(fq_path_terms) final_path_term = '(' + final_path_term + ')' fq_terms.append(final_path_term) fq_final = ' OR '.join(fq_terms) fq_final = '(' + fq_final + ')' if add_to_fq: query_dict['fq'].append(fq_final) return query_dict def get_related_slug_field_prefix(self, slug): """ gets the field prefix for a related property if it is present in the slug, then return the solr_field prefix otherwise return a '' string """ field_prefix = SolrDocument.RELATED_SOLR_FIELD_PREFIX prefix_len = len(field_prefix) slug_start = slug[:prefix_len] if slug_start == field_prefix: return field_prefix else: return '' def clean_related_slug(self, slug): """ removes the field_prefix for related slugs """ field_prefix = SolrDocument.RELATED_SOLR_FIELD_PREFIX prefix_len = len(field_prefix) slug_start = slug[:prefix_len] if slug_start == field_prefix: slug = slug[prefix_len:] return slug def correct_solr_prefix_for_fq(self, solr_f_prefix, act_field_fq): """ makes sure the solr prefix is on the fq if needed """ if solr_f_prefix != '': if solr_f_prefix not in act_field_fq: act_field_fq = solr_f_prefix + act_field_fq return act_field_fq def process_prop(self, props): """ processes 'prop' (property) parameters property parameters are tricky because they can come in hierarchies that's why there's some complexity to this """ # is the property for the item itself, or for a related item? query_dict = {'fq': [], 'facet.field': [], 'stats.field': [], 'prequery-stats': [], 'facet.range': [], 'hl-queries': [], 'ranges': {}} fq_terms = [] prop_path_lists = self.expand_hierarchy_options(props) for prop_path_list in prop_path_lists: i = 0 path_list_len = len(prop_path_list) fq_path_terms = [] act_field_fq = SolrDocument.ROOT_PREDICATE_SOLR act_field_data_type = 'id' last_field_label = False # needed for full text highlighting predicate_solr_slug = False for prop_slug in prop_path_list: field_prefix = self.get_related_slug_field_prefix(prop_slug) solr_f_prefix = field_prefix.replace('-', '_') db_prop_slug = self.clean_related_slug(prop_slug) l_prop_entity = False pred_prop_entity = False require_id_field = False if act_field_data_type == 'id': # check entity exists, and save to memory found = self.mem_cache_obj.check_entity_found(db_prop_slug, False) if found: entity = self.mem_cache_obj.get_entity(db_prop_slug, False) last_field_label = entity.label prop_slug = field_prefix + entity.slug if entity.item_type == 'uri' and 'oc-gen' not in db_prop_slug: if entity.entity_type == 'property': pred_prop_entity = True predicate_solr_slug = prop_slug.replace('-', '_') l_prop_entity = True children = self.mem_cache_obj.get_entity_children(entity.uri) if len(children) > 0: # ok, this field has children. require it # to be treated as an ID field require_id_field = True else: if entity.item_type == 'predicates': pred_prop_entity = True predicate_solr_slug = prop_slug.replace('-', '_') children = self.mem_cache_obj.get_entity_children(entity.uri) if len(children) > 0: # ok, this field has children. 
require it # to be treated as an ID field require_id_field = True if i == 0: if 'oc-gen' in db_prop_slug: # for open context categories / types act_field_fq = self.get_parent_item_type_facet_field(entity.uri) lr = LinkRecursion() parents = lr.get_jsonldish_entity_parents(entity.uri) if len(parents) > 1: try: p_slug = parents[-2]['slug'] act_field_fq = p_slug.replace('-', '_') + '___pred_id' act_field_fq = self.correct_solr_prefix_for_fq(solr_f_prefix, act_field_fq) except: pass elif entity.item_type == 'uri': act_field_fq = SolrDocument.ROOT_LINK_DATA_SOLR elif entity.item_type == 'predicates': temp_field_fq = self.get_parent_item_type_facet_field(entity.uri) parents = self.mem_cache_obj.get_jsonldish_entity_parents(entity.uri) if len(parents) > 1: try: p_slug = parents[-2]['slug'] temp_field_fq = p_slug.replace('-', '_') + '___pred_id' except: print('Predicate Parent exception: '+ str(parents)) temp_field_fq = False if temp_field_fq is not False: act_field_fq = temp_field_fq else: act_field_fq = SolrDocument.ROOT_PREDICATE_SOLR else: act_field_fq = SolrDocument.ROOT_PREDICATE_SOLR # --------------------------------------------------- # THIS PART BUILDS THE FACET-QUERY # fq_path_term = fq_field + ':' + self.make_solr_value_from_entity(entity) # the below is a bit of a hack. We should have a query field # as with ___pred_ to query just the slug. But this works for now fq_field = act_field_fq + '_fq' if path_list_len >= 2 and act_field_data_type == 'id': # could be an object deeper in the hierarchy, so allow the obj_all version fq_path_term = '(' + fq_field + ':' + prop_slug fq_path_term += ' OR obj_all___' + fq_field + ':' + prop_slug + ')' else: fq_path_term = fq_field + ':' + prop_slug fq_path_terms.append(fq_path_term) #--------------------------------------------------- # #--------------------------------------------------- # THIS PART PREPARES FOR LOOPING OR FINAL FACET-FIELDS # # print('pred-solr-slug: ' + predicate_solr_slug) field_parts = self.make_prop_solr_field_parts(entity) act_field_data_type = field_parts['suffix'] if require_id_field: act_field_data_type = 'id' field_parts['suffix'] = 'id' # check if the last or penultimate field has # a different data-type (for linked-data) if i >= (path_list_len - 2) \ and l_prop_entity: dtypes = self.mem_cache_obj.get_dtypes(entity.uri) if isinstance(dtypes, list): # set te data type and the act-field found = self.mem_cache_obj.check_entity_found(db_prop_slug, False) if found: entity = self.mem_cache_obj.get_entity(db_prop_slug, False) entity.date_type = dtypes[0] # store for later use self.mem_cache_obj.entities[db_prop_slug] = entity # store for later use act_field_data_type = self.get_solr_field_type(dtypes[0]) if predicate_solr_slug is False or pred_prop_entity: act_field_fq = field_parts['prefix'] + '___pred_' + field_parts['suffix'] act_field_fq = self.correct_solr_prefix_for_fq(solr_f_prefix, act_field_fq) # get a facet on this field if act_field_data_type != 'string': # adds a prefix for related properties ffield = solr_f_prefix + field_parts['prefix'] + '___pred_' + field_parts['suffix'] if ffield not in query_dict['facet.field'] \ and i >= (path_list_len - 1): query_dict['facet.field'].append(ffield) else: if act_field_data_type == 'id': act_field_fq = 'obj_all___' + predicate_solr_slug \ + '___pred_' + field_parts['suffix'] # get a facet on this field if predicate_solr_slug != field_parts['prefix']: # the predicate_solr_slug is not the # prefix of the current field part, meaning # the field_parts[prefix] is the type, and # we 
want facets for the predicate -> type ffield = field_parts['prefix'] \ + '___' \ + predicate_solr_slug \ + '___pred_' + field_parts['suffix'] else: # get facets for the predicate ffield = field_parts['prefix'] \ + '___pred_' \ + field_parts['suffix'] # adds a prefix, in case of a related property ffield = solr_f_prefix + ffield if ffield not in query_dict['facet.field'] \ and i >= (path_list_len - 1): query_dict['facet.field'].append(ffield) else: act_field_fq = predicate_solr_slug + '___pred_' + field_parts['suffix'] # ------------------------------------------- if act_field_data_type == 'numeric': # print('Numeric field: ' + act_field) act_field_fq = field_parts['prefix'] + '___pred_numeric' act_field_fq = self.correct_solr_prefix_for_fq(solr_f_prefix, act_field_fq) query_dict = self.add_math_facet_ranges(query_dict, act_field_fq, entity) elif act_field_data_type == 'date': # print('Date field: ' + act_field) act_field_fq = field_parts['prefix'] + '___pred_date' act_field_fq = self.correct_solr_prefix_for_fq(solr_f_prefix, act_field_fq) query_dict = self.add_date_facet_ranges(query_dict, act_field_fq, entity) # print('Current data type (' + str(i) + '): ' + act_field_data_type) # print('Current field (' + str(i) + '): ' + act_field_fq) i += 1 elif act_field_data_type == 'string': # case for a text search # last_field_label = False # turn off using the field label for highlighting string_terms = self.prep_string_search_term(prop_slug) for escaped_term in string_terms: search_term = act_field_fq + ':' + escaped_term if last_field_label is False: query_dict['hl-queries'].append(escaped_term) else: query_dict['hl-queries'].append(last_field_label + ' ' + escaped_term) fq_path_terms.append(search_term) elif act_field_data_type == 'numeric': # numeric search. assume it's well formed solr numeric request search_term = act_field_fq + ':' + prop_slug fq_path_terms.append(search_term) # now limit the numeric ranges from query to the range facets query_dict = self.add_math_facet_ranges(query_dict, act_field_fq, False, prop_slug) elif act_field_data_type == 'date': # date search. assume it's well formed solr request search_term = act_field_fq + ':' + prop_slug fq_path_terms.append(search_term) # now limit the date ranges from query to the range facets query_dict = self.add_date_facet_ranges(query_dict, act_field_fq, False, prop_slug) final_path_term = ' AND '.join(fq_path_terms) final_path_term = '(' + final_path_term + ')' fq_terms.append(final_path_term) fq_final = ' OR '.join(fq_terms) fq_final = '(' + fq_final + ')' query_dict['fq'].append(fq_final) return query_dict def add_math_facet_ranges(self, query_dict, act_field, entity=False, solr_query=False): """ this does some math for facet ranges for numeric fields """ ok = False groups = self.histogram_groups fstart = 'f.' + act_field + '.facet.range.start' fend = 'f.' + act_field + '.facet.range.end' fgap = 'f.' + act_field + '.facet.range.gap' findex = 'f.' + act_field + '.facet.sort' fother = 'f.' + act_field + '.facet.range.other' finclude = 'f.' 
+ act_field + '.facet.range.include' if entity is not False: # this is a field with no value limits # we need to do a stats-prequery first query_dict['prequery-stats'].append(act_field) else: if solr_query is not False: vals = [] # get the numbers out q_nums_strs = re.findall(r'[-+]?\d*\.\d+|\d+', solr_query) for q_num_str in q_nums_strs: vals.append(float(q_num_str)) vals.sort() if len(vals) > 1: ok = True min_val = vals[0] max_val = vals[-1] if ok: if act_field not in query_dict['stats.field']: query_dict['stats.field'].append(act_field) if act_field not in query_dict['facet.range']: query_dict['facet.range'].append(act_field) query_dict['ranges'][fother] = 'all' query_dict['ranges'][finclude] = 'all' query_dict['ranges'][fstart] = min_val query_dict['ranges'][fend] = max_val query_dict['ranges'][fgap] = (max_val - min_val) / groups query_dict['ranges'][findex] = 'index' # sort by index, not by count return query_dict def add_date_facet_ranges(self, query_dict, act_field, entity=False, solr_query=False): """ this does some math for facet ranges for numeric fields """ ok = False groups = 4 fstart = 'f.' + act_field + '.facet.range.start' fend = 'f.' + act_field + '.facet.range.end' fgap = 'f.' + act_field + '.facet.range.gap' findex = 'f.' + act_field + '.facet.sort' fother = 'f.' + act_field + '.facet.range.other' finclude = 'f.' + act_field + '.facet.range.include' if entity is not False: # this is a field with no value limits # we need to do a stats-prequery first query_dict['prequery-stats'].append(act_field) else: if solr_query is not False: q_dt_strs = re.findall(r'\d{4}-\d{2}-\d{2}[T:]\d{2}:\d{2}:\d{2}', solr_query) if len(q_dt_strs) < 2: # try a less strict regular expression to get dates q_dt_strs = re.findall(r'\d{4}-\d{2}-\d{2}', solr_query) if len(q_dt_strs) >= 2: ok = True vals = [] for q_dt_str in q_dt_strs: vals.append(q_dt_str) vals.sort() min_val = vals[0] max_val = vals[1] if ok: if act_field not in query_dict['stats.field']: query_dict['stats.field'].append(act_field) if act_field not in query_dict['facet.range']: query_dict['facet.range'].append(act_field) query_dict['ranges'][fother] = 'all' query_dict['ranges'][finclude] = 'all' query_dict['ranges'][fstart] = self.convert_date_to_solr_date(min_val) query_dict['ranges'][fend] = self.convert_date_to_solr_date(max_val) query_dict['ranges'][fgap] = self.get_date_difference_for_solr(min_val, max_val, groups) query_dict['ranges'][findex] = 'index' # sort by index, not by count return query_dict def get_date_difference_for_solr(self, min_date, max_date, groups): """ Gets a solr date difference from two values """ min_dt = self.date_convert(min_date) max_dt = self.date_convert(max_date) dif_dt = (max_dt - min_dt) / groups if dif_dt.days >= 366: solr_val = int(round((dif_dt.days / 365.25), 0)) solr_dif = '+' + str(solr_val) + 'YEAR' elif dif_dt.days >= 31: solr_val = int(round((dif_dt.days / 30), 0)) solr_dif = '+' + str(solr_val) + 'MONTH' elif dif_dt.days >= 1: solr_val = int(round(dif_dt.days, 0)) solr_dif = '+' + str(solr_val) + 'DAY' elif (dif_dt.seconds // 3600) >= 1: solr_val = int(round((dif_dt.seconds // 3600), 0)) solr_dif = '+' + str(solr_val) + 'HOUR' elif ((dif_dt.seconds % 3600) // 60) >= 1: solr_val = int(round(((dif_dt.seconds % 3600) // 60), 0)) solr_dif = '+' + str(solr_val) + 'MINUTE' elif dif_dt.seconds >= 1: solr_val = int(round(dif_dt.seconds, 0)) solr_dif = '+' + str(solr_val) + 'SECOND' else: solr_dif = '+1YEAR' return solr_dif def add_solr_gap_to_date(self, date_val, solr_gap): """ adds a solr gap to 
a date_val """ solr_val = re.sub(r'[^\d.]', r'', solr_gap) solr_val = int(float(solr_val)) dt = self.date_convert(date_val) if 'YEAR' in solr_gap: dt = dt + datetime.timedelta(days=int(round((solr_val * 365.25), 0))) elif 'MONTH' in solr_gap: dt = dt + datetime.timedelta(days=(solr_val * 30)) elif 'DAY' in solr_gap: dt = dt + datetime.timedelta(days=solr_val) elif 'HOUR' in solr_gap: dt = dt + datetime.timedelta(hours=solr_val) elif 'MINUTE' in solr_gap: dt = dt + datetime.timedelta(minutes=solr_val) elif 'SECOND' in solr_gap: dt = dt + datetime.timedelta(seconds=solr_val) else: dt = dt return dt def convert_date_to_solr_date(self, date_val): """ Conversts a string for a date into a Solr formated datetime string """ dt = self.date_convert(date_val) return dt.strftime('%Y-%m-%dT%H:%M:%SZ') def make_human_readable_date(self, date_val): """ Converts a date value into something easier to read """ dt = self.date_convert(date_val) check_date = dt.strftime('%Y-%m-%d') check_dt = self.date_convert(date_val) if check_dt == dt: return check_date else: return dt.strftime('%Y-%m-%d:%H:%M:%S') def date_convert(self, date_val): """ converts to a python datetime if not already so """ if isinstance(date_val, str): date_val = date_val.replace('Z', '') dt = datetime.datetime.strptime(date_val, '%Y-%m-%dT%H:%M:%S') else: dt = date_val return dt def get_parent_item_type_facet_field(self, category_uri): """ Gets the parent facet field for a given category_uri. This assumes the category_uri is an entity that exists in the database. """ output = False; parents = LinkRecursion().get_jsonldish_entity_parents(category_uri) for par in parents: if par['slug'] in self.TYPE_MAPPINGS.values(): # the parent exists in the Type Mappings output = par['slug'].replace('-', '_') + '___pred_id' break return output def get_parent_entity_facet_field(self, entity_uri): """ Gets the parent facet field for a given category_uri. This assumes the category_uri is an entity that exists in the database. 
""" output = False; parents = LinkRecursion().get_jsonldish_entity_parents(entity_uri) if isinstance(parents, list): if len(parents) > 1: # get the penultimate field output = parents[-2]['slug'].replace('-', '_') + '___pred_id' return output def process_item_type(self, raw_item_type): # TODO docstring query_dict = {'fq': [], 'facet.field': []} fq_terms = [] item_type_lists = self.expand_hierarchy_options(raw_item_type) for item_type_list in item_type_lists: i = 0 path_list_len = len(item_type_list) fq_path_terms = [] item_type = item_type_list[0] # no hiearchy in this field, just the type fq_term = 'item_type:' + item_type fq_terms.append(fq_term) if item_type in self.TYPE_MAPPINGS: act_field = self.TYPE_MAPPINGS[item_type].replace('-', '_') + '___pred_id' query_dict['facet.field'].append(act_field) fq_final = ' OR '.join(fq_terms) fq_final = '(' + fq_final + ')' query_dict['fq'].append(fq_final) return query_dict def process_id(self, identifier): # check for identifier query_dict = {'fq': [], 'facet.field': []} fq_terms = [] escape_id = self.escape_solr_arg(identifier) fq_terms.append('persistent_uri:' + escape_id) # now make a DOI URI in case this is just a naked DOI doi_uri = self.escape_solr_arg('http://dx.doi.org/' + identifier) fq_terms.append('persistent_uri:' + doi_uri) # now make an ARK URI in case this is just a naked ARK ark_uri = self.escape_solr_arg('http://n2t.net/' + identifier) fq_terms.append('persistent_uri:' + ark_uri) # now make an ORCID URI in case this is just a naked ORCID orcid_uri = self.escape_solr_arg('http://orcid.org/' + identifier) fq_terms.append('persistent_uri:' + orcid_uri) fq_terms.append('uuid:' + escape_id) tcheck = URImanagement.get_uuid_from_oc_uri(identifier, True) if tcheck is not False: uuid = tcheck['uuid'] fq_terms.append('uuid:' + uuid) fq_final = ' OR '.join(fq_terms) fq_final = '(' + fq_final + ')' query_dict['fq'].append(fq_final) # print(fq_final) return query_dict def process_form_use_life_chrono(self, raw_form_use_life_chrono): # creates facet query for form-use-life chronological tiles # supports or {'||') queries in the path also query_dict = {'fq': [], 'facet.field': []} fq_terms = [] query_dict['facet.field'].append('form_use_life_chrono_tile') if '||' in raw_form_use_life_chrono: chrono_paths = raw_form_use_life_chrono.split('||') else: chrono_paths = [raw_form_use_life_chrono] for chrono_path in chrono_paths: i = 0 if len(chrono_path) < 30: chrono_path += '*' fq_term = 'form_use_life_chrono_tile:' + chrono_path fq_terms.append(fq_term) fq_final = ' OR '.join(fq_terms) fq_final = '(' + fq_final + ')' query_dict['fq'].append(fq_final) return query_dict def process_form_date_chrono(self, form_use_life_date, date_type): # creates facet query for form-use-life dates # supports or {'||') queries in the path also query_dict = {'fq': [], 'facet.field': []} if date_type == 'start': qterm = '[' + str(form_use_life_date) + ' TO *]' fquery = 'form_use_life_chrono_earliest: ' + qterm else: qterm = '[* TO ' + str(form_use_life_date) + ']' fquery = 'form_use_life_chrono_latest: ' + qterm query_dict['fq'].append(fquery) return query_dict def process_discovery_geo(self, raw_disc_geo): # creates facet query for discovery geotiles # supports or {'||') queries in the path also query_dict = {'fq': [], 'facet.field': []} fq_terms = [] query_dict['facet.field'].append('discovery_geotile') if '||' in raw_disc_geo: disc_geo_paths = raw_disc_geo.split('||') else: disc_geo_paths = [raw_disc_geo] for disc_path in disc_geo_paths: i = 0 if len(disc_path) < 20: 
disc_path += '*' fq_term = 'discovery_geotile:' + disc_path fq_terms.append(fq_term) fq_final = ' OR '.join(fq_terms) fq_final = '(' + fq_final + ')' query_dict['fq'].append(fq_final) return query_dict def process_discovery_bbox(self, raw_disc_bbox): # creates facet query for bounding box searches # supports or {'||') queries query_dict = {'fq': []} fq_terms = [] if '||' in raw_disc_bbox: bbox_list = raw_disc_bbox.split('||') else: bbox_list = [raw_disc_bbox] for bbox in bbox_list: if ',' in bbox: # comma seperated list of coordinates bbox_coors = bbox.split(',') bbox_valid = self.validate_bbox_coordiantes(bbox_coors) if bbox_valid: # valid bounding box, now make a solr-query # not how solr expacts latitude / longitude order, which # is the revserse of geojson! fq_term = 'discovery_geolocation:' fq_term += '[' + str(bbox_coors[1]) + ',' + str(bbox_coors[0]) fq_term += ' TO ' + str(bbox_coors[3]) + ',' + str(bbox_coors[2]) fq_term += ']' fq_terms.append(fq_term) if len(fq_terms) > 0: fq_final = ' OR '.join(fq_terms) fq_final = '(' + fq_final + ')' query_dict['fq'].append(fq_final) return query_dict def validate_bbox_coordiantes(self, bbox_coors): """ validates a set of bounding box coordinates """ is_valid = False if len(bbox_coors) == 4: lower_left_valid = self.validate_geo_lon_lat(bbox_coors[0], bbox_coors[1]) top_right_valid = self.validate_geo_lon_lat(bbox_coors[2], bbox_coors[3]) # print('ok: ' + str(lower_left_valid) + ' ' + str(top_right_valid)) if lower_left_valid and top_right_valid: if float(bbox_coors[0]) < float(bbox_coors[2]) and\ float(bbox_coors[1]) < float(bbox_coors[3]): is_valid = True return is_valid def validate_geo_lon_lat(self, lon, lat): """ checks to see if a lon, lat pair are valid. Note the GeoJSON ordering of the coordinates """ is_valid = False lon_valid = self.validate_geo_coordinate(lon, 'lon') lat_valid = self.validate_geo_coordinate(lat, 'lat') if lon_valid and lat_valid: is_valid = True return is_valid def validate_geo_coordinate(self, coordinate, coord_type): """ validates a geo-spatial coordinate """ is_valid = False try: fl_coord = float(coordinate) except ValueError: fl_coord = False if fl_coord is not False: if 'lat' in coord_type: if fl_coord <= 90 and\ fl_coord >= -90: is_valid = True elif 'lon' in coord_type: if fl_coord <= 180 and\ fl_coord >= -180: is_valid = True return is_valid def make_solr_value_from_entity(self, entity, value_type='id'): """ makes a solr value as indexed in SolrDocument see _concat_solr_string_value """ id_part = entity.uri if 'http://opencontext.org' in entity.uri: if '/vocabularies/' not in entity.uri: id_part = entity.uri.split('http://opencontext.org')[1] return entity.slug + '___' + value_type + '___' + \ id_part + '___' + entity.label return output def _process_spatial_context(self, spatial_context=None): # TODO docstring context = {} if spatial_context: context_paths = self._get_context_paths(spatial_context) context_slugs = self._get_valid_context_slugs(context_paths) # print('Context slugs: ' + str(context_slugs)) # If we cannot find a valid context, raise a 404 if not context_slugs: raise Http404 # Solr 'fq' parameters parent_child_slugs = [] # Solr 'facet.field' parameters facet_field = [] for slug in context_slugs: # fq parameters parent_child_slugs.append(self._get_parent_slug(slug) + '___' + slug) # facet.field parameters facet_field.append(slug.replace('-', '_') + '___context_id') # First, handle the most likely scenario of a single context if len(parent_child_slugs) == 1: context['fq'] = 
self._prepare_filter_query(parent_child_slugs[0]) # Otherwise, combine multiple contexts into an OR filter else: fq_string = ' OR '.join( (self._prepare_filter_query(slug_set) for slug_set in parent_child_slugs) ) context['fq'] = '(' + fq_string + ')' context['facet.field'] = facet_field # No spatial context provided else: context['fq'] = None context['facet.field'] = ['root___context_id'] return context def prep_string_search_term(self, raw_term): """ prepares a string search returns a list of search terms for AND queries """ if '"' in raw_term: nq_term = raw_term.replace('"', ' ') # get rid of quotes in the search term quoted_list = re.findall(r"\"(.*?)\"", raw_term) terms = [] terms.append(self.escape_solr_arg(nq_term)) for quote_item in quoted_list: quote_item = self.escape_solr_arg(quote_item) # escape characters quote_item = '"' + quote_item + '"' # put quotes back around it terms.append(quote_item) else: terms = [] terms.append(self.escape_solr_arg(raw_term)) return terms def escaped_seq(self, term): """ Yield the next string based on the next character (either this char or escaped version """ escaperules = {'+': r'\+', '-': r'\-', '&': r'\&', '|': r'\|', '!': r'\!', '(': r'\(', ')': r'\)', '{': r'\{', '}': r'\}', '[': r'\[', ']': r'\]', '^': r'\^', '~': r'\~', '*': r'\*', '?': r'\?', ':': r'\:', '"': r'\"', ';': r'\;', ' ': r'\ '} for char in term: if char in escaperules.keys(): yield escaperules[char] else: yield char def escape_solr_arg(self, term): """ Apply escaping to the passed in query terms escaping special characters like : , etc""" term = term.replace('\\', r'\\') # escape \ first return "".join([next_str for next_str in self.escaped_seq(term)])
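# --- Illustrative sketch (not part of the original module) -----------------
# Same idea as escape_solr_arg() / escaped_seq() above, reduced to a few
# lines so the intent is easy to see; the character set mirrors the
# escaperules dict and the sample term is made up.
def _example_escape_solr_term(term='bone: cattle (Bos)'):
    specials = set('+-&|!(){}[]^~*?:"; ')   # mirrors the escaperules keys
    term = term.replace('\\', '\\\\')       # escape backslashes first
    return ''.join('\\' + ch if ch in specials else ch for ch in term)

# _example_escape_solr_term() -> r'bone\:\ cattle\ \(Bos\)'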
class RecordProperties(): """ Methods to make properties for individual record items useful for making geospatial feature records or lists of items without geospatial data """ ATTRIBUTE_DELIM = '; ' # delimiter for multiple attributes def __init__(self, request_dict_json=False): self.uuid = False self.uri = False # cannonical uri for the item self.href = False # link to the item in the current deployment self.cite_uri = False # stable / persistent uri self.label = False self.item_type = False self.updated = False self.published = False self.project_href = False # link to the project in current deployment self.project_uri = False # cannonical uri for the project self.project_label = False self.context_href = False # link to parent context in current deployment self.context_uri = False # link to parent context cannonical uri self.context_label = False self.category = False self.latitude = False self.longitude = False self.geojson = False self.early_date = False self.late_date = False self.thumbnail_href = False self.thumbnail_uri = False self.thumbnail_scr = False self.preview_scr = False self.fullfile_scr = False self.snippet = False self.cite_uri = False # stable identifier as an HTTP uri self.other_attributes = False # other attributes to the record # flatten list of an attribute values to single value self.flatten_rec_attributes = False # A list of (non-standard) attributes to include in a record self.rec_attributes = [] self.attribute_hierarchies = {} self.base_url = settings.CANONICAL_HOST rp = RootPath() self.base_url = rp.get_baseurl() self.mem_cache_obj = MemoryCache() # memory caching object self.request_dict_json = request_dict_json if request_dict_json is not False: self.request_dict = json.loads(request_dict_json) else: self.request_dict = False self.highlighting = False self.recursive_count = 0 self.min_date = False self.max_date = False self.thumbnail_data = {} self.media_file_data = {} self.string_attrib_data = {} def parse_solr_record(self, solr_rec): """ Parses a solr rec object """ if isinstance(solr_rec, dict): self.get_item_basics(solr_rec) self.get_citation_uri(solr_rec) self.get_lat_lon(solr_rec) self.get_category(solr_rec) self.get_project(solr_rec) self.get_context(solr_rec) self.get_time(solr_rec) # get time information, limiting date ranges to query constaints self.get_thumbnail(solr_rec) self.get_media_files(solr_rec) self.get_snippet(solr_rec) # get snippet of highlighted text self.get_attributes(solr_rec) # get non-standard attributes self.get_string_attributes(solr_rec) # get non-standard string attributes def get_item_basics(self, solr_rec): """ get basic metadata for an item """ output = False if isinstance(solr_rec, dict): if 'uuid' in solr_rec: self.uuid = solr_rec['uuid'] if 'slug_type_uri_label' in solr_rec: id_parts = self.parse_solr_value_parts(solr_rec['slug_type_uri_label']) if id_parts is not False: output = True self.uri = self.make_url_from_val_string(id_parts['uri'], True) self.href = self.make_url_from_val_string(id_parts['uri'], False) item_type_output = URImanagement.get_uuid_from_oc_uri(self.uri, True) self.item_type = item_type_output['item_type'] self.label = id_parts['label'] if 'updated' in solr_rec: self.updated = solr_rec['updated'] if 'published' in solr_rec: self.published = solr_rec['published'] return output def get_snippet(self, solr_rec): """ get a text highlighting snippet """ if isinstance(self.highlighting, dict): if self.uuid is False: if 'uuid' in solr_rec: self.uuid = solr_rec['uuid'] if self.uuid in self.highlighting: if 
'text' in self.highlighting[self.uuid]: text_list = self.highlighting[self.uuid]['text'] self.snippet = ' '.join(text_list) # some processing to remove fagments of HTML markup. self.snippet = self.snippet.replace('<em>', '[[[[mark]]]]') self.snippet = self.snippet.replace('</em>', '[[[[/mark]]]]') try: self.snippet = '<div>' + self.snippet + '</div>' self.snippet = lxml.html.fromstring(self.snippet).text_content() self.snippet = strip_tags(self.snippet) except: self.snippet = strip_tags(self.snippet) self.snippet = self.snippet.replace('[[[[mark]]]]', '<em>') self.snippet = self.snippet.replace('[[[[/mark]]]]', '</em>') def get_citation_uri(self, solr_rec): """ gets the best citation / persistent uri for the item """ if 'persistent_uri' in solr_rec: for p_uri in solr_rec['persistent_uri']: self.cite_uri = p_uri if 'dx.doi.org' in p_uri: break # stop looking once we have a DOI, the best def get_lat_lon(self, solr_rec): """ gets latitute and longitude information """ if 'discovery_geolocation' in solr_rec: geo_strings = solr_rec['discovery_geolocation'] geo_coords_str = geo_strings.split(',') # NOT geojson ording, since solr uses lat/lon ordering self.latitude = float(geo_coords_str[0]) self.longitude = float(geo_coords_str[1]) def get_category(self, solr_rec): """ Gets the most specific category for the item """ self.recursive_count = 0 cat_hierarchy = self.get_category_hierarchy(solr_rec) if len(cat_hierarchy) > 0: self.category = cat_hierarchy[-1]['label'] def get_context(self, solr_rec): """ Get the most specific context parent for the record """ self.recursive_count = 0 contexts = self.extract_hierarchy(solr_rec, SolrDocument.ROOT_CONTEXT_SOLR, '___context', []) if len(contexts) > 0: self.context_label = self.make_context_path_label(contexts) self.context_uri = self. make_context_link(contexts, True) self.context_href = self. 
make_context_link(contexts, False) def get_project(self, solr_rec): """ Get the most specific project for the record """ self.recursive_count = 0 projects = self.extract_hierarchy(solr_rec, SolrDocument.ROOT_PROJECT_SOLR, '___project', []) if len(projects) > 0: self.project_label = projects[-1]['label'] self.project_uri = self.make_url_from_val_string(projects[-1]['uri'], True) self.project_href = self.make_url_from_val_string(projects[-1]['uri'], False) def get_time(self, solr_rec): """ parses time information """ early_list = False late_list = False if 'form_use_life_chrono_earliest' in solr_rec: early_list = solr_rec['form_use_life_chrono_earliest'] if 'form_use_life_chrono_latest' in solr_rec: late_list = solr_rec['form_use_life_chrono_latest'] if isinstance(early_list, list): date_list = early_list else: date_list = [] if isinstance(late_list, list): date_list += late_list if len(date_list) > 0: min_max = self.get_list_min_max(date_list) self.early_date = min(min_max) self.late_date = max(min_max) def get_list_min_max(self, date_list): """ Returns the minimum and maximum dates from a date list, constrained by preset min and max dates """ min_date = False max_date = False # print(str(date_list)) if isinstance(date_list, list): date_list.sort() for date in date_list: if self.min_date is not False: if date >= self.min_date \ and min_date is False: min_date = date if self.max_date is not False: if date <= self.max_date: max_date = date if min_date is False: min_date = self.min_date if max_date is False: max_date = self.max_date return [min_date, max_date] def get_thumbnail(self, solr_rec): """ get media record and thumbnai if it exists """ if 'uuid' in solr_rec: uuid = solr_rec['uuid'] if uuid in self.thumbnail_data: if self.thumbnail_data[uuid] is not False: self.thumbnail_href = self.thumbnail_data[uuid]['href'] self.thumbnail_uri = self.thumbnail_data[uuid]['uri'] self.thumbnail_scr = self.thumbnail_data[uuid]['scr'] rp = RootPath() self.thumbnail_scr = rp.convert_to_https(self.thumbnail_scr) else: # did not precache thumbnail data, get an indivitual record self.get_thumbnail_from_database(solr_rec) def get_media_files(self, solr_rec): """ get media record and thumbnai if it exists """ if 'uuid' in solr_rec: uuid = solr_rec['uuid'] if uuid in self.media_file_data: if self.media_file_data[uuid] is not False: rp = RootPath() for file_type, file_uri in self.media_file_data[uuid].items(): if file_type == 'oc-gen:thumbnail': self.thumbnail_scr = rp.convert_to_https(file_uri) elif file_type == 'oc-gen:preview': self.preview_scr = rp.convert_to_https(file_uri) elif file_type == 'oc-gen:fullfile': self.fullfile_scr = rp.convert_to_https(file_uri) def get_thumbnail_from_database(self, solr_rec): """ get media record and thumbnail, if it exists """ if 'uuid' in solr_rec: uuid = solr_rec['uuid'] thumb = [] if self.item_type != 'media': media_item = Assertion.objects\ .filter(uuid=uuid, object_type='media')[:1] if len(media_item) > 0: muuid = media_item[0].object_uuid thumb = Mediafile.objects\ .filter(uuid=muuid, file_type='oc-gen:thumbnail')[:1] else: # do this for media items muuid = uuid thumb = Mediafile.objects\ .filter(uuid=uuid, file_type='oc-gen:thumbnail')[:1] if len(thumb) > 0: self.thumbnail_href = self.base_url + '/media/' + muuid self.thumbnail_uri = settings.CANONICAL_HOST + '/media/' + muuid self.thumbnail_scr = thumb[0].file_uri def get_category_hierarchy(self, solr_rec): """ gets the most specific category informtation about an item """ cat_hierarchy = [] if 'item_type' in 
solr_rec: item_type = solr_rec['item_type'][0] root_cat_field = 'oc_gen_' + item_type + '___pred_id' cat_hierarchy = self.extract_hierarchy(solr_rec, root_cat_field, '___pred', []) return cat_hierarchy """ The following seciton of code processes non-default attributes for records """ def get_attributes(self, solr_rec): """ gets attributes for a record, based on the predicates requested in the search and optional predicates passed by a client with a GET request with parameter 'attributes' """ qm = QueryMaker() solr_field_entities = {} for attribute in self.rec_attributes: entity = self.mem_cache_obj.get_entity(attribute, False) if entity is not False: prop_slug = entity.slug # check to make sure we have the entity data type for linked fields if entity.data_type is False and entity.item_type == 'uri': dtypes = self.mem_cache_obj.get_dtypes(entity.uri) if isinstance(dtypes, list): # set te data type and the act-field # print('Found for ' + prop_slug + ' ' + dtypes[0]) entity.data_type = dtypes[0] field_parts = qm.make_prop_solr_field_parts(entity) solr_field = field_parts['prefix'] + '___pred_' + field_parts['suffix'] # print('Found: ' + solr_field) # extract children of the solr_field so we know if # we have the most specific attributes, then we can get # values for the most specific attributes self.extract_attribute_children(solr_rec, solr_field) self.clean_attribute_hiearchies() if isinstance(self.attribute_hierarchies, dict): self.other_attributes = [] for field_slug_key, values in self.attribute_hierarchies.items(): entity = self.mem_cache_obj.get_entity(field_slug_key, False) if entity is not False: attribute_dict = LastUpdatedOrderedDict() attribute_dict['property'] = entity.label attribute_dict['values_list'] = [] attribute_dict['value'] = '' string_val = False delim = '' for val in values: if isinstance(val, str): string_val = True parsed_val = self.parse_solr_value_parts(val) attribute_dict["values_list"].append(parsed_val['label']) attribute_dict['value'] += delim + str(parsed_val['label']) else: attribute_dict["values_list"].append(val) attribute_dict['value'] += delim + str(val) delim = self.ATTRIBUTE_DELIM if len(values) == 1 \ and string_val is False: attribute_dict['value'] = values[0] self.other_attributes.append(attribute_dict) def get_string_attributes(self, solr_rec): """ gets string attributes for a solr rec, from a previous database query needed because solr does not cache string field data """ if isinstance(self.string_attrib_data, dict): # now add predicate attributes for string predicates, from the database if 'uuid' in solr_rec and 'data' in self.string_attrib_data: uuid = solr_rec['uuid'] if uuid in self.string_attrib_data['data']: item_data = self.string_attrib_data['data'][uuid] for pred_uuid, values_list in item_data.items(): act_attribute = self.string_attrib_data['pred_ents'][pred_uuid] act_attribute['values_list'] = values_list act_attribute['value'] = self.ATTRIBUTE_DELIM.join(values_list) self.other_attributes.append(act_attribute) def prevent_attribute_key_collision(self, item_prop_dict, prop_key): """ checks to make sure there's no collision between the prop_key and the dict that it will be added to """ i = 2 output_prop_key = prop_key while output_prop_key in item_prop_dict: output_prop_key = prop_key + '[' + str(i) + ']' i += 1 return output_prop_key def clean_attribute_hiearchies(self): """ some post-processing to make sure we have clean attribute hierarchies """ if isinstance(self.attribute_hierarchies, dict): # print('check: ' + 
str(self.attribute_hierarchies)) temp_attribute_hierarchies = self.attribute_hierarchies clean_attribute_hiearchies = {} for solr_field_key, field_char in self.attribute_hierarchies.items(): if field_char['most-specific']: par_field_ex = solr_field_key.split('___') # last two parts make the suffix, a pred-slug[-2] and a field type [-1] pred_suffix = '___' + par_field_ex[-2] + '___' + par_field_ex[-1] specific_ok = True for val in field_char['values']: if isinstance(val, str): # print('check:' + solr_field_key + ' val: ' + val) parsed_val = self.parse_solr_value_parts(val) check_field = parsed_val['slug'].replace('-', '_') check_field += '___pred_' + parsed_val['data_type'] if check_field in temp_attribute_hierarchies: # note a field is NOT at the most specific level specific_ok = False else: # now check a version with the predicate as part of # the solr field check_field = parsed_val['slug'].replace('-', '_') check_field += pred_suffix if check_field in temp_attribute_hierarchies: # note a field is NOT at the most specific level specific_ok = False if specific_ok: # ok to add # print('checked OK: ' + solr_field_key) clean_attribute_hiearchies[solr_field_key] = field_char # now that we got rid of problem fields, lets sort these for consistent # rendering self.attribute_hierarchies = LastUpdatedOrderedDict() keys = LastUpdatedOrderedDict() # order of key types, we want id fields, followed by numeric then date key_types = ['___pred_id', '___pred_numeric', '___pred_date'] for key_type in key_types: keys[key_type] = [] for solr_field_key, field_char in clean_attribute_hiearchies.items(): if key_type in solr_field_key: keys[key_type].append(solr_field_key) # sort alphabetically. Slugs useful, since they will cluster predicates # from similar vocabularies keys[key_type].sort() for key in keys[key_type]: field_char = clean_attribute_hiearchies[key] field_ex = key.split('___') # the penultimate part is the predicate field_slug = field_ex[-2].replace('_', '-') if field_slug not in self.attribute_hierarchies: self.attribute_hierarchies[field_slug] = [] for val in field_char['values']: if val not in self.attribute_hierarchies[field_slug]: self.attribute_hierarchies[field_slug].append(val) def extract_attribute_children(self, solr_rec, solr_field_key): """ extracts ALL children from the hiearchy of a solr_field_key """ is_field = False if solr_field_key not in self.attribute_hierarchies: # so we don't look at the same thing twice! 
if solr_field_key in solr_rec: is_field = True field_char = {'most-specific': False, 'values': []} if '___pred_numeric' in solr_field_key: # numeric fields hold literal values, so they are already at the most specific level field_char['most-specific'] = True field_char['values'] = solr_rec[solr_field_key] elif '___pred_id' in solr_field_key: # make a suffix from the parent predicate slug and field type par_field_ex = solr_field_key.split('___') # last two parts make the suffix, a pred-slug[-2] and a field type [-1] pred_suffix = '___' + par_field_ex[-2] + '___' + par_field_ex[-1] childless_children = [] for child_val in solr_rec[solr_field_key]: # print('Child: ' + solr_field_key + ': ' + child_val) parsed_path_item = self.parse_solr_value_parts(child_val) new_field_prefix = parsed_path_item['slug'].replace('-', '_') new_field_key = new_field_prefix + '___pred_' + parsed_path_item['data_type'] if parsed_path_item['data_type'] == 'id': child_is_field = self.extract_attribute_children(solr_rec, new_field_key) if child_is_field is False: # now check an alternative combining the child # slug with the predicate of the parent new_field_key = new_field_prefix + pred_suffix # print('check: ' + new_field_key) child_is_field = self.extract_attribute_children(solr_rec, new_field_key) if child_is_field is False: childless_children.append(child_val) if len(childless_children) > 0: field_char['most-specific'] = True field_char['values'] = childless_children else: pass self.attribute_hierarchies[solr_field_key] = field_char return is_field def extract_hierarchy(self, solr_rec, facet_field_key, facet_suffix, hierarchy=None, pred_field=False): """ extracts a hierarchy from a solr_record. The output is a list starting with the most general parent of the hierarchy, then going to the most specific. This is a recursive function that by default starts with the root of the hierarchy as the facet_field_key. This only follows a single path (not multiple paths). """ if hierarchy is None: # avoid sharing a mutable default list between calls hierarchy = [] alt_facet_field_key = facet_field_key if pred_field is not False: # do this to allow search of a hierarchy in a named # predicate field f_parts = facet_field_key.split('___') if len(f_parts) == 2: alt_f_parts = [f_parts[0], pred_field.replace('-', '_'), f_parts[1]] alt_facet_field_key = '___'.join(alt_f_parts) # print('Check: ' + facet_field_key + ', ' + alt_facet_field_key) if (facet_field_key in solr_rec or alt_facet_field_key in solr_rec)\ and self.recursive_count < 20: self.recursive_count += 1 if facet_field_key in solr_rec: path_item_val = solr_rec[facet_field_key][0] else: path_item_val = solr_rec[alt_facet_field_key][0] parsed_path_item = self.parse_solr_value_parts(path_item_val) if isinstance(parsed_path_item, dict): hierarchy.append(parsed_path_item) new_facet_field = parsed_path_item['slug'].replace('-', '_') new_facet_field += facet_suffix + '_' + parsed_path_item['data_type'] # print('New hierarchy field: ' + new_facet_field) hierarchy = self.extract_hierarchy(solr_rec, new_facet_field, facet_suffix, hierarchy) return hierarchy def make_context_path_label(self, contexts): """ Makes a '/' delimited context path for easy human readability """ context_path = False if len(contexts) > 0: context_labels = [] for context in contexts: context_labels.append(context['label']) context_path = '/'.join(context_labels) return context_path def make_context_link(self, contexts, cannonical=False): """ makes a URI for a context """ context_uri = False if len(contexts) > 0: context_uri = self.make_url_from_val_string(contexts[-1]['uri'], cannonical) return context_uri def make_url_from_val_string(self, partial_url, use_cannonical=True): """ parses a
solr value if it has '___' delimiters, to get the URI part string. if it's already a URI part, it makes a URL """ if use_cannonical: base_url = settings.CANONICAL_HOST else: base_url = self.base_url solr_parts = self.parse_solr_value_parts(partial_url) if isinstance(solr_parts, dict): partial_url = solr_parts['uri'] if 'http://' not in partial_url \ and 'https://' not in partial_url: url = base_url + partial_url else: url = partial_url return url def add_record_fields(self): """ adds fields to include in the GeoJSON properties """ if 'rec-field' in self.response_dict: raw_rec_fields = self.response_dict['rec-field'][0] if ',' in raw_rec_fields: self.record_fields = raw_rec_fields.split(',') else: self.record_fields = [raw_rec_fields] else: self.record_fields = [] return self.record_fields def parse_solr_value_parts(self, solr_value): """ parses a solr_value string into slug, solr-data-type, uri, and label parts """ output = False if isinstance(solr_value, str): if '___' in solr_value: solr_ex = solr_value.split('___') if len(solr_ex) == 4: output = {} output['slug'] = solr_ex[0] output['data_type'] = solr_ex[1] output['uri'] = solr_ex[2] output['label'] = solr_ex[3] else: output = solr_value else: output = solr_value return output def get_solr_record_uuid_type(self, solr_rec): """ get item uuid, label, and type from a solr_rec """ output = False if isinstance(solr_rec, dict): output = {'uuid': False, 'label': False, 'item_type': False} if 'uuid' in solr_rec: output['uuid'] = solr_rec['uuid'] if 'slug_type_uri_label' in solr_rec: id_parts = self.parse_solr_value_parts(solr_rec['slug_type_uri_label']) if id_parts is not False: uri = self.make_url_from_val_string(id_parts['uri'], True) item_type_output = URImanagement.get_uuid_from_oc_uri(uri, True) output['item_type'] = item_type_output['item_type'] output['label'] = id_parts['label'] return output def get_key_val(self, key, dict_obj): """ returns the value associated with a key, if the key exists else, none """ output = None if isinstance(dict_obj, dict): if key in dict_obj: output = dict_obj[key] return output
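# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: the
# 'slug___data-type___uri___label' convention that parse_solr_value_parts()
# above relies on. The sample value is hypothetical, but it follows the same
# four-part '___' delimited layout.
# ---------------------------------------------------------------------------
def parse_solr_value_parts_sketch(solr_value):
    """Splits a '___' delimited solr value into slug, data_type, uri, and label."""
    if not isinstance(solr_value, str) or '___' not in solr_value:
        # Not a composite value; hand it back unchanged, as the method above does.
        return solr_value
    parts = solr_value.split('___')
    if len(parts) != 4:
        return solr_value
    return {'slug': parts[0], 'data_type': parts[1], 'uri': parts[2], 'label': parts[3]}

# Example (hypothetical value):
# parse_solr_value_parts_sketch('24-coin___id___/subjects/abc-123___Coin')
# -> {'slug': '24-coin', 'data_type': 'id', 'uri': '/subjects/abc-123', 'label': 'Coin'}
# make_url_from_val_string() above then prefixes a relative URI like
# '/subjects/abc-123' with either the canonical host or the local base URL.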
class SolrUUIDs(): """ methods to make get UUIDs from a solr search result JSON document, also makes URIs """ def __init__(self, response_dict_json=False): rp = RootPath() self.base_url = rp.get_baseurl() self.uuids = [] self.uris = [] self.mem_cache_obj = MemoryCache() # memory caching object self.response_dict_json = response_dict_json self.highlighting = False # make values to these fields "flat" not a list self.flatten_rec_fields = True self.total_found = False self.rec_start = False self.min_date = False self.max_date = False # flatten list of an attribute values to single value self.flatten_rec_attributes = False # A list of (non-standard) attributes to include in a record self.rec_attributes = [] self.do_media_thumbs = True # get thumbnails for records self.get_all_media = False # get links to all media files for an item def make_uuids_from_solr(self, solr_json): """ makes geojson-ld point records from a solr response """ #first do lots of checks to make sure the solr-json is OK solr_recs = self.extract_solr_recs(solr_json) if isinstance(solr_recs, list): for solr_rec in solr_recs: if 'uuid' in solr_rec: uuid = solr_rec['uuid'] self.uuids.append(uuid) return self.uuids def make_uris_from_solr(self, solr_json, uris_only=True): """ processes the solr_json to make GeoJSON records """ solr_recs = self.extract_solr_recs(solr_json) if isinstance(solr_recs, list): if uris_only: self.do_media_thumbs = False if self.get_all_media: self.do_media_thumbs = False if 'thumbnail' in self.rec_attributes: self.do_media_thumbs = True thumbnail_data = self.get_media_thumbs(solr_recs) media_file_data = self.get_all_media_files(solr_recs) string_attrib_data = self.get_string_rec_attributes(solr_recs) for solr_rec in solr_recs: rec_props_obj = RecordProperties(self.response_dict_json) rec_props_obj.mem_cache_obj = self.mem_cache_obj rec_props_obj.min_date = self.min_date rec_props_obj.max_date = self.max_date rec_props_obj.highlighting = self.highlighting rec_props_obj.flatten_rec_attributes = self.flatten_rec_attributes rec_props_obj.rec_attributes = self.rec_attributes rec_props_obj.thumbnail_data = thumbnail_data rec_props_obj.media_file_data = media_file_data rec_props_obj.string_attrib_data = string_attrib_data item_ok = rec_props_obj.get_item_basics(solr_rec) if item_ok: if uris_only: item = rec_props_obj.uri else: rec_props_obj.parse_solr_record(solr_rec) self.mem_cache_obj = rec_props_obj.mem_cache_obj # add to existing list of entities, reduce lookups item = self.make_item_dict_from_rec_props_obj(rec_props_obj) self.uris.append(item) return self.uris def make_item_dict_from_rec_props_obj(self, rec_props_obj, cannonical=True): """ makes item dictionary object from a record prop obj """ item = LastUpdatedOrderedDict() item['uri'] = rec_props_obj.uri if cannonical is False or 'href' in self.rec_attributes: item['href'] = rec_props_obj.href item['citation uri'] = rec_props_obj.cite_uri item['label'] = rec_props_obj.label item['project label'] = rec_props_obj.project_label if cannonical: item['project uri'] = rec_props_obj.project_uri else: item['project href'] = rec_props_obj.project_href item['context label'] = rec_props_obj.context_label if cannonical: item['context uri'] = rec_props_obj.context_uri else: item['context href'] = rec_props_obj.context_href item['latitude'] = rec_props_obj.latitude item['longitude'] = rec_props_obj.longitude item['early bce/ce'] = rec_props_obj.early_date item['late bce/ce'] = rec_props_obj.late_date item['item category'] = rec_props_obj.category if 
rec_props_obj.snippet is not False: item['snippet'] = rec_props_obj.snippet if rec_props_obj.thumbnail_scr is not False: item['thumbnail'] = rec_props_obj.thumbnail_scr if rec_props_obj.preview_scr is not False: item['preview'] = rec_props_obj.preview_scr if rec_props_obj.fullfile_scr is not False: item['primary-file'] = rec_props_obj.fullfile_scr item['published'] = rec_props_obj.published item['updated'] = rec_props_obj.updated if isinstance(rec_props_obj.other_attributes, list): for attribute in rec_props_obj.other_attributes: prop_key = attribute['property'] prop_key = rec_props_obj.prevent_attribute_key_collision(item, prop_key) if self.flatten_rec_attributes: if 'value' in attribute: item[prop_key] = attribute['value'] elif 'values_list' in attribute: item[prop_key] = RecordProperties.ATTRIBUTE_DELIM.join(attribute['values_list']) else: item[prop_key] = attribute['values_list'] return item def extract_solr_recs(self, solr_json): """ extracts solr_recs along with some basic metadata from solr_json """ solr_recs = False if isinstance(solr_json, dict): try: self.total_found = solr_json['response']['numFound'] except KeyError: self.total_found = False try: self.rec_start = solr_json['response']['start'] except KeyError: self.rec_start = False try: self.highlighting = solr_json['highlighting'] except KeyError: self.highlighting = False try: solr_recs = solr_json['response']['docs'] except KeyError: solr_recs = False return solr_recs def get_media_thumbs(self, solr_recs): """ gets media thumbnail items """ thumb_results = {} not_media_uuids = [] media_uuids = [] rec_props_obj = RecordProperties(self.response_dict_json) for solr_rec in solr_recs: item = rec_props_obj.get_solr_record_uuid_type(solr_rec) if item is not False: uuid = item['uuid'] if item['item_type'] != 'media': not_media_uuids.append(uuid) else: media_uuids.append(uuid) thumb_results[uuid] = False if len(not_media_uuids) > 0: if self.do_media_thumbs: # only get media_thumbnails if needed rows = self.get_thumbs_for_non_media(not_media_uuids) for row in rows: uuid = row['uuid'] thumb_obj = {} thumb_obj['href'] = self.base_url + '/media/' + row['media_uuid'] thumb_obj['uri'] = settings.CANONICAL_HOST + '/media/' + row['media_uuid'] thumb_obj['scr'] = row['file_uri'] if thumb_results[uuid] is False: thumb_results[uuid] = thumb_obj if len(media_uuids) > 0: thumbs = Mediafile.objects\ .filter(uuid__in=media_uuids, file_type='oc-gen:thumbnail') for thumb in thumbs: uuid = thumb.uuid thumb_obj = {} thumb_obj['href'] = self.base_url + '/media/' + thumb.uuid thumb_obj['uri'] = settings.CANONICAL_HOST + '/media/' + thumb.uuid thumb_obj['scr'] = thumb.file_uri thumb_results[uuid] = thumb_obj return thumb_results def get_all_media_files(self, solr_recs): """ gets media thumbnail items """ media_file_results = {} if self.get_all_media: media_uuids = [] rec_props_obj = RecordProperties(self.response_dict_json) for solr_rec in solr_recs: item = rec_props_obj.get_solr_record_uuid_type(solr_rec) if item is not False: uuid = item['uuid'] if item['item_type'] == 'media': media_uuids.append(uuid) media_file_results[uuid] = False if len(media_uuids) > 0: media_files = Mediafile.objects\ .filter(uuid__in=media_uuids) for media_file in media_files: uuid = media_file.uuid if uuid not in media_file_results: media_file_results[uuid] = {} else: if media_file_results[uuid] is False: media_file_results[uuid] = {} media_file_results[uuid][media_file.file_type] = media_file.file_uri return media_file_results def get_thumbs_for_non_media(self, uuid_list): 
q_uuids = self.make_query_uuids(uuid_list) query = ('SELECT ass.uuid AS uuid, m.file_uri AS file_uri, ' 'm.uuid AS media_uuid ' 'FROM oc_assertions AS ass ' 'JOIN oc_mediafiles AS m ON ass.object_uuid = m.uuid ' 'AND m.file_type=\'oc-gen:thumbnail\' ' 'WHERE ass.uuid IN (' + q_uuids + ') ' 'GROUP BY ass.uuid, m.file_uri, m.uuid; ') cursor = connection.cursor() cursor.execute(query) rows = self.dictfetchall(cursor) return rows def make_query_uuids(self, uuid_list): """ makes a quoted, comma-separated string from a uuid list for use in a SQL IN clause """ uuid_q = [] for uuid in uuid_list: uuid = '\'' + uuid + '\'' uuid_q.append(uuid) return ', '.join(uuid_q) def dictfetchall(self, cursor): """ Return all rows from a cursor as a list of dicts """ columns = [col[0] for col in cursor.description] return [ dict(zip(columns, row)) for row in cursor.fetchall() ] def get_string_rec_attributes(self, solr_recs): """ gets string record attributes from the database. The solr index does not keep string-fields in memory """ output = {} str_attribs = {} for attribute in self.rec_attributes: entity = self.mem_cache_obj.get_entity(attribute, False) if entity is not False: prop_slug = entity.slug # check to make sure we have the entity data type for linked fields if entity.data_type is False and entity.item_type == 'uri': dtypes = self.mem_cache_obj.get_dtypes(entity.uri) if isinstance(dtypes, list): # set the data type and the act-field # print('Found for ' + prop_slug + ' ' + dtypes[0]) entity.data_type = dtypes[0] if entity.data_type == 'xsd:string': str_attribs[attribute] = entity if len(str_attribs) > 0: uuid_list = [] for solr_rec in solr_recs: if 'uuid' in solr_rec: uuid = str(solr_rec['uuid']) uuid_list.append(uuid) output = self.get_string_attributes(uuid_list, str_attribs) return output def get_string_attributes(self, uuid_list, str_attribute_ent_dict): """ Gets string attribute data for a solr dict """ output = {} pred_uuid_list = [] pred_uuid_objs = {} for key, entity in str_attribute_ent_dict.items(): if isinstance(entity.uuid, str): # add string predicate entity uuid to the list pred_uuid_list.append(entity.uuid) pred_uuid_objs[entity.uuid] = {'rec_attribute': key, 'property': entity.label, 'pred_uuid': entity.uuid, 'slug': entity.slug} if len(pred_uuid_list) > 0 and len(uuid_list) > 0: q_rows = self.get_string_attributes_sql(uuid_list, pred_uuid_list) dict_rows = {} for row in q_rows: # print(str(row)) # nest the rows in a dict keyed by item uuid, then by predicate uuid, # so multiple string values for the same predicate stay grouped together uuid = row['uuid'] pred_uuid = row['predicate_uuid'] content = row['content'] if uuid not in dict_rows: dict_rows[uuid] = {} if pred_uuid not in dict_rows[uuid]: dict_rows[uuid][pred_uuid] = [] if isinstance(content, str): dict_rows[uuid][pred_uuid].append(content) # print(str(dict_rows[uuid][pred_uuid])) output = {'pred_ents': pred_uuid_objs, 'data': dict_rows} return output def get_string_attributes_sql(self, uuid_list, pred_uuid_list): """ executes SQL query to get strings for the solr uuids and predicates """ q_uuids = self.make_query_uuids(uuid_list) p_uuids = self.make_query_uuids(pred_uuid_list) query = ('SELECT ass.uuid AS uuid, ass.predicate_uuid AS predicate_uuid, ' 's.content AS content ' 'FROM oc_assertions AS ass ' 'JOIN oc_strings AS s ON ass.object_uuid = s.uuid ' 'WHERE ass.uuid IN (' + q_uuids + ') AND ' 'ass.predicate_uuid IN (' + p_uuids + ') ' 'ORDER BY ass.uuid, ass.predicate_uuid, s.content; ') cursor = connection.cursor() cursor.execute(query) rows = self.dictfetchall(cursor) return rows
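# ---------------------------------------------------------------------------
# Sketch of an alternative to the string-built SQL above, not part of the
# original class: the same thumbnail lookup written with Django query
# parameters, so the uuid values are never spliced into the SQL text.
# Table and column names are copied from get_thumbs_for_non_media() above;
# the helper name is illustrative.
# ---------------------------------------------------------------------------
from django.db import connection

def get_thumbs_for_non_media_sketch(uuid_list):
    if not uuid_list:
        return []
    placeholders = ', '.join(['%s'] * len(uuid_list))
    query = (
        'SELECT ass.uuid AS uuid, m.file_uri AS file_uri, '
        'm.uuid AS media_uuid '
        'FROM oc_assertions AS ass '
        'JOIN oc_mediafiles AS m ON ass.object_uuid = m.uuid '
        'AND m.file_type = %s '
        'WHERE ass.uuid IN (' + placeholders + ') '
        'GROUP BY ass.uuid, m.file_uri, m.uuid; '
    )
    params = ['oc-gen:thumbnail'] + list(uuid_list)
    with connection.cursor() as cursor:
        cursor.execute(query, params)
        columns = [col[0] for col in cursor.description]
        return [dict(zip(columns, row)) for row in cursor.fetchall()]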
class ReadProjectContextVocabGraph(): """ Methods to read the project context vocabulary graph """ GLOBAL_VOCAB_GRAPH = [ { '@id': 'oc-pred:link', 'owl:sameAs': 'http://opencontext.org/predicates/oc-3', 'label': 'link', 'slug': 'link', 'oc-gen:predType': 'link', '@type': '@id' }, { '@id': Assertion.PREDICATES_NOTE, 'label': 'Note', 'owl:sameAs': False, 'slug': 'oc-gen-has-note', '@type': 'xsd:string' }, ] # predicates used for equivalence, used to make # inferred assertions REL_PREDICATES_FOR_INFERRENCE = ['skos:closeMatch', 'skos:exactMatch'] REL_MEASUREMENTS = [ 'cidoc-crm:P67_refers_to', 'oc-gen:has-technique', 'rdfs:range' ] ITEM_REL_PREDICATES = [ 'skos:closeMatch', 'skos:exactMatch', 'owl:sameAs', 'skos:related', 'skos:broader', 'dc-terms:references', 'dc-terms:hasVersion', 'http://nomisma.org/ontology#hasTypeSeriesItem' ] # Skip the following predicate keys when looking # for inferred linked data assertions in an observation. LINKDATA_OBS_PREDS_SKIP = [ 'id', 'type', ItemKeys.PREDICATES_OCGEN_SOURCEID, ItemKeys.PREDICATES_OCGEN_OBSTATUS, ItemKeys.PREDICATES_OCGEN_OBSLABEL, ItemKeys.PREDICATES_OCGEN_OBSNOTE, ] def __init__(self, proj_context_json_ld=None): self.m_cache = MemoryCache() self.context = None self.graph = None self.fail_on_missing_entities = False if not isinstance(proj_context_json_ld, dict): return None if '@context' in proj_context_json_ld: self.context = proj_context_json_ld['@context'] if '@graph' in proj_context_json_ld: self.graph = self.GLOBAL_VOCAB_GRAPH + proj_context_json_ld[ '@graph'] else: self.graph = self.GLOBAL_VOCAB_GRAPH logger.info('Read project graph size: {}'.format(len(self.graph))) def lookup_predicate(self, id): """looks up an Open Context predicate by an identifier (slud id, uri, slug, or uuid) """ output = self.lookup_oc_descriptor(id, 'predicates') return output def lookup_type(self, id): """looks up an Open Context type by an identifier (slud id, uri, slug, or uuid) """ output = self.lookup_oc_descriptor(id, 'types') return output def lookup_type_by_type_obj(self, type_obj): """looks up an Open Context type to get more information, including linked data equivalents by looking up the a type from how it is used as the object of a descriptive predicate in an observation """ type_ids = self.get_id_list_for_g_obj(type_obj) for type_id in type_ids: found_type_obj = self.lookup_type(type_id) if isinstance(found_type_obj, dict): return found_type_obj return type_obj def lookup_oc_descriptor(self, id, item_type): """looks up a predicate, or a type by an identifier (slud id, uri, slug, or uuid) """ cache_key = self.m_cache.make_cache_key( 'lookup_oc_descriptor_{}'.format(item_type), id) output = self.m_cache.get_cache_object(cache_key) if (output is None and isinstance(self.graph, list) and isinstance(id, str)): for g_obj in self.graph: id_list = self.get_id_list_for_g_obj(g_obj) if not id in id_list: continue output = g_obj if item_type == 'predicates' and '@type' not in g_obj: output[ '@type'] = self.get_predicate_datatype_for_graph_obj( g_obj) break if output: self.m_cache.save_cache_object(cache_key, output) if self.fail_on_missing_entities and not output: raise RuntimeError('Cannot find {}, item_type: {}'.format( id, item_type)) return output def get_predicate_datatype_for_graph_obj(self, g_obj): """ looks up a predicate data type for a given graph object """ slug_uri = self.get_id_from_g_obj(g_obj) datatype = self.get_predicate_datatype_by_slug_uri(slug_uri) return datatype def get_id_list_for_g_obj(self, g_obj): """gets a list of ids for an 
object""" id_list = [] id_keys = ['@id', 'id', 'owl:sameAs', 'slug', 'uuid'] if isinstance(g_obj, dict): for id_key in id_keys: if not id_key in g_obj: continue if g_obj[id_key] not in id_list: id_list.append(g_obj[id_key]) return id_list def get_id_from_g_obj(self, g_obj): """ gets the id form a g_obj, either the @id or id varient """ id_variants = ['@id', 'id'] id = None if not isinstance(g_obj, dict): return None for id_variant in id_variants: if id_variant not in g_obj: continue id = g_obj[id_variant] return id def get_predicate_datatype_by_slug_uri(self, slug_uri): """Looks up a predicate's datatype via the predicate slug URI.""" datatype = 'xsd:string' # Default to treating all as a string if (isinstance(self.context, dict) and isinstance(slug_uri, str)): if not slug_uri in self.context: return datatype for type_variant in ['@type', 'type']: if type_variant not in self.context[slug_uri]: continue datatype = self.context[slug_uri][type_variant] return datatype def get_equivalent_objects(self, info_dict): """ Gets equivalent linked data dicts associated with an info_dict. """ equiv_uris = [] equiv_objects = [] for rel_pred in self.REL_PREDICATES_FOR_INFERRENCE: if not rel_pred in info_dict: continue for equiv_obj in info_dict[rel_pred]: equiv_uri = self.get_id_from_g_obj(equiv_obj) if equiv_uri and equiv_uri not in equiv_uris: # Make sure that the equivalent URIs are unique. equiv_uris.append(equiv_uri) equiv_objects.append(equiv_obj) return equiv_objects def infer_assertions_for_item_json_ld(self, json_ld): """Makes a list of inferred assertions from item json ld """ lang_obj = Languages() inferred_assertions = [] if not isinstance(json_ld, dict): return inferred_assertions if not ItemKeys.PREDICATES_OCGEN_HASOBS in json_ld: return inferred_assertions unique_pred_assertions = LastUpdatedOrderedDict() for obs_dict in json_ld[ItemKeys.PREDICATES_OCGEN_HASOBS]: # Get the status of the observation, defaulting to 'active'. If # active, then it's OK to infer assertions, otherwise skip the # observation. obs_status = obs_dict.get(ItemKeys.PREDICATES_OCGEN_OBSTATUS, 'active') if obs_status != 'active': # Skip this observation. It's there but has a deprecated # status. continue for obs_pred_key, obj_values in obs_dict.items(): if obs_pred_key in self.LINKDATA_OBS_PREDS_SKIP: # Skip this obs_pred_key, it is a general # description of the observation, and will # not have any linked assertions to infer. continue obs_pred_info = self.lookup_predicate(obs_pred_key) pred_data_type = self.get_predicate_datatype_for_graph_obj( obs_pred_info) if not obs_pred_info: continue equiv_pred_objs = self.get_equivalent_objects(obs_pred_info) if not equiv_pred_objs: # No linked data equivalence for the obs_pred_key # so continue, skipping the rest. continue # Start with a None assertion. assertion = None # Iterate through all the equivalent predicate objects. for equiv_pred_obj in equiv_pred_objs: equiv_pred_uri = self.get_id_from_g_obj(equiv_pred_obj) # Inferred assertions will have unique LOD predicates, with # one or more values. The unique_pred_assertions dict makes # sure the LOD predicates are used only once. 
if not equiv_pred_uri in unique_pred_assertions: assertion = equiv_pred_obj assertion['type'] = pred_data_type assertion['ld_objects'] = LastUpdatedOrderedDict() assertion['oc_objects'] = LastUpdatedOrderedDict() assertion['literals'] = [] unique_pred_assertions[equiv_pred_uri] = assertion assertion = unique_pred_assertions[equiv_pred_uri] if assertion and equiv_pred_uri: # we have a LOD equvalient property if not isinstance(obj_values, list): obj_values = [obj_values] for obj_val in obj_values: literal_val = None if not isinstance(obj_val, dict): # the object of the assertion is not a dict, so it must be # a literal literal_val = obj_val if obj_val not in assertion['literals']: assertion['literals'].append(obj_val) elif 'xsd:string' in obj_val: literal_val = lang_obj.get_all_value_str( obj_val['xsd:string']) if literal_val and literal_val not in assertion[ 'literals']: assertion['literals'].append(literal_val) if literal_val is None: # Add any linked data equivalences by looking for this # type in the graph list obj_val = self.lookup_type_by_type_obj(obj_val) obj_uri = self.get_id_from_g_obj(obj_val) equiv_obj_objs = self.get_equivalent_objects( obj_val) if len(equiv_obj_objs): # We have LD equivalents for the object value for equiv_obj_obj in equiv_obj_objs: equiv_obj_uri = self.get_id_from_g_obj( equiv_obj_obj) if not biological_taxonomy_validation( equiv_pred_uri, equiv_obj_uri): # This object_uri does not belong to this # predicated uri. continue assertion['ld_objects'][ equiv_obj_uri] = equiv_obj_obj elif obj_uri: # We don't have LD equivalents for the object value # add to the oc_objects assertion['oc_objects'][obj_uri] = obj_val unique_pred_assertions[ equiv_pred_uri] = assertion for pred_key, assertion in unique_pred_assertions.items(): inferred_assertions.append(assertion) return inferred_assertions
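# ---------------------------------------------------------------------------
# Usage sketch for ReadProjectContextVocabGraph, to be run inside this
# module's environment. The project context and every identifier below are
# hypothetical, not Open Context data, but they follow the '@context' /
# '@graph' shapes that lookup_predicate() and
# get_predicate_datatype_by_slug_uri() read above.
# ---------------------------------------------------------------------------
example_proj_context = {
    '@context': {
        # maps a predicate slug URI to its datatype
        'oc-pred:24-weight': {'@type': 'xsd:double'},
    },
    '@graph': [
        {
            '@id': 'oc-pred:24-weight',
            'slug': '24-weight',
            'label': 'Weight',
        },
    ],
}

graph_reader = ReadProjectContextVocabGraph(example_proj_context)
# Matches on the 'slug' key of the '@graph' entry; because the entry has no
# '@type', lookup_oc_descriptor() fills it in from the '@context' mapping.
pred = graph_reader.lookup_predicate('24-weight')
datatype = graph_reader.get_predicate_datatype_by_slug_uri('oc-pred:24-weight')
# pred['@type'] and datatype -> 'xsd:double'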
class LinkRecursion(): """ Does recursive look ups on link annotations, especially to find hierarchies from opencontext_py.apps.ldata.linkannotations.recursion import LinkRecursion lr = LinkRecursion() lr.get_jsonldish_entity_parents('oc-gen:cat-bio-subj-ecofact') lr = LinkRecursion() lr.get_jsonldish_entity_parents('oc-gen:cat-arch-element') lr = LinkRecursion() lr.get_jsonldish_entity_parents('http://eol.org/pages/7680') lr = LinkRecursion() lr.get_entity_children('http://eol.org/pages/4077', True) """ def __init__(self): self.m_cache = MemoryCache() self.parent_entities = None self.child_entities = None # cache prefix for the json-ldish-parents self.jsonldish_p_prefix = 'json-ldish-parents-{}' # cache prefix for list of parents self.p_prefix = 'lr-parents' # cache prefix for children of an item self.children_prefix = 'lr-children-{}' # cache prefix for full tree of child items self.child_tree_prefix = 'lr-child-tree-{}' def get_jsonldish_entity_parents(self, identifier, add_original=True): """ Gets parent concepts for a given URI or UUID identified entity returns a list of dictionary objects similar to JSON-LD expectations This is useful for faceted search If add_original is true, add the original UUID for the entity that's the childmost item, at the bottom of the hierarchy """ cache_key = self.m_cache.make_cache_key( self.jsonldish_p_prefix.format(str(add_original)), identifier ) obj = self.m_cache.get_cache_object(cache_key) if obj is not None: return obj # We don't have it cached, so get from the database. obj = self._get_jsonldish_entity_parents_db( identifier, add_original ) if obj: self.m_cache.save_cache_object(cache_key, obj) return obj def _get_jsonldish_entity_parents_db(self, identifier, add_original=True): """ Gets parent concepts for a given URI or UUID identified entity returns a list of dictionary objects similar to JSON-LD expectations This is useful for faceted search If add_original is true, add the original UUID for the entity that's the childmost item, at the bottom of the hierarchy """ output = False if add_original: # add the original identifer to the list of parents, at lowest rank raw_parents = ( [identifier] + self.get_entity_parents(identifier, [], 0) ) else: raw_parents = self.get_entity_parents( identifier, [], 0 ) if not len(raw_parents): # No parents. Returns false. return output # Make the output. 
# reverse the order of the list, to make top most concept # first output = [] for par_id in raw_parents[::-1]: # print('par_id is: ' + par_id) ent = self.m_cache.get_entity(par_id) if not ent: continue p_item = LastUpdatedOrderedDict() p_item['id'] = ent.uri p_item['slug'] = ent.slug p_item['label'] = ent.label if ent.data_type is not False: p_item['type'] = ent.data_type else: p_item['type'] = '@id' p_item['ld_object_ok'] = ent.ld_object_ok output.append(p_item) return output def get_entity_parents(self, identifier, parent_list=None, loop_count=0): """ Gets parent concepts for a given URI or UUID identified entity """ if not parent_list: parent_list = [] loop_count += 1 parent_id = self._get_parent_id(identifier) # print('ID: {} has parent: {}'.format(identifier, parent_id)) if parent_id: if parent_id not in parent_list: parent_list.append(parent_id) # print('Parent list is: ' + str(parent_list)) if loop_count <= 50: parent_list = self.get_entity_parents(parent_id, parent_list, loop_count) else: # all done, save the parents self.parent_entities = parent_list return parent_list def _get_parent_id(self, identifier): """Get the parent id for the current identifier, or from the cache.""" cache_key = self.m_cache.make_cache_key(self.p_prefix, identifier) obj = self.m_cache.get_cache_object(cache_key) if obj is not None: return obj else: obj = self._get_parent_id_db(identifier) if obj: self.m_cache.save_cache_object(cache_key, obj) return obj def _get_parent_id_db(self, identifier): """Get the parent id for the current identifier """ parent_id = None lequiv = LinkEquivalence() identifiers = lequiv.get_identifier_list_variants(identifier) # print('identifiers: {}'.format(identifiers)) p_for_superobjs = LinkAnnotation.PREDS_SBJ_IS_SUB_OF_OBJ preds_for_superobjs = lequiv.get_identifier_list_variants(p_for_superobjs) p_for_subobjs = LinkAnnotation.PREDS_SBJ_IS_SUPER_OF_OBJ preds_for_subobjs = lequiv.get_identifier_list_variants(p_for_subobjs) try: # look for superior items in the objects of the assertion # sorting by sort so we can privelage a certain hierarchy path superobjs_anno = LinkAnnotation.objects.filter(subject__in=identifiers, predicate_uri__in=preds_for_superobjs)\ .exclude(object_uri__in=identifiers)\ .order_by('sort', 'object_uri')[:1] if len(superobjs_anno) < 1: superobjs_anno = False except LinkAnnotation.DoesNotExist: superobjs_anno = False if superobjs_anno: parent_id = superobjs_anno[0].object_uri # print('Subject {} is child of {}'.format(identifiers, parent_id)) oc_uuid = URImanagement.get_uuid_from_oc_uri(parent_id) if oc_uuid: parent_id = oc_uuid try: """ Now look for superior entities in the subject, not the object sorting by sort so we can privelage a certain hierarchy path """ supersubj_anno = LinkAnnotation.objects.filter(object_uri__in=identifiers, predicate_uri__in=preds_for_subobjs)\ .exclude(subject__in=identifiers)\ .order_by('sort', 'subject')[:1] if len(supersubj_anno) < 1: supersubj_anno = False except LinkAnnotation.DoesNotExist: supersubj_anno = False if supersubj_anno: parent_id = supersubj_anno[0].subject # print('Subject {} is parent of {}'.format(parent_id, identifiers)) oc_uuid = URImanagement.get_uuid_from_oc_uri(parent_id) if oc_uuid: parent_id = oc_uuid return parent_id def get_entity_children(self, identifier, recursive=True): cache_key = self.m_cache.make_cache_key(self.children_prefix.format(str(recursive)), identifier) tree_cache_key = self.m_cache.make_cache_key(self.child_tree_prefix.format(str(recursive)), identifier) obj = 
self.m_cache.get_cache_object(cache_key) tree_obj = self.m_cache.get_cache_object(tree_cache_key) if obj is not None and tree_obj is not None: # print('Hit child cache on {}'.format(identifier)) self.child_entities = tree_obj # the full tree of child entities return obj else: obj = self._get_entity_children_db(identifier, recursive) if obj: # print('Hit child DB on {}'.format(identifier)) self.m_cache.save_cache_object(cache_key, obj) self.m_cache.save_cache_object(tree_cache_key, self.child_entities) return obj def _get_entity_children_db(self, identifier, recursive=True): """ Gets child concepts for a given URI or UUID identified entity """ if not self.child_entities: self.child_entities = LastUpdatedOrderedDict() if identifier in self.child_entities and recursive: output = self.child_entities[identifier] else: act_children = [] p_for_superobjs = LinkAnnotation.PREDS_SBJ_IS_SUB_OF_OBJ p_for_subobjs = LinkAnnotation.PREDS_SBJ_IS_SUPER_OF_OBJ lequiv = LinkEquivalence() identifiers = lequiv.get_identifier_list_variants(identifier) try: # look for child items in the objects of the assertion subobjs_anno = LinkAnnotation.objects.filter(subject__in=identifiers, predicate_uri__in=p_for_subobjs) if len(subobjs_anno) < 1: subobjs_anno = False except LinkAnnotation.DoesNotExist: subobjs_anno = False if subobjs_anno is not False: for sub_obj in subobjs_anno: child_id = sub_obj.object_uri act_children.append(child_id) try: """ Now look for subordinate entities in the subject, not the object """ subsubj_anno = LinkAnnotation.objects.filter(object_uri__in=identifiers, predicate_uri__in=p_for_superobjs) if len(subsubj_anno) < 1: subsubj_anno = False except LinkAnnotation.DoesNotExist: subsubj_anno = False if subsubj_anno is not False: for sub_sub in subsubj_anno: child_id = sub_sub.subject act_children.append(child_id) if len(act_children) > 0: identifier_children = [] for child_id in act_children: if child_id.count('/') > 1: oc_uuid = URImanagement.get_uuid_from_oc_uri(child_id) if oc_uuid: child_id = oc_uuid identifier_children.append(child_id) # recursively get the children of the child if recursive: self.get_entity_children(child_id, recursive) # save the list of children of the current identified item if identifier not in self.child_entities: self.child_entities[identifier] = identifier_children else: # save an empty list for the current identified item; it has no children if identifier not in self.child_entities: self.child_entities[identifier] = [] output = self.child_entities[identifier] return output def get_pred_top_rank_types(self, predicate_uuid): """ gets the top-ranked types (those not subordinate to any other type) for a predicate """ types = False try: pred_obj = Predicate.objects.get(uuid=predicate_uuid) except Predicate.DoesNotExist: pred_obj = False if pred_obj is not False: # print('found: ' + predicate_uuid) if pred_obj.data_type == 'id': types = [] id_list = [] pred_types = OCtype.objects\ .filter(predicate_uuid=predicate_uuid) for p_type in pred_types: type_pars = self.get_jsonldish_entity_parents(p_type.uuid) self.parent_entities = [] self.loop_count = 0 if type_pars[0]['id'] not in id_list: # so the top parent is only listed once id_list.append(type_pars[0]['id']) types.append(type_pars[0]) return types def get_entity(self, identifier): """ Gets an entity either from the cache or from database lookups. This is a wrapper for the MemoryCache().get_entity function. """ return self.m_cache.get_entity(identifier)
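# ---------------------------------------------------------------------------
# Minimal usage sketch for LinkRecursion (needs this project's Django and
# database environment; the identifier comes from the class docstring above,
# and the printed values are only illustrative). get_jsonldish_entity_parents()
# returns one dict per concept, ordered from the most general parent down to
# the entity that was asked about when add_original is True.
# ---------------------------------------------------------------------------
lr = LinkRecursion()
parents = lr.get_jsonldish_entity_parents('oc-gen:cat-arch-element')
for p_item in (parents or []):
    # Each p_item carries the keys set in _get_jsonldish_entity_parents_db():
    # 'id' (the entity URI), 'slug', 'label', 'type' ('@id' for entities,
    # otherwise the literal datatype), and 'ld_object_ok'.
    print(p_item['slug'], p_item['label'], p_item['type'])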
def __init__(self): self.error = False self.histogram_groups = 10 self.mem_cache_obj = MemoryCache() # memory caching object
def __init__(self, cannonical_uris = False): self.m_cache = MemoryCache()
def __init__(self): self.m_cache = MemoryCache() # memory caching object self.base_search_link = '/search/' self.hierarchy_delim = '---'
class SolrUUIDs(): """ methods to make get UUIDs from a solr search result JSON document, also makes URIs """ def __init__(self, response_dict_json=False): rp = RootPath() self.base_url = rp.get_baseurl() self.uuids = [] self.uris = [] self.m_cache = MemoryCache() # memory caching object self.s_cache = SearchGenerationCache( ) # supplemental caching object, specific for searching self.response_dict_json = response_dict_json self.highlighting = False # make values to these fields "flat" not a list self.flatten_rec_fields = True self.total_found = False self.rec_start = False self.min_date = False self.max_date = False # flatten list of an attribute values to single value self.flatten_rec_attributes = False # A list of (non-standard) attributes to include in a record self.rec_attributes = [] self.do_media_thumbs = True # get thumbnails for records self.get_all_media = False # get links to all media files for an item def make_uuids_from_solr(self, solr_json): """ makes geojson-ld point records from a solr response """ #first do lots of checks to make sure the solr-json is OK solr_recs = self.extract_solr_recs(solr_json) if isinstance(solr_recs, list): for solr_rec in solr_recs: if 'uuid' in solr_rec: uuid = solr_rec['uuid'] self.uuids.append(uuid) return self.uuids def make_uris_from_solr(self, solr_json, uris_only=True): """ processes the solr_json to make GeoJSON records """ solr_recs = self.extract_solr_recs(solr_json) if isinstance(solr_recs, list): if uris_only: self.do_media_thumbs = False if self.get_all_media: self.do_media_thumbs = False if 'thumbnail' in self.rec_attributes: self.do_media_thumbs = True thumbnail_data = self.get_media_thumbs(solr_recs) media_file_data = self.get_all_media_files(solr_recs) string_attrib_data = self.get_string_rec_attributes(solr_recs) for solr_rec in solr_recs: rec_props_obj = RecordProperties(self.response_dict_json) rec_props_obj.min_date = self.min_date rec_props_obj.max_date = self.max_date rec_props_obj.highlighting = self.highlighting rec_props_obj.flatten_rec_attributes = self.flatten_rec_attributes rec_props_obj.rec_attributes = self.rec_attributes rec_props_obj.thumbnail_data = thumbnail_data rec_props_obj.media_file_data = media_file_data rec_props_obj.string_attrib_data = string_attrib_data item_ok = rec_props_obj.get_item_basics(solr_rec) if item_ok: if uris_only: item = rec_props_obj.uri else: rec_props_obj.parse_solr_record(solr_rec) item = self.make_item_dict_from_rec_props_obj( rec_props_obj) self.uris.append(item) return self.uris def make_item_dict_from_rec_props_obj(self, rec_props_obj, cannonical=True): """ makes item dictionary object from a record prop obj """ item = LastUpdatedOrderedDict() item['uri'] = rec_props_obj.uri if cannonical is False or 'href' in self.rec_attributes: item['href'] = rec_props_obj.href item['citation uri'] = rec_props_obj.cite_uri item['label'] = rec_props_obj.label item['project label'] = rec_props_obj.project_label if cannonical: item['project uri'] = rec_props_obj.project_uri else: item['project href'] = rec_props_obj.project_href item['context label'] = rec_props_obj.context_label if cannonical: item['context uri'] = rec_props_obj.context_uri else: item['context href'] = rec_props_obj.context_href item['latitude'] = rec_props_obj.latitude item['longitude'] = rec_props_obj.longitude item['early bce/ce'] = rec_props_obj.early_date item['late bce/ce'] = rec_props_obj.late_date item['item category'] = rec_props_obj.category if rec_props_obj.snippet is not False: item['snippet'] = rec_props_obj.snippet 
if rec_props_obj.thumbnail_scr is not False: item['thumbnail'] = rec_props_obj.thumbnail_scr if rec_props_obj.preview_scr is not False: item['preview'] = rec_props_obj.preview_scr if rec_props_obj.fullfile_scr is not False: item['primary-file'] = rec_props_obj.fullfile_scr item['published'] = rec_props_obj.published item['updated'] = rec_props_obj.updated if isinstance(rec_props_obj.other_attributes, list): for attribute in rec_props_obj.other_attributes: prop_key = attribute['property'] prop_key = rec_props_obj.prevent_attribute_key_collision( item, prop_key) if self.flatten_rec_attributes: if 'value' in attribute: item[prop_key] = attribute['value'] elif 'values_list' in attribute: item[prop_key] = RecordProperties.ATTRIBUTE_DELIM.join( attribute['values_list']) else: item[prop_key] = attribute['values_list'] return item def extract_solr_recs(self, solr_json): """ extracts solr_recs along with some basic metadata from solr_json """ solr_recs = False if isinstance(solr_json, dict): try: self.total_found = solr_json['response']['numFound'] except KeyError: self.total_found = False try: self.rec_start = solr_json['response']['start'] except KeyError: self.rec_start = False try: self.highlighting = solr_json['highlighting'] except KeyError: self.highlighting = False try: solr_recs = solr_json['response']['docs'] except KeyError: solr_recs = False return solr_recs def get_media_thumbs(self, solr_recs): """ gets media thumbnail items """ thumb_results = {} not_media_uuids = [] media_uuids = [] rec_props_obj = RecordProperties(self.response_dict_json) for solr_rec in solr_recs: item = rec_props_obj.get_solr_record_uuid_type(solr_rec) if item is not False: uuid = item['uuid'] if item['item_type'] != 'media': not_media_uuids.append(uuid) else: media_uuids.append(uuid) thumb_results[uuid] = False if len(not_media_uuids) > 0: if self.do_media_thumbs: # only get media_thumbnails if needed rows = self.get_thumbs_for_non_media(not_media_uuids) for row in rows: uuid = row['uuid'] thumb_obj = {} thumb_obj[ 'href'] = self.base_url + '/media/' + row['media_uuid'] thumb_obj[ 'uri'] = settings.CANONICAL_HOST + '/media/' + row[ 'media_uuid'] thumb_obj['scr'] = row['file_uri'] if thumb_results[uuid] is False: thumb_results[uuid] = thumb_obj if len(media_uuids) > 0: thumbs = Mediafile.objects\ .filter(uuid__in=media_uuids, file_type='oc-gen:thumbnail') for thumb in thumbs: uuid = thumb.uuid thumb_obj = {} thumb_obj['href'] = self.base_url + '/media/' + thumb.uuid thumb_obj[ 'uri'] = settings.CANONICAL_HOST + '/media/' + thumb.uuid thumb_obj['scr'] = thumb.file_uri thumb_results[uuid] = thumb_obj return thumb_results def get_all_media_files(self, solr_recs): """ gets media thumbnail items """ media_file_results = {} if self.get_all_media: media_uuids = [] rec_props_obj = RecordProperties(self.response_dict_json) for solr_rec in solr_recs: item = rec_props_obj.get_solr_record_uuid_type(solr_rec) if item is not False: uuid = item['uuid'] if item['item_type'] == 'media': media_uuids.append(uuid) media_file_results[uuid] = False if len(media_uuids) > 0: media_files = Mediafile.objects\ .filter(uuid__in=media_uuids) for media_file in media_files: uuid = media_file.uuid if uuid not in media_file_results: media_file_results[uuid] = {} else: if media_file_results[uuid] is False: media_file_results[uuid] = {} media_file_results[uuid][ media_file.file_type] = media_file.file_uri return media_file_results def get_thumbs_for_non_media(self, uuid_list): q_uuids = self.make_query_uuids(uuid_list) query = ('SELECT ass.uuid AS 
uuid, m.file_uri AS file_uri, ' 'm.uuid AS media_uuid ' 'FROM oc_assertions AS ass ' 'JOIN oc_mediafiles AS m ON ass.object_uuid = m.uuid ' 'AND m.file_type=\'oc-gen:thumbnail\' ' 'WHERE ass.uuid IN (' + q_uuids + ') ' 'GROUP BY ass.uuid, m.file_uri, m.uuid; ') cursor = connection.cursor() cursor.execute(query) rows = self.dictfetchall(cursor) return rows def make_query_uuids(self, uuid_list): """ makes a string for uuid list query """ uuid_q = [] for uuid in uuid_list: uuid = '\'' + uuid + '\'' uuid_q.append(uuid) return ', '.join(uuid_q) def dictfetchall(self, cursor): """ Return all rows from a cursor as a dict """ columns = [col[0] for col in cursor.description] return [dict(zip(columns, row)) for row in cursor.fetchall()] def get_string_rec_attributes(self, solr_recs): """ gets string record attributes from the database. The solr index does not keep string-fields in memory """ output = {} str_attribs = {} for attribute in self.rec_attributes: entity = self.m_cache.get_entity(attribute) if entity: prop_slug = entity.slug # check to make sure we have the entity data type for linked fields if entity.data_type is False and entity.item_type == 'uri': dtypes = self.s_cache.get_dtypes(entity.uri) if isinstance(dtypes, list): # set te data type and the act-field # print('Found for ' + prop_slug + ' ' + dtypes[0]) entity.data_type = dtypes[0] if entity.data_type == 'xsd:string': str_attribs[attribute] = entity if len(str_attribs) > 0: uuid_list = [] for solr_rec in solr_recs: if 'uuid' in solr_rec: uuid = str(solr_rec['uuid']) uuid_list.append(uuid) output = self.get_string_attributes(uuid_list, str_attribs) return output def get_string_attributes(self, uuid_list, str_attribute_ent_dict): """ Gets string attribute data for a solr dict """ output = {} pred_uuid_list = [] pred_uuid_objs = {} for key, entity in str_attribute_ent_dict.items(): if isinstance(entity.uuid, str): # add string predicate entity uuid to the list pred_uuid_list.append(entity.uuid) pred_uuid_objs[entity.uuid] = { 'rec_attribute': key, 'property': entity.label, 'pred_uuid': entity.uuid, 'slug': entity.slug } if len(pred_uuid_list) > 0 and len(uuid_list) > 0: q_rows = self.get_string_attributes_sql(uuid_list, pred_uuid_list) dict_rows = {} for row in q_rows: # print(str(row)) # the whole "dict row" bullshit is because for some reason # we can't simply append to the output of the uuid = row['uuid'] pred_uuid = row['predicate_uuid'] content = row['content'] if uuid not in dict_rows: dict_rows[uuid] = {} if pred_uuid not in dict_rows[uuid]: dict_rows[uuid][pred_uuid] = [] if isinstance(content, str): dict_rows[uuid][pred_uuid].append(content) # print(str(dict_rows[uuid][pred_uuid])) output = {'pred_ents': pred_uuid_objs, 'data': dict_rows} return output def get_string_attributes_sql(self, uuid_list, pred_uuid_list): """ executes SQL query to get strings for the solr uuids and predicates """ q_uuids = self.make_query_uuids(uuid_list) p_uuids = self.make_query_uuids(pred_uuid_list) query = ( 'SELECT ass.uuid AS uuid, ass.predicate_uuid AS predicate_uuid, ' 's.content AS content ' 'FROM oc_assertions AS ass ' 'JOIN oc_strings AS s ON ass.object_uuid = s.uuid ' 'WHERE ass.uuid IN (' + q_uuids + ') AND ' 'ass.predicate_uuid IN (' + p_uuids + ')' 'ORDER BY ass.uuid, ass.predicate_uuid, s.content; ') cursor = connection.cursor() cursor.execute(query) rows = self.dictfetchall(cursor) return rows
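# ---------------------------------------------------------------------------
# Sketch (hypothetical uuids and strings) of the dict returned by
# get_string_rec_attributes() / get_string_attributes() above and consumed as
# RecordProperties.string_attrib_data: 'pred_ents' keys each string predicate
# uuid to its display metadata, and 'data' keys each record uuid to a dict of
# {predicate_uuid: [string values]}.
# ---------------------------------------------------------------------------
example_string_attrib_data = {
    'pred_ents': {
        'pred-uuid-1': {
            'rec_attribute': 'catalog-note',  # the requested attribute key
            'property': 'Catalog note',
            'pred_uuid': 'pred-uuid-1',
            'slug': 'catalog-note',
        },
    },
    'data': {
        'rec-uuid-a': {
            'pred-uuid-1': ['First note string', 'Second note string'],
        },
    },
}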