def _make_cache_geospace_obj_dict(self, uuids):
    """Make a dict of geospace objects keyed by uuid"""
    m_cache = MemoryCache()
    uuids_for_qs = []
    uuid_geo_dict = {}
    for uuid in uuids:
        cache_key = m_cache.make_cache_key(prefix='geospace-obj', identifier=uuid)
        geo_obj = m_cache.get_cache_object(cache_key)
        if geo_obj is None:
            uuids_for_qs.append(uuid)
        else:
            uuid_geo_dict[uuid] = geo_obj
    if not len(uuids_for_qs):
        # Found them all from the cache!
        # Return without touching the database.
        return uuid_geo_dict
    # Lookup the remaining geospace objects from a
    # database query. We order by uuid then reverse
    # of feature_id so that the lowest feature id is the
    # thing that actually gets cached.
    geospace_qs = Geospace.objects.filter(
        uuid__in=uuids_for_qs,
    ).exclude(
        ftype__in=['Point', 'point']
    ).order_by('uuid', '-feature_id')
    for geo_obj in geospace_qs:
        cache_key = m_cache.make_cache_key(prefix='geospace-obj', identifier=str(geo_obj.uuid))
        m_cache.save_cache_object(cache_key, geo_obj)
        uuid_geo_dict[geo_obj.uuid] = geo_obj
    return uuid_geo_dict

def _get_cache_contexts_dict(self, uuids):
    """Make a dictionary that associates uuids to context paths"""
    m_cache = MemoryCache()
    uuids_for_qs = []
    uuid_context_dict = {}
    for uuid in uuids:
        cache_key = m_cache.make_cache_key(prefix='context-path', identifier=uuid)
        context_path = m_cache.get_cache_object(cache_key)
        if context_path is None:
            uuids_for_qs.append(uuid)
        else:
            uuid_context_dict[uuid] = context_path
    if not len(uuids_for_qs):
        # Found them all from the cache!
        # Return without touching the database.
        return uuid_context_dict
    # Look up the remaining context paths with a single
    # database query on the Subject model.
    subject_qs = Subject.objects.filter(uuid__in=uuids_for_qs)
    for sub_obj in subject_qs:
        cache_key = m_cache.make_cache_key(prefix='context-path', identifier=str(sub_obj.uuid))
        m_cache.save_cache_object(cache_key, sub_obj.context)
        uuid_context_dict[sub_obj.uuid] = sub_obj.context
    return uuid_context_dict

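# The two helpers above share a batch lookup pattern: check the memory cache
# item-by-item, then resolve all remaining misses with a single database
# query, caching each result. A minimal standalone sketch of that pattern,
# assuming only the MemoryCache API used above; the function name and the
# fetch_many callable (returning (identifier, value) pairs) are hypothetical,
# for illustration only:
def _batch_cached_lookup(prefix, identifiers, fetch_many):
    """Return {identifier: value}, touching the database once for misses."""
    m_cache = MemoryCache()
    results = {}
    misses = []
    for identifier in identifiers:
        key = m_cache.make_cache_key(prefix=prefix, identifier=identifier)
        value = m_cache.get_cache_object(key)
        if value is None:
            misses.append(identifier)
        else:
            results[identifier] = value
    if misses:
        # One bulk fetch covers every cache miss; cache each result.
        for identifier, value in fetch_many(misses):
            key = m_cache.make_cache_key(prefix=prefix, identifier=str(identifier))
            m_cache.save_cache_object(key, value)
            results[identifier] = value
    return results
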
def get_project_date_range(self, project_uuid):
    """ gets a project date range """
    mem = MemoryCache()
    key = mem.make_cache_key('proj-chrono', project_uuid)
    date_range = mem.get_cache_object(key)
    if not isinstance(date_range, dict):
        date_range = self.get_project_date_range_db(project_uuid)
        mem.save_cache_object(key, date_range)
    return date_range

def get_project_geo_meta(self, project_uuid):
    """ gets a geo_meta object for a project """
    mem = MemoryCache()
    key = mem.make_cache_key('proj-geo', project_uuid)
    geo_meta = mem.get_cache_object(key)
    if geo_meta is None:
        geo_meta = self.get_project_geo_meta_db(project_uuid)
        mem.save_cache_object(key, geo_meta)
    return geo_meta

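# The getters above all follow the same cache-or-compute shape. A minimal
# standalone sketch of that shape, assuming only the MemoryCache API used in
# this module; the function name and the compute_value callable are
# hypothetical, for illustration only:
def _cached_or_computed(prefix, identifier, compute_value):
    """Return the cached value for (prefix, identifier), computing it on a miss."""
    mem = MemoryCache()
    key = mem.make_cache_key(prefix, identifier)
    value = mem.get_cache_object(key)
    if value is None:
        # Cache miss: do the expensive lookup once, then cache it.
        value = compute_value(identifier)
        mem.save_cache_object(key, value)
    return value
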
def get_all_uuids_related_to_gazetteers(self, all_gaz_annos=None):
    """ gets ALL subject entities related to gazetteer entities """
    mc = MemoryCache()
    cache_id = mc.make_cache_key('gaz', 'uuids_all_gaz')
    uuids_all_gaz = mc.get_cache_object(cache_id)
    if uuids_all_gaz is None:
        if all_gaz_annos is None:
            all_gaz_annos = self.get_all_related_to_gazetteers()
        uuids_all_gaz = {
            'subjects': {},
            'documents': {},
            'media': {},
            'projects': {},
            'types': {}
        }
        for gaz_anno in all_gaz_annos:
            hash_id = gaz_anno.hash_id
            gaz_ent_uri = gaz_anno.object_uri
            key = gaz_anno.subject_type
            if hash_id not in uuids_all_gaz[key]:
                gaz_ref = {
                    'uuid': gaz_anno.subject,
                    'item_type': gaz_anno.subject_type,
                    'gaz_ent_uri': gaz_ent_uri
                }
                if key == 'subjects':
                    # get subjects specific information for the gaz_ref
                    gaz_ref = self.subjects_specific_gaz_ref(
                        gaz_anno.subject, gaz_ent_uri)
                uuids_all_gaz[key][hash_id] = gaz_ref
            # Gazetteer linked types describe other items that we want to
            # annotate. Look up the items described by a type so we can add
            # them to the gazetteer described items.
            if gaz_anno.subject_type == 'types':
                rel_asserts = Assertion.objects\
                    .filter(subject_type__in=self.OC_OA_TARGET_TYPES,
                            object_uuid=gaz_anno.subject)
                for rel_assert in rel_asserts:
                    key = rel_assert.subject_type
                    if hash_id not in uuids_all_gaz[key]:
                        gaz_ref = {
                            'uuid': rel_assert.uuid,
                            'item_type': rel_assert.subject_type,
                            'gaz_ent_uri': gaz_ent_uri
                        }
                        if key == 'subjects':
                            # get subjects specific information
                            gaz_ref = self.subjects_specific_gaz_ref(
                                rel_assert.uuid, gaz_ent_uri)
                        uuids_all_gaz[key][hash_id] = gaz_ref
        # save this hard work to the cache
        mc.save_cache_object(cache_id, uuids_all_gaz)
    return uuids_all_gaz

def get_used_gazetteer_entities(self):
    """ gets entities in gazetteer vocabularies that
        are actually being used

        NOTE! This checks the memory cache first!
    """
    mc = MemoryCache()
    cache_id = mc.make_cache_key('gaz', 'used_gazetteer_ents')
    act_gaz_list = mc.get_cache_object(cache_id)
    if act_gaz_list is None:
        # cache was empty, so get this from the database
        act_gaz_list = self.get_used_gazetteer_entities_db()
        mc.save_cache_object(cache_id, act_gaz_list)
    return act_gaz_list

def get_geo_overlays(self):
    """Gets geo overlays for an item identified by uuid."""
    m_cache = MemoryCache()
    cache_key = m_cache.make_cache_key('geo-layers', self.uuid)
    geo_overlays = m_cache.get_cache_object(cache_key)
    if geo_overlays is not None:
        self.geo_overlays = geo_overlays
        return self.geo_overlays
    geo_overlays = self.get_geo_overlays_db()
    m_cache.save_cache_object(cache_key, geo_overlays)
    # Make sure the freshly fetched overlays are set on the instance
    # before returning (get_geo_overlays_db may also do this).
    self.geo_overlays = geo_overlays
    return self.geo_overlays

def get_cache_earliest_date(self):
    """ Gets and caches the earliest date as a datetime object! """
    mc = MemoryCache()
    cache_key = mc.make_cache_key('early_date', 'manifest')
    early_date = mc.get_cache_object(cache_key)
    if early_date is None:
        sum_man = Manifest.objects\
            .filter(published__gt='2001-01-01')\
            .aggregate(Min('published'))
        early_date = sum_man['published__min']
        mc.save_cache_object(cache_key, early_date)
    return early_date

def get_all_related_to_gazetteers(self):
    """ gets ALL subject entities related to gazetteer entities """
    mc = MemoryCache()
    cache_id = mc.make_cache_key('gaz', 'all_gaz_annos')
    all_gaz_annos = mc.get_cache_object(cache_id)
    if all_gaz_annos is None:
        # Copy the class attribute so appending 'types' does not
        # mutate OC_OA_TARGET_TYPES for other callers.
        subject_types = list(self.OC_OA_TARGET_TYPES)
        subject_types.append('types')
        act_gaz_list = self.get_used_gazetteer_entities()
        all_gaz_annos = LinkAnnotation.objects\
            .filter(subject_type__in=subject_types,
                    object_uri__in=act_gaz_list)
        mc.save_cache_object(cache_id, all_gaz_annos)
    return all_gaz_annos

def get_jsonldish_parents(self, uuid, add_original=True):
    """Gets parent projects for a project.

    Returns a list of dictionary objects similar to JSON-LD
    expectations. This is useful for faceted search.
    """
    m_cache = MemoryCache()
    cache_key = m_cache.make_cache_key(
        'proj-par-jsonldish_{}'.format(add_original),
        uuid
    )
    output = m_cache.get_cache_object(cache_key)
    if output is None:
        output = self._db_get_jsonldish_parents(
            uuid,
            add_original=add_original
        )
        m_cache.save_cache_object(cache_key, output)
    return output

def get_containment_parent_slug(slug):
    '''Takes a slug and returns the slug of its parent.
    Returns 'root' if a slug has no parent.

    :param str slug: Slug identifying a subjects item.
    '''
    m_cache = MemoryCache()
    cache_key = m_cache.make_cache_key('contain-par-slug', slug)
    parent_slug = m_cache.get_cache_object(cache_key)
    if parent_slug is None:
        contain_obj = Containment()
        # Because it seems to introduce memory errors, turn off
        # caching for this class instance.
        contain_obj.use_cache = False
        parent_slug = contain_obj.get_parent_slug_by_slug(slug)
        m_cache.save_cache_object(cache_key, parent_slug)
    if parent_slug:
        return parent_slug
    return 'root'

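# A minimal usage sketch for the function above; the slug value is
# hypothetical, for illustration only:
#
#   parent_slug = get_containment_parent_slug('some-subjects-slug')
#   if parent_slug == 'root':
#       # The item sits at the top of the containment hierarchy.
#       pass
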
class SearchGenerationCache():
    """ Methods for using the Redis cache to streamline
        making JSON-LD search results
    """

    def __init__(self, cannonical_uris=False):
        self.m_cache = MemoryCache()

    def get_dtypes(self, entity_uri):
        """ returns an entity data type """
        cache_key = self.m_cache.make_cache_key('data-types', entity_uri)
        dtypes = self.m_cache.get_cache_object(cache_key)
        if dtypes is None:
            dtypes = self._get_dtypes_db(entity_uri)
            if dtypes:
                self.m_cache.save_cache_object(cache_key, dtypes)
        return dtypes

    def _get_dtypes_db(self, entity_uri):
        """ returns an entity data type """
        # haven't found it yet, so look in the database
        lequiv = LinkEquivalence()
        return lequiv.get_data_types_from_object(entity_uri)

class ReadProjectContextVocabGraph():
    """ Methods to read the project context vocabulary graph """

    GLOBAL_VOCAB_GRAPH = [
        {
            '@id': 'oc-pred:link',
            'owl:sameAs': 'http://opencontext.org/predicates/oc-3',
            'label': 'link',
            'slug': 'link',
            'oc-gen:predType': 'link',
            '@type': '@id'
        },
        {
            '@id': Assertion.PREDICATES_NOTE,
            'label': 'Note',
            'owl:sameAs': False,
            'slug': 'oc-gen-has-note',
            '@type': 'xsd:string'
        },
    ]

    # predicates used for equivalence, used to make
    # inferred assertions
    REL_PREDICATES_FOR_INFERRENCE = ['skos:closeMatch', 'skos:exactMatch']
    REL_MEASUREMENTS = [
        'cidoc-crm:P67_refers_to',
        'oc-gen:has-technique',
        'rdfs:range'
    ]
    ITEM_REL_PREDICATES = [
        'skos:closeMatch',
        'skos:exactMatch',
        'owl:sameAs',
        'skos:related',
        'skos:broader',
        'dc-terms:references',
        'dc-terms:hasVersion',
        'http://nomisma.org/ontology#hasTypeSeriesItem'
    ]

    # Skip the following predicate keys when looking
    # for inferred linked data assertions in an observation.
    LINKDATA_OBS_PREDS_SKIP = [
        'id',
        'type',
        ItemKeys.PREDICATES_OCGEN_SOURCEID,
        ItemKeys.PREDICATES_OCGEN_OBSTATUS,
        ItemKeys.PREDICATES_OCGEN_OBSLABEL,
        ItemKeys.PREDICATES_OCGEN_OBSNOTE,
    ]

    def __init__(self, proj_context_json_ld=None):
        self.m_cache = MemoryCache()
        self.context = None
        self.graph = None
        self.fail_on_missing_entities = False
        if not isinstance(proj_context_json_ld, dict):
            return None
        if '@context' in proj_context_json_ld:
            self.context = proj_context_json_ld['@context']
        if '@graph' in proj_context_json_ld:
            self.graph = self.GLOBAL_VOCAB_GRAPH + proj_context_json_ld['@graph']
        else:
            self.graph = self.GLOBAL_VOCAB_GRAPH
        logger.info('Read project graph size: {}'.format(len(self.graph)))

    def lookup_predicate(self, id):
        """looks up an Open Context predicate by an
        identifier (slug, id, uri, or uuid)
        """
        output = self.lookup_oc_descriptor(id, 'predicates')
        return output

    def lookup_type(self, id):
        """looks up an Open Context type by an
        identifier (slug, id, uri, or uuid)
        """
        output = self.lookup_oc_descriptor(id, 'types')
        return output

    def lookup_type_by_type_obj(self, type_obj):
        """looks up an Open Context type to get more information,
        including linked data equivalents, by looking up a type
        from how it is used as the object of a descriptive
        predicate in an observation
        """
        type_ids = self.get_id_list_for_g_obj(type_obj)
        for type_id in type_ids:
            found_type_obj = self.lookup_type(type_id)
            if isinstance(found_type_obj, dict):
                return found_type_obj
        return type_obj

    def lookup_oc_descriptor(self, id, item_type):
        """looks up a predicate or a type by an
        identifier (slug, id, uri, or uuid)
        """
        cache_key = self.m_cache.make_cache_key(
            'lookup_oc_descriptor_{}'.format(item_type), id)
        output = self.m_cache.get_cache_object(cache_key)
        if (output is None
                and isinstance(self.graph, list)
                and isinstance(id, str)):
            for g_obj in self.graph:
                id_list = self.get_id_list_for_g_obj(g_obj)
                if id not in id_list:
                    continue
                output = g_obj
                if item_type == 'predicates' and '@type' not in g_obj:
                    output['@type'] = self.get_predicate_datatype_for_graph_obj(g_obj)
                break
            if output:
                self.m_cache.save_cache_object(cache_key, output)
        if self.fail_on_missing_entities and not output:
            raise RuntimeError('Cannot find {}, item_type: {}'.format(id, item_type))
        return output

    def get_predicate_datatype_for_graph_obj(self, g_obj):
        """ looks up a predicate data type for a given graph object """
        slug_uri = self.get_id_from_g_obj(g_obj)
        datatype = self.get_predicate_datatype_by_slug_uri(slug_uri)
        return datatype

    def get_id_list_for_g_obj(self, g_obj):
        """gets a list of ids for a graph object"""
        id_list = []
        id_keys = ['@id', 'id', 'owl:sameAs', 'slug', 'uuid']
        if isinstance(g_obj, dict):
            for id_key in id_keys:
                if id_key not in g_obj:
                    continue
                if g_obj[id_key] not in id_list:
                    id_list.append(g_obj[id_key])
        return id_list

    def get_id_from_g_obj(self, g_obj):
        """ gets the id from a g_obj, either the '@id' or 'id' variant """
        id_variants = ['@id', 'id']
        id = None
        if not isinstance(g_obj, dict):
            return None
        for id_variant in id_variants:
            if id_variant not in g_obj:
                continue
            id = g_obj[id_variant]
        return id

    def get_predicate_datatype_by_slug_uri(self, slug_uri):
        """Looks up a predicate's datatype via the predicate slug URI."""
        datatype = 'xsd:string'  # Default to treating all as a string
        if (isinstance(self.context, dict)
                and isinstance(slug_uri, str)):
            if slug_uri not in self.context:
                return datatype
            for type_variant in ['@type', 'type']:
                if type_variant not in self.context[slug_uri]:
                    continue
                datatype = self.context[slug_uri][type_variant]
        return datatype

    def get_equivalent_objects(self, info_dict):
        """ Gets equivalent linked data dicts associated with an info_dict. """
        equiv_uris = []
        equiv_objects = []
        for rel_pred in self.REL_PREDICATES_FOR_INFERRENCE:
            if rel_pred not in info_dict:
                continue
            for equiv_obj in info_dict[rel_pred]:
                equiv_uri = self.get_id_from_g_obj(equiv_obj)
                if equiv_uri and equiv_uri not in equiv_uris:
                    # Make sure that the equivalent URIs are unique.
                    equiv_uris.append(equiv_uri)
                    equiv_objects.append(equiv_obj)
        return equiv_objects

    def infer_assertions_for_item_json_ld(self, json_ld):
        """Makes a list of inferred assertions from item json-ld"""
        lang_obj = Languages()
        inferred_assertions = []
        if not isinstance(json_ld, dict):
            return inferred_assertions
        if ItemKeys.PREDICATES_OCGEN_HASOBS not in json_ld:
            return inferred_assertions
        unique_pred_assertions = LastUpdatedOrderedDict()
        for obs_dict in json_ld[ItemKeys.PREDICATES_OCGEN_HASOBS]:
            # Get the status of the observation, defaulting to 'active'. If
            # active, then it's OK to infer assertions, otherwise skip the
            # observation.
            obs_status = obs_dict.get(ItemKeys.PREDICATES_OCGEN_OBSTATUS, 'active')
            if obs_status != 'active':
                # Skip this observation. It's there but has a deprecated
                # status.
                continue
            for obs_pred_key, obj_values in obs_dict.items():
                if obs_pred_key in self.LINKDATA_OBS_PREDS_SKIP:
                    # Skip this obs_pred_key, it is a general
                    # description of the observation, and will
                    # not have any linked assertions to infer.
                    continue
                obs_pred_info = self.lookup_predicate(obs_pred_key)
                pred_data_type = self.get_predicate_datatype_for_graph_obj(obs_pred_info)
                if not obs_pred_info:
                    continue
                equiv_pred_objs = self.get_equivalent_objects(obs_pred_info)
                if not equiv_pred_objs:
                    # No linked data equivalence for the obs_pred_key
                    # so continue, skipping the rest.
                    continue
                # Start with a None assertion.
                assertion = None
                # Iterate through all the equivalent predicate objects.
                for equiv_pred_obj in equiv_pred_objs:
                    equiv_pred_uri = self.get_id_from_g_obj(equiv_pred_obj)
                    # Inferred assertions will have unique LOD predicates, with
                    # one or more values. The unique_pred_assertions dict makes
                    # sure the LOD predicates are used only once.
                    if equiv_pred_uri not in unique_pred_assertions:
                        assertion = equiv_pred_obj
                        assertion['type'] = pred_data_type
                        assertion['ld_objects'] = LastUpdatedOrderedDict()
                        assertion['oc_objects'] = LastUpdatedOrderedDict()
                        assertion['literals'] = []
                        unique_pred_assertions[equiv_pred_uri] = assertion
                    assertion = unique_pred_assertions[equiv_pred_uri]
                    if assertion and equiv_pred_uri:
                        # we have a LOD equivalent property
                        if not isinstance(obj_values, list):
                            obj_values = [obj_values]
                        for obj_val in obj_values:
                            literal_val = None
                            if not isinstance(obj_val, dict):
                                # the object of the assertion is not a dict,
                                # so it must be a literal
                                literal_val = obj_val
                                if obj_val not in assertion['literals']:
                                    assertion['literals'].append(obj_val)
                            elif 'xsd:string' in obj_val:
                                literal_val = lang_obj.get_all_value_str(obj_val['xsd:string'])
                                if literal_val and literal_val not in assertion['literals']:
                                    assertion['literals'].append(literal_val)
                            if literal_val is None:
                                # Add any linked data equivalences by looking
                                # for this type in the graph list
                                obj_val = self.lookup_type_by_type_obj(obj_val)
                                obj_uri = self.get_id_from_g_obj(obj_val)
                                equiv_obj_objs = self.get_equivalent_objects(obj_val)
                                if len(equiv_obj_objs):
                                    # We have LD equivalents for the object value
                                    for equiv_obj_obj in equiv_obj_objs:
                                        equiv_obj_uri = self.get_id_from_g_obj(equiv_obj_obj)
                                        if not biological_taxonomy_validation(
                                                equiv_pred_uri, equiv_obj_uri):
                                            # This object_uri does not belong to
                                            # this predicate uri.
                                            continue
                                        assertion['ld_objects'][equiv_obj_uri] = equiv_obj_obj
                                elif obj_uri:
                                    # We don't have LD equivalents for the object
                                    # value, so add to the oc_objects
                                    assertion['oc_objects'][obj_uri] = obj_val
                                unique_pred_assertions[equiv_pred_uri] = assertion
        for pred_key, assertion in unique_pred_assertions.items():
            inferred_assertions.append(assertion)
        return inferred_assertions

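# A minimal usage sketch for the class above, assuming a project context
# JSON-LD dict (with '@context' and '@graph' keys) has already been fetched
# elsewhere; proj_context_json_ld and item_json_ld are hypothetical
# placeholders, for illustration only:
#
#   graph_reader = ReadProjectContextVocabGraph(proj_context_json_ld)
#   link_pred = graph_reader.lookup_predicate('oc-pred:link')
#   assertions = graph_reader.infer_assertions_for_item_json_ld(item_json_ld)
#   for assertion in assertions:
#       print(assertion.get('@id'), len(assertion['literals']))
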
class QueryMaker():

    # main item-types mapped to their slugs to get solr-facet field prefix
    TYPE_MAPPINGS = {'subjects': 'oc-gen-subjects',
                     'media': 'oc-gen-media',
                     'documents': 'oc-gen-documents',
                     'persons': 'oc-gen-persons',
                     'projects': 'oc-gen-projects',
                     'types': 'oc-gen-types',
                     'predicates': 'oc-gen-predicates'}

    TYPE_URIS = {'subjects': 'oc-gen:subjects',
                 'media': 'oc-gen:media',
                 'documents': 'oc-gen:documents',
                 'persons': 'oc-gen:persons',
                 'projects': 'oc-gen:projects',
                 'types': 'oc-gen:types',
                 'predicates': 'oc-gen:predicates'}

    def __init__(self):
        self.error = False
        self.histogram_groups = 10
        self.mem_cache_obj = MemoryCache()  # memory caching object

    def _get_context_paths(self, spatial_context):
        '''
        Takes a context path and returns an iterator with the list of
        possible contexts. Parses the list of boolean '||' (OR) and
        returns a list of contexts. For example:

        >>> _get_context_paths('Turkey/Domuztepe/I||II||Stray')
        ['Turkey/Domuztepe/I', 'Turkey/Domuztepe/II', 'Turkey/Domuztepe/Stray']
        '''
        # Split the context path by '/' and then by '||'
        context_lists = (value.split('||') for value in spatial_context.split('/'))
        # Create a list of the various permutations
        context_tuple_list = list(itertools.product(*context_lists))
        # Turn the lists back into URIs
        return ('/'.join(value) for value in context_tuple_list)

    def _get_context_depth(self, spatial_context):
        '''
        Takes a context path and returns its depth as an integer. For
        example, the context '/Turkey/Domuztepe' would have a depth of 2.
        '''
        # Remove a possible trailing slash before calculating the depth
        return len(spatial_context.rstrip('/').split('/'))

    def _get_valid_context_slugs(self, contexts):
        '''
        Takes a list of contexts and, for valid contexts, returns a list
        of slugs.
        '''
        entity = Entity()
        valid_context_slugs = []
        context_list = list(contexts)
        for context in context_list:
            # Verify that the contexts are valid;
            # find and save the entity to memory.
            # print('check: ' + context)
            found = self.mem_cache_obj.check_entity_found(context, True)
            # print('found: ' + str(found))
            if found:
                entity = self.mem_cache_obj.get_entity(context, True)
                valid_context_slugs.append(entity.slug)
        return valid_context_slugs

    def _get_parent_slug(self, slug):
        '''
        Takes a slug and returns the slug of its parent. Returns 'root'
        if a slug has no parent.
        '''
        cache_key = self.mem_cache_obj.make_memory_cache_key('par-slug', slug)
        parent_slug = self.mem_cache_obj.get_cache_object(cache_key)
        if parent_slug is None:
            contain_obj = Containment()
            # Turn off caching for this class instance, because it seems
            # to introduce memory errors.
            contain_obj.use_cache = False
            parent_slug = contain_obj.get_parent_slug_by_slug(slug)
            self.mem_cache_obj.save_cache_object(cache_key, parent_slug)
        if parent_slug:
            return parent_slug
        else:
            return 'root'

    def _prepare_filter_query(self, parent_child_slug):
        # TODO docstring
        parent_child_set = parent_child_slug.split('___')
        return parent_child_set[0].replace('-', '_') + '___context_id_fq:' + \
            parent_child_set[1]

    def expand_hierarchy_options(self, path_param_val,
                                 hier_delim='---',
                                 or_delim='||'):
        """
        Expands a hierarchic path string into a list of lists of
        hierarchically ordered items. This method also makes a new
        hierarchically ordered list if there is an 'or_delim'.
        """
        if isinstance(path_param_val, list):
            inital_path_list = path_param_val
        else:
            inital_path_list = [path_param_val]
        path_list = []
        for path_string in inital_path_list:
            raw_path_list = (value.split(or_delim)
                             for value in path_string.split(hier_delim))
            # Create a list of the various permutations
            path_tuple_list = list(itertools.product(*raw_path_list))
            for item in path_tuple_list:
                path_list.append(list(item))
        return path_list

    def get_solr_field_type(self, data_type, prefix=''):
        '''
        Defines whether our dynamic solr field names for predicates end
        with ___pred_id, ___pred_numeric, etc.
        '''
        if data_type in ['@id', 'id', False]:
            return prefix + 'id'
        elif data_type in ['xsd:integer', 'xsd:double', 'xsd:boolean']:
            return prefix + 'numeric'
        elif data_type == 'xsd:string':
            return prefix + 'string'
        elif data_type == 'xsd:date':
            return prefix + 'date'
        else:
            raise Exception("Error: Unknown predicate type")

    def make_prop_solr_field_parts(self, entity):
        """ Makes a solr field for a property """
        output = {}
        output['prefix'] = entity.slug.replace('-', '_')
        output['suffix'] = self.get_solr_field_type(entity.data_type)
        return output

    def process_proj(self, proj_path):
        # TODO docstring
        query_dict = {'fq': [],
                      'facet.field': []}
        fq_terms = []
        project_path_lists = self.expand_hierarchy_options(proj_path)
        for proj_path_list in project_path_lists:
            i = 0
            path_list_len = len(proj_path_list)
            fq_field = SolrDocument.ROOT_PROJECT_SOLR
            fq_path_terms = []
            for proj_slug in proj_path_list:
                found = self.mem_cache_obj.check_entity_found(proj_slug, False)
                if found:
                    entity = self.mem_cache_obj.get_entity(proj_slug, False)
                    # fq_path_term = fq_field + ':' + self.make_solr_value_from_entity(entity)
                    # The below is a bit of a hack. We should have a query field
                    # as with ___pred_ to query just the slug. But this works for now.
                    proj_slug = entity.slug
                    fq_path_term = fq_field + ':' + proj_slug + '*'
                else:
                    fq_path_term = fq_field + ':' + proj_slug
                fq_path_terms.append(fq_path_term)
                fq_field = proj_slug.replace('-', '_') + '___project_id'
                i += 1
                if i >= path_list_len and fq_field not in query_dict['facet.field']:
                    query_dict['facet.field'].append(fq_field)
            final_path_term = ' AND '.join(fq_path_terms)
            final_path_term = '(' + final_path_term + ')'
            fq_terms.append(final_path_term)
        fq_final = ' OR '.join(fq_terms)
        fq_final = '(' + fq_final + ')'
        query_dict['fq'].append(fq_final)
        return query_dict

    def process_ld_object(self, objects):
        # TODO docstring
        query_dict = {'fq': []}
        fq_terms = []
        if not isinstance(objects, list):
            objects = [objects]
        for raw_obj in objects:
            if '||' in raw_obj:
                or_objects = raw_obj.split('||')
            else:
                or_objects = [raw_obj]
            fq_or_terms = []
            for obj in or_objects:
                # find and save the entity to memory
                found = self.mem_cache_obj.check_entity_found(obj, False)
                if found:
                    entity = self.mem_cache_obj.get_entity(obj, False)
                    fq_term = 'object_uri:' + self.escape_solr_arg(entity.uri)
                    fq_term += ' OR text:"' + self.escape_solr_arg(entity.uri) + '"'
                else:
                    fq_term = 'object_uri:' + obj
                fq_or_terms.append(fq_term)
            fq_all_ors = ' OR '.join(fq_or_terms)
            fq_all_ors = '(' + fq_all_ors + ')'
            fq_terms.append(fq_all_ors)
        fq_final = ' AND '.join(fq_terms)
        fq_final = '(' + fq_final + ')'
        query_dict['fq'].append(fq_final)
        return query_dict

    def process_dc_term(self, dc_param, dc_terms, add_facet=False):
        # TODO docstring
        query_dict = {'fq': [],
                      'facet.field': []}
        fq_terms = []
        if dc_param in DCterms.DC_META_FIELDS:
            fq_field = DCterms.DC_META_FIELDS[dc_param]
            if fq_field not in query_dict['facet.field'] and add_facet:
                query_dict['facet.field'].append(fq_field)
            add_to_fq = False
            for raw_dc_term in dc_terms:
                if '||' in raw_dc_term:
                    use_dc_terms = raw_dc_term.split('||')
                else:
                    use_dc_terms = [raw_dc_term]
                fq_path_terms = []
                for dc_term in use_dc_terms:
                    if len(dc_term) > 0:
                        add_to_fq = True
                        # Check if the entity exists, and / or store it in memory.
                        found = self.mem_cache_obj.check_entity_found(dc_term, False)
                        if found:
                            # fq_path_term = fq_field + ':' + self.make_solr_value_from_entity(entity)
                            # The below is a bit of a hack. We should have a query field
                            # as with ___pred_ to query just the slug. But this works for now.
                            entity = self.mem_cache_obj.get_entity(dc_term, False)
                            fq_path_term = fq_field + '_fq:' + entity.slug
                            if dc_param == 'dc-temporal' \
                                    and entity.entity_type == 'vocabulary' \
                                    and 'periodo' in entity.slug:
                                # It's a temporal vocabulary from PeriodO,
                                # so search for specific periods contained
                                # in the vocabulary.
                                fq_path_term = '(' + fq_path_term + \
                                    ' OR ' + fq_path_term + '*)'
                        else:
                            if dc_term[-1] != '*':
                                dc_term += '*'
                            fq_path_term = fq_field + ':' + dc_term
                        fq_path_terms.append(fq_path_term)
                final_path_term = ' AND '.join(fq_path_terms)
                final_path_term = '(' + final_path_term + ')'
                fq_terms.append(final_path_term)
            fq_final = ' OR '.join(fq_terms)
            fq_final = '(' + fq_final + ')'
            if add_to_fq:
                query_dict['fq'].append(fq_final)
        return query_dict

    def get_related_slug_field_prefix(self, slug):
        """ Gets the field prefix for a related property. If the prefix is
            present in the slug, return the solr_field prefix; otherwise
            return an empty '' string.
        """
        field_prefix = SolrDocument.RELATED_SOLR_FIELD_PREFIX
        prefix_len = len(field_prefix)
        slug_start = slug[:prefix_len]
        if slug_start == field_prefix:
            return field_prefix
        else:
            return ''

    def clean_related_slug(self, slug):
        """ removes the field_prefix for related slugs """
        field_prefix = SolrDocument.RELATED_SOLR_FIELD_PREFIX
        prefix_len = len(field_prefix)
        slug_start = slug[:prefix_len]
        if slug_start == field_prefix:
            slug = slug[prefix_len:]
        return slug

    def correct_solr_prefix_for_fq(self, solr_f_prefix, act_field_fq):
        """ makes sure the solr prefix is on the fq if needed """
        if solr_f_prefix != '':
            if solr_f_prefix not in act_field_fq:
                act_field_fq = solr_f_prefix + act_field_fq
        return act_field_fq

    def process_prop(self, props):
        """ Processes 'prop' (property) parameters. Property parameters
            are tricky because they can come in hierarchies; that's why
            there's some complexity to this.
        """
        # is the property for the item itself, or for a related item?
        query_dict = {'fq': [],
                      'facet.field': [],
                      'stats.field': [],
                      'prequery-stats': [],
                      'facet.range': [],
                      'hl-queries': [],
                      'ranges': {}}
        fq_terms = []
        prop_path_lists = self.expand_hierarchy_options(props)
        for prop_path_list in prop_path_lists:
            i = 0
            path_list_len = len(prop_path_list)
            fq_path_terms = []
            act_field_fq = SolrDocument.ROOT_PREDICATE_SOLR
            act_field_data_type = 'id'
            last_field_label = False  # needed for full text highlighting
            predicate_solr_slug = False
            for prop_slug in prop_path_list:
                field_prefix = self.get_related_slug_field_prefix(prop_slug)
                solr_f_prefix = field_prefix.replace('-', '_')
                db_prop_slug = self.clean_related_slug(prop_slug)
                l_prop_entity = False
                pred_prop_entity = False
                require_id_field = False
                if act_field_data_type == 'id':
                    # check entity exists, and save to memory
                    found = self.mem_cache_obj.check_entity_found(db_prop_slug, False)
                    if found:
                        entity = self.mem_cache_obj.get_entity(db_prop_slug, False)
                        last_field_label = entity.label
                        prop_slug = field_prefix + entity.slug
                        if entity.item_type == 'uri' and 'oc-gen' not in db_prop_slug:
                            if entity.entity_type == 'property':
                                pred_prop_entity = True
                                predicate_solr_slug = prop_slug.replace('-', '_')
                                l_prop_entity = True
                                children = self.mem_cache_obj.get_entity_children(entity.uri)
                                if len(children) > 0:
                                    # OK, this field has children. Require it
                                    # to be treated as an ID field.
                                    require_id_field = True
                        else:
                            if entity.item_type == 'predicates':
                                pred_prop_entity = True
                                predicate_solr_slug = prop_slug.replace('-', '_')
                                children = self.mem_cache_obj.get_entity_children(entity.uri)
                                if len(children) > 0:
                                    # OK, this field has children. Require it
                                    # to be treated as an ID field.
                                    require_id_field = True
                        if i == 0:
                            if 'oc-gen' in db_prop_slug:
                                # for open context categories / types
                                act_field_fq = self.get_parent_item_type_facet_field(entity.uri)
                                lr = LinkRecursion()
                                parents = lr.get_jsonldish_entity_parents(entity.uri)
                                if len(parents) > 1:
                                    try:
                                        p_slug = parents[-2]['slug']
                                        act_field_fq = p_slug.replace('-', '_') + '___pred_id'
                                        act_field_fq = self.correct_solr_prefix_for_fq(solr_f_prefix, act_field_fq)
                                    except:
                                        pass
                            elif entity.item_type == 'uri':
                                act_field_fq = SolrDocument.ROOT_LINK_DATA_SOLR
                            elif entity.item_type == 'predicates':
                                temp_field_fq = self.get_parent_item_type_facet_field(entity.uri)
                                parents = self.mem_cache_obj.get_jsonldish_entity_parents(entity.uri)
                                if len(parents) > 1:
                                    try:
                                        p_slug = parents[-2]['slug']
                                        temp_field_fq = p_slug.replace('-', '_') + '___pred_id'
                                    except:
                                        print('Predicate Parent exception: ' + str(parents))
                                        temp_field_fq = False
                                if temp_field_fq is not False:
                                    act_field_fq = temp_field_fq
                                else:
                                    act_field_fq = SolrDocument.ROOT_PREDICATE_SOLR
                            else:
                                act_field_fq = SolrDocument.ROOT_PREDICATE_SOLR
                        # ---------------------------------------------------
                        # THIS PART BUILDS THE FACET-QUERY
                        # fq_path_term = fq_field + ':' + self.make_solr_value_from_entity(entity)
                        # The below is a bit of a hack. We should have a query field
                        # as with ___pred_ to query just the slug. But this works for now.
                        fq_field = act_field_fq + '_fq'
                        if path_list_len >= 2 and act_field_data_type == 'id':
                            # could be an object deeper in the hierarchy,
                            # so allow the obj_all version
                            fq_path_term = '(' + fq_field + ':' + prop_slug
                            fq_path_term += ' OR obj_all___' + fq_field + ':' + prop_slug + ')'
                        else:
                            fq_path_term = fq_field + ':' + prop_slug
                        fq_path_terms.append(fq_path_term)
                        # ---------------------------------------------------
                        #
                        # ---------------------------------------------------
                        # THIS PART PREPARES FOR LOOPING OR FINAL FACET-FIELDS
                        #
                        # print('pred-solr-slug: ' + predicate_solr_slug)
                        field_parts = self.make_prop_solr_field_parts(entity)
                        act_field_data_type = field_parts['suffix']
                        if require_id_field:
                            act_field_data_type = 'id'
                            field_parts['suffix'] = 'id'
                        # check if the last or penultimate field has
                        # a different data-type (for linked-data)
                        if i >= (path_list_len - 2) \
                                and l_prop_entity:
                            dtypes = self.mem_cache_obj.get_dtypes(entity.uri)
                            if isinstance(dtypes, list):
                                # set the data type and the act-field
                                found = self.mem_cache_obj.check_entity_found(db_prop_slug, False)
                                if found:
                                    entity = self.mem_cache_obj.get_entity(db_prop_slug, False)
                                    entity.date_type = dtypes[0]  # store for later use
                                    self.mem_cache_obj.entities[db_prop_slug] = entity  # store for later use
                                act_field_data_type = self.get_solr_field_type(dtypes[0])
                        if predicate_solr_slug is False or pred_prop_entity:
                            act_field_fq = field_parts['prefix'] + '___pred_' + field_parts['suffix']
                            act_field_fq = self.correct_solr_prefix_for_fq(solr_f_prefix, act_field_fq)
                            # get a facet on this field
                            if act_field_data_type != 'string':
                                # adds a prefix for related properties
                                ffield = solr_f_prefix + field_parts['prefix'] + '___pred_' + field_parts['suffix']
                                if ffield not in query_dict['facet.field'] \
                                        and i >= (path_list_len - 1):
                                    query_dict['facet.field'].append(ffield)
                        else:
                            if act_field_data_type == 'id':
                                act_field_fq = 'obj_all___' + predicate_solr_slug \
                                    + '___pred_' + field_parts['suffix']
                                # get a facet on this field
                                if predicate_solr_slug != field_parts['prefix']:
                                    # The predicate_solr_slug is not the
                                    # prefix of the current field part, meaning
                                    # the field_parts['prefix'] is the type, and
                                    # we want facets for the predicate -> type.
                                    ffield = field_parts['prefix'] \
                                        + '___' \
                                        + predicate_solr_slug \
                                        + '___pred_' + field_parts['suffix']
                                else:
                                    # get facets for the predicate
                                    ffield = field_parts['prefix'] \
                                        + '___pred_' \
                                        + field_parts['suffix']
                                # adds a prefix, in case of a related property
                                ffield = solr_f_prefix + ffield
                                if ffield not in query_dict['facet.field'] \
                                        and i >= (path_list_len - 1):
                                    query_dict['facet.field'].append(ffield)
                            else:
                                act_field_fq = predicate_solr_slug + '___pred_' + field_parts['suffix']
                        # -------------------------------------------
                        if act_field_data_type == 'numeric':
                            # print('Numeric field: ' + act_field)
                            act_field_fq = field_parts['prefix'] + '___pred_numeric'
                            act_field_fq = self.correct_solr_prefix_for_fq(solr_f_prefix, act_field_fq)
                            query_dict = self.add_math_facet_ranges(query_dict,
                                                                    act_field_fq,
                                                                    entity)
                        elif act_field_data_type == 'date':
                            # print('Date field: ' + act_field)
                            act_field_fq = field_parts['prefix'] + '___pred_date'
                            act_field_fq = self.correct_solr_prefix_for_fq(solr_f_prefix, act_field_fq)
                            query_dict = self.add_date_facet_ranges(query_dict,
                                                                    act_field_fq,
                                                                    entity)
                        # print('Current data type (' + str(i) + '): ' + act_field_data_type)
                        # print('Current field (' + str(i) + '): ' + act_field_fq)
                    i += 1
                elif act_field_data_type == 'string':
                    # case for a text search
                    # last_field_label = False  # turn off using the field label for highlighting
                    string_terms = self.prep_string_search_term(prop_slug)
                    for escaped_term in string_terms:
                        search_term = act_field_fq + ':' + escaped_term
                        if last_field_label is False:
                            query_dict['hl-queries'].append(escaped_term)
                        else:
                            query_dict['hl-queries'].append(last_field_label + ' ' + escaped_term)
                        fq_path_terms.append(search_term)
                elif act_field_data_type == 'numeric':
                    # numeric search; assume it's a well formed solr numeric request
                    search_term = act_field_fq + ':' + prop_slug
                    fq_path_terms.append(search_term)
                    # now limit the numeric ranges from the query to the range facets
                    query_dict = self.add_math_facet_ranges(query_dict,
                                                            act_field_fq,
                                                            False,
                                                            prop_slug)
                elif act_field_data_type == 'date':
                    # date search; assume it's a well formed solr request
                    search_term = act_field_fq + ':' + prop_slug
                    fq_path_terms.append(search_term)
                    # now limit the date ranges from the query to the range facets
                    query_dict = self.add_date_facet_ranges(query_dict,
                                                            act_field_fq,
                                                            False,
                                                            prop_slug)
            final_path_term = ' AND '.join(fq_path_terms)
            final_path_term = '(' + final_path_term + ')'
            fq_terms.append(final_path_term)
        fq_final = ' OR '.join(fq_terms)
        fq_final = '(' + fq_final + ')'
        query_dict['fq'].append(fq_final)
        return query_dict

    def add_math_facet_ranges(self, query_dict, act_field,
                              entity=False, solr_query=False):
        """ this does some math for facet ranges for numeric fields """
        ok = False
        groups = self.histogram_groups
        fstart = 'f.' + act_field + '.facet.range.start'
        fend = 'f.' + act_field + '.facet.range.end'
        fgap = 'f.' + act_field + '.facet.range.gap'
        findex = 'f.' + act_field + '.facet.sort'
        fother = 'f.' + act_field + '.facet.range.other'
        finclude = 'f.' + act_field + '.facet.range.include'
        if entity is not False:
            # This is a field with no value limits;
            # we need to do a stats-prequery first.
            query_dict['prequery-stats'].append(act_field)
        else:
            if solr_query is not False:
                vals = []
                # get the numbers out
                q_nums_strs = re.findall(r'[-+]?\d*\.\d+|\d+', solr_query)
                for q_num_str in q_nums_strs:
                    vals.append(float(q_num_str))
                vals.sort()
                if len(vals) > 1:
                    ok = True
                    min_val = vals[0]
                    max_val = vals[-1]
        if ok:
            if act_field not in query_dict['stats.field']:
                query_dict['stats.field'].append(act_field)
            if act_field not in query_dict['facet.range']:
                query_dict['facet.range'].append(act_field)
            query_dict['ranges'][fother] = 'all'
            query_dict['ranges'][finclude] = 'all'
            query_dict['ranges'][fstart] = min_val
            query_dict['ranges'][fend] = max_val
            query_dict['ranges'][fgap] = (max_val - min_val) / groups
            query_dict['ranges'][findex] = 'index'  # sort by index, not by count
        return query_dict

    def add_date_facet_ranges(self, query_dict, act_field,
                              entity=False, solr_query=False):
        """ this does some math for facet ranges for date fields """
        ok = False
        groups = 4
        fstart = 'f.' + act_field + '.facet.range.start'
        fend = 'f.' + act_field + '.facet.range.end'
        fgap = 'f.' + act_field + '.facet.range.gap'
        findex = 'f.' + act_field + '.facet.sort'
        fother = 'f.' + act_field + '.facet.range.other'
        finclude = 'f.' + act_field + '.facet.range.include'
        if entity is not False:
            # This is a field with no value limits;
            # we need to do a stats-prequery first.
            query_dict['prequery-stats'].append(act_field)
        else:
            if solr_query is not False:
                q_dt_strs = re.findall(r'\d{4}-\d{2}-\d{2}[T:]\d{2}:\d{2}:\d{2}', solr_query)
                if len(q_dt_strs) < 2:
                    # try a less strict regular expression to get dates
                    q_dt_strs = re.findall(r'\d{4}-\d{2}-\d{2}', solr_query)
                if len(q_dt_strs) >= 2:
                    ok = True
                    vals = []
                    for q_dt_str in q_dt_strs:
                        vals.append(q_dt_str)
                    vals.sort()
                    min_val = vals[0]
                    max_val = vals[1]
        if ok:
            if act_field not in query_dict['stats.field']:
                query_dict['stats.field'].append(act_field)
            if act_field not in query_dict['facet.range']:
                query_dict['facet.range'].append(act_field)
            query_dict['ranges'][fother] = 'all'
            query_dict['ranges'][finclude] = 'all'
            query_dict['ranges'][fstart] = self.convert_date_to_solr_date(min_val)
            query_dict['ranges'][fend] = self.convert_date_to_solr_date(max_val)
            query_dict['ranges'][fgap] = self.get_date_difference_for_solr(min_val, max_val, groups)
            query_dict['ranges'][findex] = 'index'  # sort by index, not by count
        return query_dict

    def get_date_difference_for_solr(self, min_date, max_date, groups):
        """ Gets a solr date difference from two values """
        min_dt = self.date_convert(min_date)
        max_dt = self.date_convert(max_date)
        dif_dt = (max_dt - min_dt) / groups
        if dif_dt.days >= 366:
            solr_val = int(round((dif_dt.days / 365.25), 0))
            solr_dif = '+' + str(solr_val) + 'YEAR'
        elif dif_dt.days >= 31:
            solr_val = int(round((dif_dt.days / 30), 0))
            solr_dif = '+' + str(solr_val) + 'MONTH'
        elif dif_dt.days >= 1:
            solr_val = int(round(dif_dt.days, 0))
            solr_dif = '+' + str(solr_val) + 'DAY'
        elif (dif_dt.seconds // 3600) >= 1:
            solr_val = int(round((dif_dt.seconds // 3600), 0))
            solr_dif = '+' + str(solr_val) + 'HOUR'
        elif ((dif_dt.seconds % 3600) // 60) >= 1:
            solr_val = int(round(((dif_dt.seconds % 3600) // 60), 0))
            solr_dif = '+' + str(solr_val) + 'MINUTE'
        elif dif_dt.seconds >= 1:
            solr_val = int(round(dif_dt.seconds, 0))
            solr_dif = '+' + str(solr_val) + 'SECOND'
        else:
            solr_dif = '+1YEAR'
        return solr_dif

    def add_solr_gap_to_date(self, date_val, solr_gap):
        """ adds a solr gap to a date_val """
        solr_val = re.sub(r'[^\d.]', r'', solr_gap)
        solr_val = int(float(solr_val))
        dt = self.date_convert(date_val)
        if 'YEAR' in solr_gap:
            dt = dt + datetime.timedelta(days=int(round((solr_val * 365.25), 0)))
        elif 'MONTH' in solr_gap:
            dt = dt + datetime.timedelta(days=(solr_val * 30))
        elif 'DAY' in solr_gap:
            dt = dt + datetime.timedelta(days=solr_val)
        elif 'HOUR' in solr_gap:
            dt = dt + datetime.timedelta(hours=solr_val)
        elif 'MINUTE' in solr_gap:
            dt = dt + datetime.timedelta(minutes=solr_val)
        elif 'SECOND' in solr_gap:
            dt = dt + datetime.timedelta(seconds=solr_val)
        else:
            dt = dt
        return dt

    def convert_date_to_solr_date(self, date_val):
        """ Converts a string for a date into a Solr formatted datetime string """
        dt = self.date_convert(date_val)
        return dt.strftime('%Y-%m-%dT%H:%M:%SZ')

    def make_human_readable_date(self, date_val):
        """ Converts a date value into something easier to read """
        dt = self.date_convert(date_val)
        check_date = dt.strftime('%Y-%m-%d')
        check_dt = self.date_convert(date_val)
        if check_dt == dt:
            return check_date
        else:
            return dt.strftime('%Y-%m-%d:%H:%M:%S')

    def date_convert(self, date_val):
        """ converts to a python datetime if not already so """
        if isinstance(date_val, str):
            date_val = date_val.replace('Z', '')
            dt = datetime.datetime.strptime(date_val, '%Y-%m-%dT%H:%M:%S')
        else:
            dt = date_val
        return dt

    def get_parent_item_type_facet_field(self, category_uri):
        """ Gets the parent facet field for a given category_uri. This
            assumes the category_uri is an entity that exists in the database.
        """
        output = False
        parents = LinkRecursion().get_jsonldish_entity_parents(category_uri)
        for par in parents:
            if par['slug'] in self.TYPE_MAPPINGS.values():
                # the parent exists in the Type Mappings
                output = par['slug'].replace('-', '_') + '___pred_id'
                break
        return output

    def get_parent_entity_facet_field(self, entity_uri):
        """ Gets the parent facet field for a given entity_uri. This
            assumes the entity_uri is an entity that exists in the database.
        """
        output = False
        parents = LinkRecursion().get_jsonldish_entity_parents(entity_uri)
        if isinstance(parents, list):
            if len(parents) > 1:
                # get the penultimate field
                output = parents[-2]['slug'].replace('-', '_') + '___pred_id'
        return output

    def process_item_type(self, raw_item_type):
        # TODO docstring
        query_dict = {'fq': [],
                      'facet.field': []}
        fq_terms = []
        item_type_lists = self.expand_hierarchy_options(raw_item_type)
        for item_type_list in item_type_lists:
            i = 0
            path_list_len = len(item_type_list)
            fq_path_terms = []
            # no hierarchy in this field, just the type
            item_type = item_type_list[0]
            fq_term = 'item_type:' + item_type
            fq_terms.append(fq_term)
            if item_type in self.TYPE_MAPPINGS:
                act_field = self.TYPE_MAPPINGS[item_type].replace('-', '_') + '___pred_id'
                query_dict['facet.field'].append(act_field)
        fq_final = ' OR '.join(fq_terms)
        fq_final = '(' + fq_final + ')'
        query_dict['fq'].append(fq_final)
        return query_dict

    def process_id(self, identifier):
        # check for identifier
        query_dict = {'fq': [],
                      'facet.field': []}
        fq_terms = []
        escape_id = self.escape_solr_arg(identifier)
        fq_terms.append('persistent_uri:' + escape_id)
        # now make a DOI URI in case this is just a naked DOI
        doi_uri = self.escape_solr_arg('http://dx.doi.org/' + identifier)
        fq_terms.append('persistent_uri:' + doi_uri)
        # now make an ARK URI in case this is just a naked ARK
        ark_uri = self.escape_solr_arg('http://n2t.net/' + identifier)
        fq_terms.append('persistent_uri:' + ark_uri)
        # now make an ORCID URI in case this is just a naked ORCID
        orcid_uri = self.escape_solr_arg('http://orcid.org/' + identifier)
        fq_terms.append('persistent_uri:' + orcid_uri)
        fq_terms.append('uuid:' + escape_id)
        tcheck = URImanagement.get_uuid_from_oc_uri(identifier, True)
        if tcheck is not False:
            uuid = tcheck['uuid']
            fq_terms.append('uuid:' + uuid)
        fq_final = ' OR '.join(fq_terms)
        fq_final = '(' + fq_final + ')'
        query_dict['fq'].append(fq_final)
        # print(fq_final)
        return query_dict

    def process_form_use_life_chrono(self, raw_form_use_life_chrono):
        # Creates a facet query for form-use-life chronological tiles.
        # Supports OR ('||') queries in the path also.
        query_dict = {'fq': [],
                      'facet.field': []}
        fq_terms = []
        query_dict['facet.field'].append('form_use_life_chrono_tile')
        if '||' in raw_form_use_life_chrono:
            chrono_paths = raw_form_use_life_chrono.split('||')
        else:
            chrono_paths = [raw_form_use_life_chrono]
        for chrono_path in chrono_paths:
            i = 0
            if len(chrono_path) < 30:
                chrono_path += '*'
            fq_term = 'form_use_life_chrono_tile:' + chrono_path
            fq_terms.append(fq_term)
        fq_final = ' OR '.join(fq_terms)
        fq_final = '(' + fq_final + ')'
        query_dict['fq'].append(fq_final)
        return query_dict

    def process_form_date_chrono(self, form_use_life_date, date_type):
        # Creates a facet query for form-use-life dates.
        # Supports OR ('||') queries in the path also.
        query_dict = {'fq': [],
                      'facet.field': []}
        if date_type == 'start':
            qterm = '[' + str(form_use_life_date) + ' TO *]'
            fquery = 'form_use_life_chrono_earliest: ' + qterm
        else:
            qterm = '[* TO ' + str(form_use_life_date) + ']'
            fquery = 'form_use_life_chrono_latest: ' + qterm
        query_dict['fq'].append(fquery)
        return query_dict

    def process_discovery_geo(self, raw_disc_geo):
        # Creates a facet query for discovery geotiles.
        # Supports OR ('||') queries in the path also.
        query_dict = {'fq': [],
                      'facet.field': []}
        fq_terms = []
        query_dict['facet.field'].append('discovery_geotile')
        if '||' in raw_disc_geo:
            disc_geo_paths = raw_disc_geo.split('||')
        else:
            disc_geo_paths = [raw_disc_geo]
        for disc_path in disc_geo_paths:
            i = 0
            if len(disc_path) < 20:
                disc_path += '*'
            fq_term = 'discovery_geotile:' + disc_path
            fq_terms.append(fq_term)
        fq_final = ' OR '.join(fq_terms)
        fq_final = '(' + fq_final + ')'
        query_dict['fq'].append(fq_final)
        return query_dict

    def process_discovery_bbox(self, raw_disc_bbox):
        # Creates a facet query for bounding box searches.
        # Supports OR ('||') queries.
        query_dict = {'fq': []}
        fq_terms = []
        if '||' in raw_disc_bbox:
            bbox_list = raw_disc_bbox.split('||')
        else:
            bbox_list = [raw_disc_bbox]
        for bbox in bbox_list:
            if ',' in bbox:
                # comma separated list of coordinates
                bbox_coors = bbox.split(',')
                bbox_valid = self.validate_bbox_coordiantes(bbox_coors)
                if bbox_valid:
                    # Valid bounding box, now make a solr-query.
                    # Note how solr expects latitude / longitude order, which
                    # is the reverse of GeoJSON!
                    fq_term = 'discovery_geolocation:'
                    fq_term += '[' + str(bbox_coors[1]) + ',' + str(bbox_coors[0])
                    fq_term += ' TO ' + str(bbox_coors[3]) + ',' + str(bbox_coors[2])
                    fq_term += ']'
                    fq_terms.append(fq_term)
        if len(fq_terms) > 0:
            fq_final = ' OR '.join(fq_terms)
            fq_final = '(' + fq_final + ')'
            query_dict['fq'].append(fq_final)
        return query_dict

    def validate_bbox_coordiantes(self, bbox_coors):
        """ validates a set of bounding box coordinates """
        is_valid = False
        if len(bbox_coors) == 4:
            lower_left_valid = self.validate_geo_lon_lat(bbox_coors[0], bbox_coors[1])
            top_right_valid = self.validate_geo_lon_lat(bbox_coors[2], bbox_coors[3])
            # print('ok: ' + str(lower_left_valid) + ' ' + str(top_right_valid))
            if lower_left_valid and top_right_valid:
                if float(bbox_coors[0]) < float(bbox_coors[2]) and \
                        float(bbox_coors[1]) < float(bbox_coors[3]):
                    is_valid = True
        return is_valid

    def validate_geo_lon_lat(self, lon, lat):
        """ Checks to see if a lon, lat pair are valid. Note the
            GeoJSON ordering of the coordinates.
        """
        is_valid = False
        lon_valid = self.validate_geo_coordinate(lon, 'lon')
        lat_valid = self.validate_geo_coordinate(lat, 'lat')
        if lon_valid and lat_valid:
            is_valid = True
        return is_valid

    def validate_geo_coordinate(self, coordinate, coord_type):
        """ validates a geo-spatial coordinate """
        is_valid = False
        try:
            fl_coord = float(coordinate)
        except ValueError:
            fl_coord = False
        if fl_coord is not False:
            if 'lat' in coord_type:
                if fl_coord <= 90 and \
                        fl_coord >= -90:
                    is_valid = True
            elif 'lon' in coord_type:
                if fl_coord <= 180 and \
                        fl_coord >= -180:
                    is_valid = True
        return is_valid

    def make_solr_value_from_entity(self, entity, value_type='id'):
        """ Makes a solr value as indexed in SolrDocument;
            see _concat_solr_string_value.
        """
        id_part = entity.uri
        if 'http://opencontext.org' in entity.uri:
            if '/vocabularies/' not in entity.uri:
                id_part = entity.uri.split('http://opencontext.org')[1]
        return entity.slug + '___' + value_type + '___' + \
            id_part + '___' + entity.label

    def _process_spatial_context(self, spatial_context=None):
        # TODO docstring
        context = {}
        if spatial_context:
            context_paths = self._get_context_paths(spatial_context)
            context_slugs = self._get_valid_context_slugs(context_paths)
            # print('Context slugs: ' + str(context_slugs))
            # If we cannot find a valid context, raise a 404
            if not context_slugs:
                raise Http404
            # Solr 'fq' parameters
            parent_child_slugs = []
            # Solr 'facet.field' parameters
            facet_field = []
            for slug in context_slugs:
                # fq parameters
                parent_child_slugs.append(self._get_parent_slug(slug) + '___' + slug)
                # facet.field parameters
                facet_field.append(slug.replace('-', '_') + '___context_id')
            # First, handle the most likely scenario of a single context
            if len(parent_child_slugs) == 1:
                context['fq'] = self._prepare_filter_query(parent_child_slugs[0])
            # Otherwise, combine multiple contexts into an OR filter
            else:
                fq_string = ' OR '.join(
                    (self._prepare_filter_query(slug_set) for slug_set
                     in parent_child_slugs)
                )
                context['fq'] = '(' + fq_string + ')'
            context['facet.field'] = facet_field
        # No spatial context provided
        else:
            context['fq'] = None
            context['facet.field'] = ['root___context_id']
        return context

    def prep_string_search_term(self, raw_term):
        """ Prepares a string search; returns a list of search terms
            for AND queries.
        """
        if '"' in raw_term:
            nq_term = raw_term.replace('"', ' ')  # get rid of quotes in the search term
            quoted_list = re.findall(r"\"(.*?)\"", raw_term)
            terms = []
            terms.append(self.escape_solr_arg(nq_term))
            for quote_item in quoted_list:
                quote_item = self.escape_solr_arg(quote_item)  # escape characters
                quote_item = '"' + quote_item + '"'  # put quotes back around it
                terms.append(quote_item)
        else:
            terms = []
            terms.append(self.escape_solr_arg(raw_term))
        return terms

    def escaped_seq(self, term):
        """ Yield the next string based on the next character
            (either this char or its escaped version).
        """
        escaperules = {'+': r'\+',
                       '-': r'\-',
                       '&': r'\&',
                       '|': r'\|',
                       '!': r'\!',
                       '(': r'\(',
                       ')': r'\)',
                       '{': r'\{',
                       '}': r'\}',
                       '[': r'\[',
                       ']': r'\]',
                       '^': r'\^',
                       '~': r'\~',
                       '*': r'\*',
                       '?': r'\?',
                       ':': r'\:',
                       '"': r'\"',
                       ';': r'\;',
                       ' ': r'\ '}
        for char in term:
            if char in escaperules.keys():
                yield escaperules[char]
            else:
                yield char

    def escape_solr_arg(self, term):
        """ Apply escaping to the passed in query terms,
            escaping special characters like :, etc.
        """
        term = term.replace('\\', r'\\')  # escape \ first
        return "".join([next_str for next_str in self.escaped_seq(term)])

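# The context and hierarchy expansion in the class above is plain
# itertools.product over '||'-delimited alternatives. A small,
# self-contained sketch of the same permutation logic (standard library
# only; the function name is hypothetical), mirroring the doctest in
# _get_context_paths:
import itertools

def expand_or_paths(spatial_context, or_delim='||', sep='/'):
    """Expand 'A/B||C' style context paths into ['A/B', 'A/C']."""
    # Split into path segments, then each segment into its OR-alternatives.
    context_lists = (part.split(or_delim) for part in spatial_context.split(sep))
    # Every combination of one alternative per segment is a candidate path.
    return [sep.join(combo) for combo in itertools.product(*context_lists)]

# expand_or_paths('Turkey/Domuztepe/I||II||Stray')
# -> ['Turkey/Domuztepe/I', 'Turkey/Domuztepe/II', 'Turkey/Domuztepe/Stray']
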
class QueryMaker(): # main item-types mapped to their slugs to get solr-facet field prefix TYPE_MAPPINGS = {'subjects': 'oc-gen-subjects', 'media': 'oc-gen-media', 'documents': 'oc-gen-documents', 'persons': 'oc-gen-persons', 'projects': 'oc-gen-projects', 'types': 'oc-gen-types', 'predicates': 'oc-gen-predicates', 'tables': 'oc-gen-tables'} TYPE_URIS = {'subjects': 'oc-gen:subjects', 'media': 'oc-gen:media', 'documents': 'oc-gen:documents', 'persons': 'oc-gen:persons', 'projects': 'oc-gen:projects', 'types': 'oc-gen:types', 'predicates': 'oc-gen:predicates', 'tables': 'oc-gen:tables'} def __init__(self): self.error = False self.histogram_groups = 10 self.m_cache = MemoryCache() # memory caching object self.s_cache = SearchGenerationCache() # supplemental caching object, specific for searching def _get_context_paths(self, spatial_context): ''' Takes a context path and returns an iterator with the list of possible contexts. Parses the list of boolean '||' (OR) and returns a list of contexts. For example: >>> _get_context_paths('Turkey/Domuztepe/I||II||Stray') ['Turkey/Domuztepe/I', 'Turkey/Domuztepe/II', 'Turkey/Domuztepe/Stray'] ''' # Split the context path by '/' and then by '||' context_lists = (value.split('||') for value in spatial_context.split('/')) # Create a list of the various permutations context_tuple_list = list(itertools.product(*context_lists)) # Turn the lists back into URIs return ('/'.join(value) for value in context_tuple_list) def _get_context_depth(self, spatial_context): ''' Takes a context path and returns its depth as an interger. For example, the context '/Turkey/Domuztepe' would have a depth of 2. ''' # Remove a possible trailing slash before calculating the depth return len(spatial_context.rstrip('/').split('/')) def _get_valid_context_slugs(self, contexts): ''' Takes a list of contexts and, for valid contexts, returns a list of slugs ''' valid_context_slugs = [] context_list = list(contexts) for context in context_list: # Verify that the contexts are valid # find and save the enity to memory context = context.replace('+', ' ') context = context.replace('%20', ' ') # print('check: ' + context) entity = self.m_cache.get_entity_by_context(context) if entity: valid_context_slugs.append(entity.slug) # print('context-slugs: ' + str(valid_context_slugs)) return valid_context_slugs def _get_parent_slug(self, slug): ''' Takes a slug and returns the slug of its parent. Returns 'root' if a slug has no parent. ''' cache_key = self.m_cache.make_cache_key('par-slug', slug) parent_slug = self.m_cache.get_cache_object(cache_key) if parent_slug is None: contain_obj = Containment() contain_obj.use_cache = False # because it seems to introduce memory errors parent_slug = contain_obj.get_parent_slug_by_slug(slug) self.m_cache.save_cache_object(cache_key, parent_slug) if parent_slug: return parent_slug else: return 'root' def _prepare_filter_query(self, parent_child_slug): # TODO docstring parent_child_set = parent_child_slug.split('___') return parent_child_set[0].replace('-', '_') + '___context_id_fq:' + \ parent_child_set[1] def expand_hierarchy_options(self, path_param_val, hier_delim='---', or_delim='||'): """ Exapands a hiearchic path string into a list of listed hierachically ordered items. This method also makes a new hiearchic ordered list if there is an 'or_delim'. 
""" if isinstance(path_param_val, list): inital_path_list = path_param_val else: inital_path_list = [path_param_val] path_list = [] for path_string in inital_path_list: raw_path_list = (value.split(or_delim) for value in path_string.split(hier_delim)) # Create a list of the various permutations path_tuple_list = list(itertools.product(*raw_path_list)) for item in path_tuple_list: path_list.append(list(item)) return path_list def get_solr_field_type(self, data_type, prefix=''): ''' Defines whether our dynamic solr fields names for predicates end with ___pred_id, ___pred_numeric, etc. ''' if data_type in ['@id', 'id', False]: return prefix + 'id' elif data_type in ['xsd:integer', 'xsd:double', 'xsd:boolean']: return prefix + 'numeric' elif data_type == 'xsd:string': return prefix + 'string' elif data_type == 'xsd:date': return prefix + 'date' else: raise Exception("Error: Unknown predicate type") def make_prop_solr_field_parts(self, entity): """ Makes a solr field for a property """ output = {} output['prefix'] = entity.slug.replace('-', '_') output['suffix'] = self.get_solr_field_type(entity.data_type) return output def process_proj(self, proj_path): # TODO docstring query_dict = {'fq': [], 'facet.field': []} fq_terms = [] project_path_lists = self.expand_hierarchy_options(proj_path) for proj_path_list in project_path_lists: i = 0 path_list_len = len(proj_path_list) fq_field = SolrDocument.ROOT_PROJECT_SOLR fq_path_terms = [] for proj_slug in proj_path_list: entity = self.m_cache.get_entity(proj_slug) if entity: # fq_path_term = fq_field + ':' + self.make_solr_value_from_entity(entity) # the below is a bit of a hack. We should have a query field # as with ___pred_ to query just the slug. But this works for now proj_slug = entity.slug if len(proj_slug) > 56: proj_slug = proj_slug[0:56] fq_path_term = fq_field + ':' + proj_slug + '*' if entity.par_proj_man_obj is not False and \ fq_field == SolrDocument.ROOT_PROJECT_SOLR: # this entity has a parent object, so make sure to look for it as a child of # that parent project alt_fq_field = entity.par_proj_man_obj.slug.replace('-', '_') + '___project_id' alt_fq_term = alt_fq_field + ':' + proj_slug + '*' fq_path_term = ' (' + fq_path_term + ' OR ' + alt_fq_term + ' ) ' else: fq_path_term = fq_field + ':' + proj_slug fq_path_terms.append(fq_path_term) fq_field = proj_slug.replace('-', '_') + '___project_id' i += 1 if i >= path_list_len and fq_field not in query_dict['facet.field']: query_dict['facet.field'].append(fq_field) final_path_term = ' AND '.join(fq_path_terms) final_path_term = '(' + final_path_term + ')' fq_terms.append(final_path_term) fq_final = ' OR '.join(fq_terms) fq_final = '(' + fq_final + ')' query_dict['fq'].append(fq_final) return query_dict def process_ld_object(self, objects): # TODO docstring query_dict = {'fq': []} fq_terms = [] if not isinstance(objects, list): objects = [objects] for raw_obj in objects: if '||' in raw_obj: or_objects = raw_obj.split('||') else: or_objects = [raw_obj] fq_or_terms = [] for obj in or_objects: # find and save the entity to memory entity = self.m_cache.get_entity(obj) if entity: fq_term = 'object_uri:' + self.escape_solr_arg(entity.uri) fq_term += ' OR text:"' + self.escape_solr_arg(entity.uri) + '"' else: fq_term = 'object_uri:' + obj fq_or_terms.append(fq_term) fq_all_ors = ' OR '.join(fq_or_terms) fq_all_ors = '(' + fq_all_ors + ')' fq_terms.append(fq_all_ors) fq_final = ' AND '.join(fq_terms) fq_final = '(' + fq_final + ')' query_dict['fq'].append(fq_final) return query_dict def 
process_dc_term(self, dc_param, dc_terms, add_facet=False): # TODO docstring query_dict = {'fq': [], 'facet.field': []} fq_terms = [] if dc_param in DCterms.DC_META_FIELDS: fq_field = DCterms.DC_META_FIELDS[dc_param] if fq_field not in query_dict['facet.field'] and add_facet: query_dict['facet.field'].append(fq_field) add_to_fq = False for raw_dc_term in dc_terms: if '||' in raw_dc_term: use_dc_terms = raw_dc_term.split('||') else: use_dc_terms = [raw_dc_term] fq_path_terms = [] for dc_term in use_dc_terms: if len(dc_term) > 0: add_to_fq = True # check if entity exists, and or store in memory entity = self.m_cache.get_entity(dc_term) if entity: # fq_path_term = fq_field + ':' + self.make_solr_value_from_entity(entity) # the below is a bit of a hack. We should have a query field # as with ___pred_ to query just the slug. But this works for now fq_path_term = '(' + fq_field + '_fq:' + entity.slug + ')' fq_path_term += ' OR (' + fq_field + ':' + entity.slug + '*)' fq_path_term += ' OR (obj_all___' + fq_field + ':' + entity.slug + '___*)' fq_path_term += '(' + fq_path_term + ')' # print('vocab: ' + str(entity.vocabulary)) if entity.vocabulary == entity.label: par_slug_part = entity.slug.replace('-', '_') child_facet_field = par_slug_part + '___' + fq_field print('adding: ' + child_facet_field) query_dict['facet.field'].append(child_facet_field) if dc_param == 'dc-temporal' \ and entity.entity_type == 'vocabulary' \ and 'periodo' in entity.slug: # it's a temporal vocabulary from periodo # so search for specific periods contained in # the vocabulary fq_path_term = '(' + fq_path_term +\ ' OR ' + fq_path_term + '*)' else: if dc_term[-1] != '*': dc_term += '*' fq_path_term = fq_field + ':' + dc_term fq_path_terms.append(fq_path_term) final_path_term = ' OR '.join(fq_path_terms) final_path_term = '(' + final_path_term + ')' fq_terms.append(final_path_term) fq_final = ' AND '.join(fq_terms) fq_final = '(' + fq_final + ')' if add_to_fq: query_dict['fq'].append(fq_final) return query_dict def get_related_slug_field_prefix(self, slug): """ gets the field prefix for a related property if it is present in the slug, then return the solr_field prefix otherwise return a '' string """ field_prefix = SolrDocument.RELATED_SOLR_FIELD_PREFIX prefix_len = len(field_prefix) slug_start = slug[:prefix_len] if slug_start == field_prefix: return field_prefix else: return '' def clean_related_slug(self, slug): """ removes the field_prefix for related slugs """ field_prefix = SolrDocument.RELATED_SOLR_FIELD_PREFIX prefix_len = len(field_prefix) slug_start = slug[:prefix_len] if slug_start == field_prefix: slug = slug[prefix_len:] return slug def correct_solr_prefix_for_fq(self, solr_f_prefix, act_field_fq): """ makes sure the solr prefix is on the fq if needed """ if solr_f_prefix != '': if solr_f_prefix not in act_field_fq: act_field_fq = solr_f_prefix + act_field_fq return act_field_fq def process_prop(self, props): """ processes 'prop' (property) parameters property parameters are tricky because they can come in hierarchies that's why there's some complexity to this """ # is the property for the item itself, or for a related item? 

    def process_prop(self, props):
        """ Processes 'prop' (property) parameters.
            Property parameters are tricky because they can come in
            hierarchies. That's why there's some complexity to this.
        """
        # Is the property for the item itself, or for a related item?
        query_dict = {'fq': [],
                      'facet.field': [],
                      'stats.field': [],
                      'prequery-stats': [],
                      'facet.range': [],
                      'hl-queries': [],
                      'ranges': {}}
        fq_terms = []
        prop_path_lists = self.expand_hierarchy_options(props)
        for prop_path_list in prop_path_lists:
            i = 0
            path_list_len = len(prop_path_list)
            fq_path_terms = []
            act_field_fq = SolrDocument.ROOT_PREDICATE_SOLR
            act_field_data_type = 'id'
            last_field_label = False  # needed for full-text highlighting
            predicate_solr_slug = False
            for prop_slug in prop_path_list:
                field_prefix = self.get_related_slug_field_prefix(prop_slug)
                solr_f_prefix = field_prefix.replace('-', '_')
                db_prop_slug = self.clean_related_slug(prop_slug)
                l_prop_entity = False
                pred_prop_entity = False
                require_id_field = False
                if act_field_data_type == 'id':
                    # Check the entity exists, and save it to memory.
                    entity = self.m_cache.get_entity(db_prop_slug)
                    if entity:
                        last_field_label = entity.label
                        prop_slug = field_prefix + entity.slug
                        if entity.item_type == 'uri' and not db_prop_slug.startswith('oc-gen'):
                            if entity.entity_type == 'property':
                                pred_prop_entity = True
                                predicate_solr_slug = prop_slug.replace('-', '_')
                            l_prop_entity = True
                            children = LinkRecursion().get_entity_children(entity.uri)
                            if len(children) > 0:
                                # OK, this field has children. Require it
                                # to be treated as an ID field.
                                require_id_field = True
                        else:
                            if entity.item_type == 'predicates':
                                pred_prop_entity = True
                                predicate_solr_slug = prop_slug.replace('-', '_')
                            children = LinkRecursion().get_entity_children(entity.uri)
                            if len(children) > 0:
                                # OK, this field has children. Require it
                                # to be treated as an ID field.
                                require_id_field = True
                        if i == 0:
                            if db_prop_slug.startswith('oc-gen'):
                                # For Open Context categories / types.
                                act_field_fq = self.get_parent_item_type_facet_field(entity.uri)
                                lr = LinkRecursion()
                                parents = lr.get_jsonldish_entity_parents(entity.uri)
                                if len(parents) > 1:
                                    try:
                                        p_slug = parents[-2]['slug']
                                        act_field_fq = p_slug.replace('-', '_') + '___pred_id'
                                        act_field_fq = self.correct_solr_prefix_for_fq(solr_f_prefix, act_field_fq)
                                    except Exception:
                                        print('Predicate Parent exception: ' + str(parents))
                            elif entity.item_type == 'uri':
                                act_field_fq = SolrDocument.ROOT_LINK_DATA_SOLR
                            elif entity.item_type == 'predicates':
                                temp_field_fq = self.get_parent_item_type_facet_field(entity.uri)
                                lr = LinkRecursion()
                                parents = lr.get_jsonldish_entity_parents(entity.uri)
                                if len(parents) > 1:
                                    try:
                                        p_slug = parents[-2]['slug']
                                        temp_field_fq = p_slug.replace('-', '_') + '___pred_id'
                                    except Exception:
                                        print('Predicate Parent exception: ' + str(parents))
                                        temp_field_fq = False
                                if temp_field_fq is not False:
                                    act_field_fq = temp_field_fq
                                else:
                                    act_field_fq = SolrDocument.ROOT_PREDICATE_SOLR
                            else:
                                act_field_fq = SolrDocument.ROOT_PREDICATE_SOLR
                        # ---------------------------------------------------
                        # THIS PART BUILDS THE FACET-QUERY
                        # fq_path_term = fq_field + ':' + self.make_solr_value_from_entity(entity)
                        # The below is a bit of a hack. We should have a query field
                        # as with ___pred_ to query just the slug. But this works for now.
                        fq_field = act_field_fq + '_fq'
                        if path_list_len >= 2 and act_field_data_type == 'id':
                            # Could be an object deeper in the hierarchy, so allow the obj_all version.
                            fq_path_term = '(' + fq_field + ':' + prop_slug
                            fq_path_term += ' OR obj_all___' + fq_field + ':' + prop_slug + ')'
                        else:
                            fq_path_term = fq_field + ':' + prop_slug
                        fq_path_terms.append(fq_path_term)
                        # ---------------------------------------------------
                        #
                        # ---------------------------------------------------
                        # THIS PART PREPARES FOR LOOPING OR FINAL FACET-FIELDS
                        #
                        # print('pred-solr-slug: ' + predicate_solr_slug)
                        field_parts = self.make_prop_solr_field_parts(entity)
                        act_field_data_type = field_parts['suffix']
                        if require_id_field:
                            act_field_data_type = 'id'
                            field_parts['suffix'] = 'id'
                        # Check if the last or penultimate field has
                        # a different data-type (for linked-data).
                        if i >= (path_list_len - 2) \
                           and l_prop_entity:
                            dtypes = self.s_cache.get_dtypes(entity.uri)
                            if isinstance(dtypes, list):
                                # Set the data type and the act-field.
                                act_field_data_type = self.get_solr_field_type(dtypes[0])
                        if not predicate_solr_slug or pred_prop_entity:
                            act_field_fq = field_parts['prefix'] + '___pred_' + field_parts['suffix']
                            act_field_fq = self.correct_solr_prefix_for_fq(solr_f_prefix, act_field_fq)
                            # Get a facet on this field.
                            if act_field_data_type != 'string':
                                # Adds a prefix for related properties.
                                ffield = solr_f_prefix + field_parts['prefix'] + '___pred_' + field_parts['suffix']
                                if ffield not in query_dict['facet.field'] and \
                                   i >= (path_list_len - 1):
                                    query_dict['facet.field'].append(ffield)
                        else:
                            if act_field_data_type == 'id':
                                act_field_fq = 'obj_all___' + predicate_solr_slug \
                                    + '___pred_' + field_parts['suffix']
                                # Get a facet on this field.
                                if predicate_solr_slug != field_parts['prefix']:
                                    # The predicate_solr_slug is not the
                                    # prefix of the current field part, meaning
                                    # the field_parts['prefix'] is the type, and
                                    # we want facets for the predicate -> type.
                                    ffield = field_parts['prefix'] \
                                        + '___' \
                                        + predicate_solr_slug \
                                        + '___pred_' + field_parts['suffix']
                                else:
                                    # Get facets for the predicate.
                                    ffield = field_parts['prefix'] \
                                        + '___pred_' \
                                        + field_parts['suffix']
                                # Adds a prefix, in case of a related property.
                                ffield = solr_f_prefix + ffield
                                if ffield not in query_dict['facet.field'] \
                                   and i >= (path_list_len - 1):
                                    query_dict['facet.field'].append(ffield)
                            else:
                                act_field_fq = predicate_solr_slug + '___pred_' + field_parts['suffix']
                        # -------------------------------------------
                        if act_field_data_type == 'numeric':
                            # print('Numeric field: ' + act_field)
                            act_field_fq = field_parts['prefix'] + '___pred_numeric'
                            act_field_fq = self.correct_solr_prefix_for_fq(solr_f_prefix, act_field_fq)
                            query_dict = self.add_math_facet_ranges(query_dict,
                                                                    act_field_fq,
                                                                    entity)
                        elif act_field_data_type == 'date':
                            # print('Date field: ' + act_field)
                            act_field_fq = field_parts['prefix'] + '___pred_date'
                            act_field_fq = self.correct_solr_prefix_for_fq(solr_f_prefix, act_field_fq)
                            query_dict = self.add_date_facet_ranges(query_dict,
                                                                    act_field_fq,
                                                                    entity)
                        # print('Current data type (' + str(i) + '): ' + act_field_data_type)
                        # print('Current field (' + str(i) + '): ' + act_field_fq)
                        i += 1
                elif act_field_data_type == 'string':
                    # Case for a text search.
                    # last_field_label = False  # turn off using the field label for highlighting
                    string_terms = self.prep_string_search_term(prop_slug)
                    for escaped_term in string_terms:
                        search_term = act_field_fq + ':' + escaped_term
                        if last_field_label is False:
                            query_dict['hl-queries'].append(escaped_term)
                        else:
                            query_dict['hl-queries'].append(last_field_label + ' ' + escaped_term)
                        fq_path_terms.append(search_term)
                elif act_field_data_type == 'numeric':
                    # Numeric search. Assume it's a well-formed Solr numeric request.
                    search_term = act_field_fq + ':' + prop_slug
                    fq_path_terms.append(search_term)
                    # Now limit the numeric ranges from the query to the range facets.
                    query_dict = self.add_math_facet_ranges(query_dict,
                                                            act_field_fq,
                                                            False,
                                                            prop_slug)
                elif act_field_data_type == 'date':
                    # Date search. Assume it's a well-formed Solr request.
                    search_term = act_field_fq + ':' + prop_slug
                    fq_path_terms.append(search_term)
                    # Now limit the date ranges from the query to the range facets.
                    query_dict = self.add_date_facet_ranges(query_dict,
                                                            act_field_fq,
                                                            False,
                                                            prop_slug)
            final_path_term = ' AND '.join(fq_path_terms)
            final_path_term = '(' + final_path_term + ')'
            fq_terms.append(final_path_term)
        fq_final = ' OR '.join(fq_terms)
        fq_final = '(' + fq_final + ')'
        query_dict['fq'].append(fq_final)
        return query_dict

    def add_math_facet_ranges(self, query_dict, act_field,
                              entity=False, solr_query=False):
        """ This does some math for facet ranges for numeric fields """
        ok = False
        groups = self.histogram_groups
        fstart = 'f.' + act_field + '.facet.range.start'
        fend = 'f.' + act_field + '.facet.range.end'
        fgap = 'f.' + act_field + '.facet.range.gap'
        findex = 'f.' + act_field + '.facet.sort'
        fother = 'f.' + act_field + '.facet.range.other'
        finclude = 'f.' + act_field + '.facet.range.include'
        if entity is not False:
            # This is a field with no value limits,
            # so we need to do a stats prequery first.
            query_dict['prequery-stats'].append(act_field)
        else:
            if solr_query is not False:
                vals = []
                # Get the numbers out.
                q_nums_strs = re.findall(r'[-+]?\d*\.\d+|\d+', solr_query)
                for q_num_str in q_nums_strs:
                    vals.append(float(q_num_str))
                vals.sort()
                if len(vals) > 1:
                    ok = True
                    min_val = vals[0]
                    max_val = vals[-1]
        if ok:
            if act_field not in query_dict['stats.field']:
                query_dict['stats.field'].append(act_field)
            if act_field not in query_dict['facet.range']:
                query_dict['facet.range'].append(act_field)
            query_dict['ranges'][fother] = 'all'
            query_dict['ranges'][finclude] = 'all'
            query_dict['ranges'][fstart] = min_val
            query_dict['ranges'][fend] = max_val
            query_dict['ranges'][fgap] = (max_val - min_val) / groups
            query_dict['ranges'][findex] = 'index'  # sort by index, not by count
        return query_dict
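
    # Illustrative sketch (values invented; histogram_groups assumed to
    # be 10 here): for a numeric range query like '[0 TO 100]',
    # add_math_facet_ranges would add Solr range params roughly like:
    #
    #   f.<field>.facet.range.start = 0.0
    #   f.<field>.facet.range.end   = 100.0
    #   f.<field>.facet.range.gap   = 10.0
    #   f.<field>.facet.sort        = index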

    def add_date_facet_ranges(self, query_dict, act_field,
                              entity=False, solr_query=False):
        """ This does some math for facet ranges for date fields """
        ok = False
        groups = 4
        fstart = 'f.' + act_field + '.facet.range.start'
        fend = 'f.' + act_field + '.facet.range.end'
        fgap = 'f.' + act_field + '.facet.range.gap'
        findex = 'f.' + act_field + '.facet.sort'
        fother = 'f.' + act_field + '.facet.range.other'
        finclude = 'f.' + act_field + '.facet.range.include'
        if entity is not False:
            # This is a field with no value limits,
            # so we need to do a stats prequery first.
            query_dict['prequery-stats'].append(act_field)
        else:
            if solr_query is not False:
                q_dt_strs = re.findall(r'\d{4}-\d{2}-\d{2}[T:]\d{2}:\d{2}:\d{2}',
                                       solr_query)
                if len(q_dt_strs) < 2:
                    # Try a less strict regular expression to get dates.
                    q_dt_strs = re.findall(r'\d{4}-\d{2}-\d{2}', solr_query)
                if len(q_dt_strs) >= 2:
                    ok = True
                    vals = []
                    for q_dt_str in q_dt_strs:
                        vals.append(q_dt_str)
                    vals.sort()
                    min_val = vals[0]
                    # Use the last (largest) value, not vals[1], in case
                    # more than two dates were found in the query.
                    max_val = vals[-1]
        if ok:
            if act_field not in query_dict['stats.field']:
                query_dict['stats.field'].append(act_field)
            if act_field not in query_dict['facet.range']:
                query_dict['facet.range'].append(act_field)
            query_dict['ranges'][fother] = 'all'
            query_dict['ranges'][finclude] = 'all'
            query_dict['ranges'][fstart] = self.convert_date_to_solr_date(min_val)
            query_dict['ranges'][fend] = self.convert_date_to_solr_date(max_val)
            query_dict['ranges'][fgap] = self.get_date_difference_for_solr(min_val,
                                                                           max_val,
                                                                           groups)
            query_dict['ranges'][findex] = 'index'  # sort by index, not by count
        return query_dict

    def get_date_difference_for_solr(self, min_date, max_date, groups):
        """ Gets a solr date difference from two values """
        min_dt = self.date_convert(min_date)
        max_dt = self.date_convert(max_date)
        dif_dt = (max_dt - min_dt) / groups
        if dif_dt.days >= 366:
            solr_val = int(round((dif_dt.days / 365.25), 0))
            solr_dif = '+' + str(solr_val) + 'YEAR'
        elif dif_dt.days >= 31:
            solr_val = int(round((dif_dt.days / 30), 0))
            solr_dif = '+' + str(solr_val) + 'MONTH'
        elif dif_dt.days >= 1:
            solr_val = int(round(dif_dt.days, 0))
            solr_dif = '+' + str(solr_val) + 'DAY'
        elif (dif_dt.seconds // 3600) >= 1:
            solr_val = int(round((dif_dt.seconds // 3600), 0))
            solr_dif = '+' + str(solr_val) + 'HOUR'
        elif ((dif_dt.seconds % 3600) // 60) >= 1:
            solr_val = int(round(((dif_dt.seconds % 3600) // 60), 0))
            solr_dif = '+' + str(solr_val) + 'MINUTE'
        elif dif_dt.seconds >= 1:
            solr_val = int(round(dif_dt.seconds, 0))
            solr_dif = '+' + str(solr_val) + 'SECOND'
        else:
            solr_dif = '+1YEAR'
        return solr_dif

    def add_solr_gap_to_date(self, date_val, solr_gap):
        """ Adds a solr gap to a date_val """
        solr_val = re.sub(r'[^\d.]', r'', solr_gap)
        solr_val = int(float(solr_val))
        dt = self.date_convert(date_val)
        if 'YEAR' in solr_gap:
            dt = dt + datetime.timedelta(days=int(round((solr_val * 365.25), 0)))
        elif 'MONTH' in solr_gap:
            dt = dt + datetime.timedelta(days=(solr_val * 30))
        elif 'DAY' in solr_gap:
            dt = dt + datetime.timedelta(days=solr_val)
        elif 'HOUR' in solr_gap:
            dt = dt + datetime.timedelta(hours=solr_val)
        elif 'MINUTE' in solr_gap:
            dt = dt + datetime.timedelta(minutes=solr_val)
        elif 'SECOND' in solr_gap:
            dt = dt + datetime.timedelta(seconds=solr_val)
        return dt

    def convert_date_to_solr_date(self, date_val):
        """ Converts a string for a date into a Solr-formatted datetime string """
        dt = self.date_convert(date_val)
        return dt.strftime('%Y-%m-%dT%H:%M:%SZ')

    def make_human_readable_date(self, date_val):
        """ Converts a date value into something easier to read """
        dt = self.date_convert(date_val)
        check_date = dt.strftime('%Y-%m-%d')
        # Compare against midnight of the same day; if equal, the time
        # part carries no information and the short form suffices.
        # (The original compared date_val against itself, which was
        # always True.)
        check_dt = self.date_convert(check_date + 'T00:00:00')
        if check_dt == dt:
            return check_date
        else:
            return dt.strftime('%Y-%m-%d:%H:%M:%S')

    def date_convert(self, date_val):
        """ Converts to a Python datetime if not already so """
        if isinstance(date_val, str):
            date_val = date_val.replace('Z', '')
            dt = datetime.datetime.strptime(date_val, '%Y-%m-%dT%H:%M:%S')
        else:
            dt = date_val
        return dt
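
    # Illustrative sketch of the gap math above (dates invented): a
    # four-group histogram over a ten-year span lands in the YEAR branch:
    #
    #   qm.get_date_difference_for_solr('2000-01-01T00:00:00',
    #                                   '2010-01-01T00:00:00', 4)
    #   # -> '+2YEAR'  (3653 days / 4 = 913 days; 913 / 365.25 rounds to 2)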

    def get_parent_item_type_facet_field(self, category_uri):
        """ Gets the parent facet field for a given category_uri. This
            assumes the category_uri is an entity that exists in the database.
        """
        output = False
        lr = LinkRecursion()
        parents = lr.get_jsonldish_entity_parents(category_uri)
        for par in parents:
            if par['slug'] in self.TYPE_MAPPINGS.values():
                # The parent exists in the Type Mappings.
                output = par['slug'].replace('-', '_') + '___pred_id'
                break
        return output

    def get_parent_entity_facet_field(self, entity_uri):
        """ Gets the parent facet field for a given entity_uri. This
            assumes the entity_uri is an entity that exists in the database.
        """
        output = False
        lr = LinkRecursion()
        parents = lr.get_jsonldish_entity_parents(entity_uri)
        if isinstance(parents, list):
            if len(parents) > 1:
                # Get the penultimate field.
                output = parents[-2]['slug'].replace('-', '_') + '___pred_id'
        return output

    def process_item_type(self, raw_item_type):
        """ Processes an 'item-type' parameter into Solr fq and facet fields """
        query_dict = {'fq': [], 'facet.field': []}
        fq_terms = []
        item_type_lists = self.expand_hierarchy_options(raw_item_type)
        for item_type_list in item_type_lists:
            # No hierarchy in this field, just the type.
            item_type = item_type_list[0]
            fq_term = 'item_type:' + item_type
            fq_terms.append(fq_term)
            if item_type in self.TYPE_MAPPINGS:
                act_field = self.TYPE_MAPPINGS[item_type].replace('-', '_') + '___pred_id'
                query_dict['facet.field'].append(act_field)
        fq_final = ' OR '.join(fq_terms)
        fq_final = '(' + fq_final + ')'
        query_dict['fq'].append(fq_final)
        return query_dict

    def process_id(self, identifier):
        """ Makes a filter query to check for an identifier
            (URI, DOI, ARK, ORCID, or UUID).
        """
        query_dict = {'fq': [], 'facet.field': []}
        fq_terms = []
        id_list = [identifier]
        id_list = self.make_http_https_options(id_list)
        for act_id in id_list:
            escape_id = self.escape_solr_arg(act_id)
            fq_terms.append('persistent_uri:' + escape_id)
            fq_terms.append('uuid:' + escape_id)
        # Now make URIs in case we have a naked identifier.
        prefix_removes = [
            'doi:',
            'orcid:',
            'http://dx.doi.org/',
            'https://dx.doi.org/',
            'http://doi.org/',
            'https://doi.org/'
        ]
        for prefix in prefix_removes:
            # Strip ID prefixes, case insensitive.
            re_gone = re.compile(re.escape(prefix), re.IGNORECASE)
            identifier = re_gone.sub('', identifier)
        uris = [
            'http://dx.doi.org/' + identifier,  # DOI (old)
            'http://doi.org/' + identifier,  # DOI (new)
            'http://n2t.net/' + identifier,  # ARK (CDL / Merritt)
            'http://orcid.org/' + identifier  # ORCID (people)
        ]
        # Now make https / http variants of the URIs.
        uris = self.make_http_https_options(uris)
        for uri in uris:
            # Now make a DOI URI in case this is just a naked DOI.
            escaped_uri = self.escape_solr_arg(uri)
            fq_terms.append('persistent_uri:' + escaped_uri)
        tcheck = URImanagement.get_uuid_from_oc_uri(identifier, True)
        if tcheck is not False:
            uuid = tcheck['uuid']
            fq_terms.append('uuid:' + uuid)
        fq_final = ' OR '.join(fq_terms)
        fq_final = '(' + fq_final + ')'
        query_dict['fq'].append(fq_final)
        # print(fq_final)
        return query_dict

    def process_form_use_life_chrono(self, raw_form_use_life_chrono):
        # Creates a facet query for form-use-life chronological tiles.
        # Supports OR ('||') queries in the path also.
        query_dict = {'fq': [], 'facet.field': []}
        fq_terms = []
        query_dict['facet.field'].append('form_use_life_chrono_tile')
        if '||' in raw_form_use_life_chrono:
            chrono_paths = raw_form_use_life_chrono.split('||')
        else:
            chrono_paths = [raw_form_use_life_chrono]
        for chrono_path in chrono_paths:
            if len(chrono_path) < 30:
                chrono_path += '*'
            fq_term = 'form_use_life_chrono_tile:' + chrono_path
            fq_terms.append(fq_term)
        fq_final = ' OR '.join(fq_terms)
        fq_final = '(' + fq_final + ')'
        query_dict['fq'].append(fq_final)
        return query_dict
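
    # Illustrative sketch (tile value invented): chrono tile paths
    # shorter than 30 characters get a trailing wildcard, so a coarse
    # tile prefix matches all of its finer child tiles:
    #
    #   qm.process_form_use_life_chrono('10M-2222')
    #   # -> {'fq': ['(form_use_life_chrono_tile:10M-2222*)'],
    #   #     'facet.field': ['form_use_life_chrono_tile']}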

    def process_form_date_chrono(self, form_use_life_date, date_type):
        # Creates a facet query for form-use-life dates.
        query_dict = {'fq': [], 'facet.field': []}
        if date_type == 'start':
            qterm = '[' + str(form_use_life_date) + ' TO *]'
            fquery = 'form_use_life_chrono_earliest: ' + qterm
        else:
            qterm = '[* TO ' + str(form_use_life_date) + ']'
            fquery = 'form_use_life_chrono_latest: ' + qterm
        query_dict['fq'].append(fquery)
        return query_dict

    def process_discovery_geo(self, raw_disc_geo):
        # Creates a facet query for discovery geotiles.
        # Supports OR ('||') queries in the path also.
        query_dict = {'fq': [], 'facet.field': []}
        fq_terms = []
        query_dict['facet.field'].append('discovery_geotile')
        if '||' in raw_disc_geo:
            disc_geo_paths = raw_disc_geo.split('||')
        else:
            disc_geo_paths = [raw_disc_geo]
        for disc_path in disc_geo_paths:
            if len(disc_path) < 20:
                disc_path += '*'
            fq_term = 'discovery_geotile:' + disc_path
            fq_terms.append(fq_term)
        fq_final = ' OR '.join(fq_terms)
        fq_final = '(' + fq_final + ')'
        query_dict['fq'].append(fq_final)
        return query_dict

    def process_discovery_bbox(self, raw_disc_bbox):
        # Creates a facet query for bounding-box searches.
        # Supports OR ('||') queries.
        query_dict = {'fq': []}
        fq_terms = []
        if '||' in raw_disc_bbox:
            bbox_list = raw_disc_bbox.split('||')
        else:
            bbox_list = [raw_disc_bbox]
        for bbox in bbox_list:
            if ',' in bbox:
                # Comma separated list of coordinates.
                bbox_coors = bbox.split(',')
                bbox_valid = self.validate_bbox_coordinates(bbox_coors)
                if bbox_valid:
                    # Valid bounding box, now make a Solr query.
                    # Note how Solr expects latitude / longitude order, which
                    # is the reverse of GeoJSON!
                    fq_term = 'discovery_geolocation:'
                    fq_term += '[' + str(bbox_coors[1]) + ',' + str(bbox_coors[0])
                    fq_term += ' TO ' + str(bbox_coors[3]) + ',' + str(bbox_coors[2])
                    fq_term += ']'
                    fq_terms.append(fq_term)
        if len(fq_terms) > 0:
            fq_final = ' OR '.join(fq_terms)
            fq_final = '(' + fq_final + ')'
            query_dict['fq'].append(fq_final)
        return query_dict

    def validate_bbox_coordinates(self, bbox_coors):
        """ Validates a set of bounding box coordinates """
        is_valid = False
        if len(bbox_coors) == 4:
            lower_left_valid = self.validate_geo_lon_lat(bbox_coors[0], bbox_coors[1])
            top_right_valid = self.validate_geo_lon_lat(bbox_coors[2], bbox_coors[3])
            # print('ok: ' + str(lower_left_valid) + ' ' + str(top_right_valid))
            if lower_left_valid and top_right_valid:
                if float(bbox_coors[0]) < float(bbox_coors[2]) and\
                   float(bbox_coors[1]) < float(bbox_coors[3]):
                    is_valid = True
        return is_valid

    def validate_geo_lon_lat(self, lon, lat):
        """ Checks to see if a lon, lat pair are valid.
            Note the GeoJSON ordering of the coordinates.
        """
        is_valid = False
        lon_valid = self.validate_geo_coordinate(lon, 'lon')
        lat_valid = self.validate_geo_coordinate(lat, 'lat')
        if lon_valid and lat_valid:
            is_valid = True
        return is_valid
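
    # Illustrative sketch: the GeoJSON-style bbox order is
    # (west-lon, south-lat, east-lon, north-lat), while the Solr range
    # is latitude-first, so:
    #
    #   qm.process_discovery_bbox('-10,40,5,50')
    #   # -> {'fq': ['(discovery_geolocation:[40,-10 TO 50,5])']}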

    def validate_geo_coordinate(self, coordinate, coord_type):
        """ Validates a geo-spatial coordinate """
        is_valid = False
        try:
            fl_coord = float(coordinate)
        except ValueError:
            fl_coord = False
        if fl_coord is not False:
            if 'lat' in coord_type:
                if fl_coord <= 90 and\
                   fl_coord >= -90:
                    is_valid = True
            elif 'lon' in coord_type:
                if fl_coord <= 180 and\
                   fl_coord >= -180:
                    is_valid = True
        return is_valid

    def make_solr_value_from_entity(self, entity, value_type='id'):
        """ Makes a solr value as indexed in SolrDocument,
            see _concat_solr_string_value
        """
        id_part = entity.uri
        if 'http://opencontext.org' in entity.uri:
            if '/vocabularies/' not in entity.uri:
                id_part = entity.uri.split('http://opencontext.org')[1]
        return entity.slug + '___' + value_type + '___' + \
            id_part + '___' + entity.label

    def _process_spatial_context(self, spatial_context=None):
        """ Processes a spatial context path into Solr fq and facet.field parameters """
        context = {}
        if spatial_context:
            context_paths = self._get_context_paths(spatial_context)
            context_slugs = self._get_valid_context_slugs(context_paths)
            # print('Context slugs: ' + str(context_slugs))
            # If we cannot find a valid context, raise a 404.
            if not context_slugs:
                raise Http404
            # Solr 'fq' parameters
            parent_child_slugs = []
            # Solr 'facet.field' parameters
            facet_field = []
            for slug in context_slugs:
                # fq parameters
                parent_child_slugs.append(self._get_parent_slug(slug) + '___' + slug)
                # facet.field parameters
                facet_field.append(slug.replace('-', '_') + '___context_id')
            # First, handle the most likely scenario of a single context.
            if len(parent_child_slugs) == 1:
                context['fq'] = self._prepare_filter_query(parent_child_slugs[0])
            # Otherwise, combine multiple contexts into an OR filter.
            else:
                fq_string = ' OR '.join(
                    (self._prepare_filter_query(slug_set)
                     for slug_set in parent_child_slugs)
                )
                context['fq'] = '(' + fq_string + ')'
            context['facet.field'] = facet_field
        # No spatial context provided.
        else:
            context['fq'] = None
            context['facet.field'] = ['root___context_id']
        return context

    def prep_string_search_term(self, raw_term):
        """ Prepares a string search.
            Returns a list of search terms for AND queries.
        """
        if '"' in raw_term:
            nq_term = raw_term.replace('"', ' ')  # get rid of quotes in the search term
            quoted_list = re.findall(r"\"(.*?)\"", raw_term)
            terms = []
            terms.append(self.escape_solr_arg(nq_term))
            for quote_item in quoted_list:
                quote_item = self.escape_solr_arg(quote_item)  # escape characters
                quote_item = '"' + quote_item + '"'  # put quotes back around it
                terms.append(quote_item)
        else:
            terms = []
            terms.append(self.escape_solr_arg(raw_term))
        return terms

    def make_http_https_options(self, terms):
        """ Checks a list of terms for http:// or https:// strings;
            if those exist, then add the alternative to the list.
        """
        output_terms = terms
        if isinstance(terms, list):
            output_terms = []
            for term in terms:
                output_terms.append(term)
                if isinstance(term, str):
                    if 'http://' in term:
                        new_term = term.replace('http://', 'https://')
                    elif 'https://' in term:
                        new_term = term.replace('https://', 'http://')
                    else:
                        new_term = None
                    if new_term is not None:
                        output_terms.append(new_term)
        else:
            output_terms = terms
        return output_terms

    def escaped_seq(self, term):
        """ Yield the next string based on the next character
            (either this char or its escaped version).
        """
        escaperules = {'+': r'\+',
                       '-': r'\-',
                       '&': r'\&',
                       '|': r'\|',
                       '!': r'\!',
                       '(': r'\(',
                       ')': r'\)',
                       '{': r'\{',
                       '}': r'\}',
                       '[': r'\[',
                       ']': r'\]',
                       '^': r'\^',
                       '~': r'\~',
                       '*': r'\*',
                       '?': r'\?',
                       ':': r'\:',
                       '"': r'\"',
                       ';': r'\;',
                       ' ': r'\ '}
        for char in term:
            if char in escaperules:
                yield escaperules[char]
            else:
                yield char
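
    # Illustrative sketch (the ORCID below is the standard sample value,
    # not real data): make_http_https_options keeps the original term and
    # appends its protocol twin:
    #
    #   qm.make_http_https_options(['http://orcid.org/0000-0002-1825-0097'])
    #   # -> ['http://orcid.org/0000-0002-1825-0097',
    #   #     'https://orcid.org/0000-0002-1825-0097']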

    def escape_solr_arg(self, term):
        """ Apply escaping to the passed-in query terms,
            escaping special characters like :, (, ), etc.
        """
        term = term.replace('\\', r'\\')  # escape \ first
        return "".join([next_str for next_str in self.escaped_seq(term)])
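
# Illustrative usage sketch for the escaping methods above (the class
# name 'QueryMaker' is a guess for illustration; substitute the actual
# class defined above):
#
#   qm = QueryMaker()
#   qm.escape_solr_arg('dog:canine (domestic)')
#   # -> 'dog\:canine\ \(domestic\)'
#
# Every Solr special character, including spaces, is backslash-escaped
# so user input cannot alter the query structure.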

class LinkRecursion():
    """
    Does recursive look-ups on link annotations,
    especially to find hierarchies.

    from opencontext_py.apps.ldata.linkannotations.recursion import LinkRecursion
    lr = LinkRecursion()
    lr.get_jsonldish_entity_parents('oc-gen:cat-bio-subj-ecofact')
    lr = LinkRecursion()
    lr.get_jsonldish_entity_parents('oc-gen:cat-arch-element')
    lr = LinkRecursion()
    lr.get_jsonldish_entity_parents('http://eol.org/pages/7680')
    lr = LinkRecursion()
    lr.get_entity_children('http://eol.org/pages/4077', True)
    """

    def __init__(self):
        self.m_cache = MemoryCache()
        self.parent_entities = None
        self.child_entities = None
        # Cache prefix for the json-ldish parents.
        self.jsonldish_p_prefix = 'json-ldish-parents-{}'
        # Cache prefix for a list of parents.
        self.p_prefix = 'lr-parents'
        # Cache prefix for the children of an item.
        self.children_prefix = 'lr-children-{}'
        # Cache prefix for the full tree of child items.
        self.child_tree_prefix = 'lr-child-tree-{}'

    def get_jsonldish_entity_parents(self, identifier, add_original=True):
        """ Gets parent concepts for a given URI or UUID identified entity.
            Returns a list of dictionary objects similar to JSON-LD expectations.
            This is useful for faceted search.

            If add_original is True, add the original UUID for the entity
            that's the childmost item, at the bottom of the hierarchy.
        """
        cache_key = self.m_cache.make_cache_key(
            self.jsonldish_p_prefix.format(str(add_original)),
            identifier
        )
        obj = self.m_cache.get_cache_object(cache_key)
        if obj is not None:
            return obj
        # We don't have it cached, so get it from the database.
        obj = self._get_jsonldish_entity_parents_db(
            identifier,
            add_original
        )
        if obj:
            self.m_cache.save_cache_object(cache_key, obj)
        return obj
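
    # Illustrative sketch of the return shape (URIs, slugs, and labels
    # invented; keys match the code below):
    #
    #   lr.get_jsonldish_entity_parents('oc-gen:cat-arch-element')
    #   # -> [{'id': 'http://...top-concept...', 'slug': 'oc-gen-cat-object',
    #   #      'label': 'Object', 'type': '@id', 'ld_object_ok': True},
    #   #     ...,
    #   #     {...the childmost item last...}]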

    def _get_jsonldish_entity_parents_db(self, identifier, add_original=True):
        """ Gets parent concepts for a given URI or UUID identified entity.
            Returns a list of dictionary objects similar to JSON-LD expectations.
            This is useful for faceted search.

            If add_original is True, add the original UUID for the entity
            that's the childmost item, at the bottom of the hierarchy.
        """
        output = False
        if add_original:
            # Add the original identifier to the list of parents, at the lowest rank.
            raw_parents = (
                [identifier]
                + self.get_entity_parents(identifier, [], 0)
            )
        else:
            raw_parents = self.get_entity_parents(
                identifier,
                [],
                0
            )
        if not len(raw_parents):
            # No parents. Returns False.
            return output
        # Make the output.
        # Reverse the order of the list, to make the topmost concept first.
        output = []
        for par_id in raw_parents[::-1]:
            # print('par_id is: ' + par_id)
            ent = self.m_cache.get_entity(par_id)
            if not ent:
                continue
            p_item = LastUpdatedOrderedDict()
            p_item['id'] = ent.uri
            p_item['slug'] = ent.slug
            p_item['label'] = ent.label
            if ent.data_type is not False:
                p_item['type'] = ent.data_type
            else:
                p_item['type'] = '@id'
            p_item['ld_object_ok'] = ent.ld_object_ok
            output.append(p_item)
        return output

    def get_entity_parents(self, identifier, parent_list=None, loop_count=0):
        """ Gets parent concepts for a given URI or UUID identified entity """
        if not parent_list:
            parent_list = []
        loop_count += 1
        parent_id = self._get_parent_id(identifier)
        # print('ID: {} has parent: {}'.format(identifier, parent_id))
        if parent_id:
            if parent_id not in parent_list:
                parent_list.append(parent_id)
            # print('Parent list is: ' + str(parent_list))
            if loop_count <= 50:
                parent_list = self.get_entity_parents(parent_id,
                                                      parent_list,
                                                      loop_count)
        else:
            # All done, save the parents.
            self.parent_entities = parent_list
        return parent_list

    def _get_parent_id(self, identifier):
        """ Gets the parent id for the current identifier, from the cache or the database """
        cache_key = self.m_cache.make_cache_key(self.p_prefix, identifier)
        obj = self.m_cache.get_cache_object(cache_key)
        if obj is not None:
            return obj
        else:
            obj = self._get_parent_id_db(identifier)
            if obj:
                self.m_cache.save_cache_object(cache_key, obj)
            return obj

    def _get_parent_id_db(self, identifier):
        """ Gets the parent id for the current identifier """
        parent_id = None
        lequiv = LinkEquivalence()
        identifiers = lequiv.get_identifier_list_variants(identifier)
        # print('identifiers: {}'.format(identifiers))
        p_for_superobjs = LinkAnnotation.PREDS_SBJ_IS_SUB_OF_OBJ
        preds_for_superobjs = lequiv.get_identifier_list_variants(p_for_superobjs)
        p_for_subobjs = LinkAnnotation.PREDS_SBJ_IS_SUPER_OF_OBJ
        preds_for_subobjs = lequiv.get_identifier_list_variants(p_for_subobjs)
        try:
            # Look for superior items in the objects of the assertion,
            # sorting by sort so we can privilege a certain hierarchy path.
            superobjs_anno = LinkAnnotation.objects.filter(subject__in=identifiers,
                                                           predicate_uri__in=preds_for_superobjs)\
                                                   .exclude(object_uri__in=identifiers)\
                                                   .order_by('sort', 'object_uri')[:1]
            if len(superobjs_anno) < 1:
                superobjs_anno = False
        except LinkAnnotation.DoesNotExist:
            superobjs_anno = False
        if superobjs_anno:
            parent_id = superobjs_anno[0].object_uri
            # print('Subject {} is child of {}'.format(identifiers, parent_id))
            oc_uuid = URImanagement.get_uuid_from_oc_uri(parent_id)
            if oc_uuid:
                parent_id = oc_uuid
        try:
            # Now look for superior entities in the subject, not the object,
            # sorting by sort so we can privilege a certain hierarchy path.
            supersubj_anno = LinkAnnotation.objects.filter(object_uri__in=identifiers,
                                                           predicate_uri__in=preds_for_subobjs)\
                                                   .exclude(subject__in=identifiers)\
                                                   .order_by('sort', 'subject')[:1]
            if len(supersubj_anno) < 1:
                supersubj_anno = False
        except LinkAnnotation.DoesNotExist:
            supersubj_anno = False
        if supersubj_anno:
            parent_id = supersubj_anno[0].subject
            # print('Subject {} is parent of {}'.format(parent_id, identifiers))
            oc_uuid = URImanagement.get_uuid_from_oc_uri(parent_id)
            if oc_uuid:
                parent_id = oc_uuid
        return parent_id
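
    # Note on the traversal above (illustrative): _get_parent_id() is
    # cached per identifier, and get_entity_parents() caps recursion at
    # 50 levels, so even a cyclic pair of annotations (A subordinate to
    # B, B subordinate to A) terminates instead of recursing forever.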

    def get_entity_children(self, identifier, recursive=True):
        """ Gets child concepts for an entity, from the cache or the database """
        cache_key = self.m_cache.make_cache_key(self.children_prefix.format(str(recursive)),
                                                identifier)
        tree_cache_key = self.m_cache.make_cache_key(self.child_tree_prefix.format(str(recursive)),
                                                     identifier)
        obj = self.m_cache.get_cache_object(cache_key)
        tree_obj = self.m_cache.get_cache_object(tree_cache_key)
        if obj is not None and tree_obj is not None:
            # print('Hit child cache on {}'.format(identifier))
            self.child_entities = tree_obj  # the full tree of child entities
            return obj
        else:
            obj = self._get_entity_children_db(identifier, recursive)
            if obj:
                # print('Hit child DB on {}'.format(identifier))
                self.m_cache.save_cache_object(cache_key, obj)
                self.m_cache.save_cache_object(tree_cache_key, self.child_entities)
            return obj

    def _get_entity_children_db(self, identifier, recursive=True):
        """ Gets child concepts for a given URI or UUID identified entity """
        if not self.child_entities:
            self.child_entities = LastUpdatedOrderedDict()
        if identifier in self.child_entities and recursive:
            output = self.child_entities[identifier]
        else:
            act_children = []
            p_for_superobjs = LinkAnnotation.PREDS_SBJ_IS_SUB_OF_OBJ
            p_for_subobjs = LinkAnnotation.PREDS_SBJ_IS_SUPER_OF_OBJ
            lequiv = LinkEquivalence()
            identifiers = lequiv.get_identifier_list_variants(identifier)
            try:
                # Look for child items in the objects of the assertion.
                subobjs_anno = LinkAnnotation.objects.filter(subject__in=identifiers,
                                                             predicate_uri__in=p_for_subobjs)
                if len(subobjs_anno) < 1:
                    subobjs_anno = False
            except LinkAnnotation.DoesNotExist:
                subobjs_anno = False
            if subobjs_anno is not False:
                for sub_obj in subobjs_anno:
                    child_id = sub_obj.object_uri
                    act_children.append(child_id)
            try:
                # Now look for subordinate entities in the subject, not the object.
                subsubj_anno = LinkAnnotation.objects.filter(object_uri__in=identifiers,
                                                             predicate_uri__in=p_for_superobjs)
                if len(subsubj_anno) < 1:
                    subsubj_anno = False
            except LinkAnnotation.DoesNotExist:
                subsubj_anno = False
            if subsubj_anno is not False:
                for sub_sub in subsubj_anno:
                    child_id = sub_sub.subject
                    act_children.append(child_id)
            if len(act_children) > 0:
                identifier_children = []
                for child_id in act_children:
                    if child_id.count('/') > 1:
                        oc_uuid = URImanagement.get_uuid_from_oc_uri(child_id)
                        if oc_uuid:
                            child_id = oc_uuid
                    identifier_children.append(child_id)
                    # Recursively get the children of the child.
                    if recursive:
                        self.get_entity_children(child_id, recursive)
                # Save the list of children of the current identified item.
                if identifier not in self.child_entities:
                    self.child_entities[identifier] = identifier_children
            else:
                # Save an empty list for the current identified item; it has no children.
                if identifier not in self.child_entities:
                    self.child_entities[identifier] = []
            output = self.child_entities[identifier]
        return output

    def get_pred_top_rank_types(self, predicate_uuid):
        """ Gets the top-ranked types (those not subordinate to any
            other type) for a predicate.
        """
        types = False
        try:
            pred_obj = Predicate.objects.get(uuid=predicate_uuid)
        except Predicate.DoesNotExist:
            pred_obj = False
        if pred_obj is not False:
            # print('found: ' + predicate_uuid)
            if pred_obj.data_type == 'id':
                types = []
                id_list = []
                pred_types = OCtype.objects\
                                   .filter(predicate_uuid=predicate_uuid)
                for p_type in pred_types:
                    type_pars = self.get_jsonldish_entity_parents(p_type.uuid)
                    self.parent_entities = []
                    self.loop_count = 0
                    if type_pars[0]['id'] not in id_list:
                        # So the top parent is only listed once.
                        id_list.append(type_pars[0]['id'])
                        types.append(type_pars[0])
        return types

    def get_entity(self, identifier):
        """ Gets an entity either from the cache or from
            database lookups. This is a wrapper for the
            MemoryCache().get_entity function.
        """
        return self.m_cache.get_entity(identifier)
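
    # Illustrative sketch (UUID invented): for a predicate whose values
    # are controlled types, get_pred_top_rank_types returns only the
    # topmost concept of each type's hierarchy, deduplicated by 'id':
    #
    #   lr = LinkRecursion()
    #   lr.get_pred_top_rank_types('a-predicate-uuid')
    #   # -> [{'id': ..., 'slug': ..., 'label': ..., 'type': ...,
    #   #      'ld_object_ok': ...}, ...]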