# Exemplo n.º 1
# 0
class SolrUUIDs():
    """ Makes lists of item UUIDs, URIs, or summary record dictionaries
        from a solr search result JSON document.

        Data that the solr index does not store (media thumbnails, media
        file links, string attribute values) is fetched from the database
        in bulk and merged into the output records.
    """

    def __init__(self, response_dict_json=False):
        # response_dict_json is a JSON-encoded string of the request
        # dict, or False; it is passed through to RecordProperties,
        # which json.loads it.
        rp = RootPath()
        self.base_url = rp.get_baseurl()
        self.uuids = []  # accumulated uuid strings
        self.uris = []  # accumulated uris or record dicts
        self.mem_cache_obj = MemoryCache()  # memory caching object
        self.response_dict_json = response_dict_json
        self.highlighting = False  # solr 'highlighting' dict, set from the solr response
        # make values to these fields "flat" not a list
        self.flatten_rec_fields = True
        self.total_found = False  # solr response.numFound
        self.rec_start = False  # solr response.start (paging offset)
        self.min_date = False  # query date constraint (lower bound)
        self.max_date = False  # query date constraint (upper bound)
        # flatten list of an attribute values to single value
        self.flatten_rec_attributes = False
        # A list of (non-standard) attributes to include in a record
        self.rec_attributes = []
        self.do_media_thumbs = True  # get thumbnails for records
        self.get_all_media = False  # get links to all media files for an item

    def make_uuids_from_solr(self, solr_json):
        """ Makes a flat list of uuid strings from a solr response. """
        # first do lots of checks to make sure the solr-json is OK
        solr_recs = self.extract_solr_recs(solr_json)
        if isinstance(solr_recs, list):
            for solr_rec in solr_recs:
                if 'uuid' in solr_rec:
                    uuid = solr_rec['uuid']
                    self.uuids.append(uuid)
        return self.uuids

    def make_uris_from_solr(self, solr_json, uris_only=True):
        """ Processes the solr_json into a list of item URIs
            (uris_only=True) or full record dictionaries (uris_only=False).
        """
        solr_recs = self.extract_solr_recs(solr_json)
        if isinstance(solr_recs, list):
            # decide whether thumbnail lookups are needed before the
            # bulk precache queries below
            if uris_only:
                self.do_media_thumbs = False
            if self.get_all_media:
                self.do_media_thumbs = False
            if 'thumbnail' in self.rec_attributes:
                self.do_media_thumbs = True
            # bulk-fetch data solr does not store, keyed by uuid
            thumbnail_data = self.get_media_thumbs(solr_recs)
            media_file_data = self.get_all_media_files(solr_recs)
            string_attrib_data = self.get_string_rec_attributes(solr_recs)
            for solr_rec in solr_recs:
                rec_props_obj = RecordProperties(self.response_dict_json)
                rec_props_obj.mem_cache_obj = self.mem_cache_obj
                rec_props_obj.min_date = self.min_date
                rec_props_obj.max_date = self.max_date
                rec_props_obj.highlighting = self.highlighting
                rec_props_obj.flatten_rec_attributes = self.flatten_rec_attributes
                rec_props_obj.rec_attributes = self.rec_attributes
                rec_props_obj.thumbnail_data = thumbnail_data
                rec_props_obj.media_file_data = media_file_data
                rec_props_obj.string_attrib_data = string_attrib_data
                item_ok = rec_props_obj.get_item_basics(solr_rec)
                if item_ok:
                    if uris_only:
                        item = rec_props_obj.uri
                    else:
                        rec_props_obj.parse_solr_record(solr_rec)
                        self.mem_cache_obj = rec_props_obj.mem_cache_obj  # add to existing list of entities, reduce lookups
                        item = self.make_item_dict_from_rec_props_obj(rec_props_obj)
                    self.uris.append(item)
        return self.uris

    def make_item_dict_from_rec_props_obj(self, rec_props_obj, cannonical=True):
        """ Makes an item dictionary object from a RecordProperties object.

            cannonical=True uses canonical URIs for project/context links;
            otherwise deployment-local hrefs are used.
            (NOTE: 'cannonical' spelling kept; it is the public parameter name.)
        """
        item = LastUpdatedOrderedDict()
        item['uri'] = rec_props_obj.uri
        if cannonical is False or 'href' in self.rec_attributes:
            item['href'] = rec_props_obj.href
        item['citation uri'] = rec_props_obj.cite_uri
        item['label'] = rec_props_obj.label
        item['project label'] = rec_props_obj.project_label
        if cannonical:
            item['project uri'] = rec_props_obj.project_uri
        else:
            item['project href'] = rec_props_obj.project_href
        item['context label'] = rec_props_obj.context_label
        if cannonical:
            item['context uri'] = rec_props_obj.context_uri
        else:
            item['context href'] = rec_props_obj.context_href
        item['latitude'] = rec_props_obj.latitude
        item['longitude'] = rec_props_obj.longitude
        item['early bce/ce'] = rec_props_obj.early_date
        item['late bce/ce'] = rec_props_obj.late_date
        item['item category'] = rec_props_obj.category
        # optional fields: only present when the record has them
        if rec_props_obj.snippet is not False:
            item['snippet'] = rec_props_obj.snippet
        if rec_props_obj.thumbnail_scr is not False:
            item['thumbnail'] = rec_props_obj.thumbnail_scr
        if rec_props_obj.preview_scr is not False:
            item['preview'] = rec_props_obj.preview_scr
        if rec_props_obj.fullfile_scr is not False:
            item['primary-file'] = rec_props_obj.fullfile_scr
        item['published'] = rec_props_obj.published
        item['updated'] = rec_props_obj.updated
        if isinstance(rec_props_obj.other_attributes, list):
            for attribute in rec_props_obj.other_attributes:
                prop_key = attribute['property']
                # rename the key (adds a '[n]' suffix) if it already exists
                prop_key = rec_props_obj.prevent_attribute_key_collision(item,
                                                                         prop_key)
                if self.flatten_rec_attributes:
                    if 'value' in attribute:
                        item[prop_key] = attribute['value']
                    elif 'values_list' in attribute:
                        item[prop_key] = RecordProperties.ATTRIBUTE_DELIM.join(attribute['values_list'])
                else:
                    item[prop_key] = attribute['values_list']
        return item

    def extract_solr_recs(self, solr_json):
        """ Extracts the solr docs list along with
            some basic metadata (numFound, start, highlighting)
            from solr_json. Returns False when docs are missing.
        """
        solr_recs = False
        if isinstance(solr_json, dict):
            try:
                self.total_found = solr_json['response']['numFound']
            except KeyError:
                self.total_found = False
            try:
                self.rec_start = solr_json['response']['start']
            except KeyError:
                self.rec_start = False
            try:
                self.highlighting = solr_json['highlighting']
            except KeyError:
                self.highlighting = False
            try:
                solr_recs = solr_json['response']['docs']
            except KeyError:
                solr_recs = False
        return solr_recs

    def get_media_thumbs(self, solr_recs):
        """ Gets thumbnail info for each record, keyed by uuid.

            Media items get their own thumbnail file; other item types
            get the thumbnail of an associated media item (looked up via
            raw SQL). Records without a thumbnail map to False.
        """
        thumb_results = {}
        not_media_uuids = []
        media_uuids = []
        rec_props_obj = RecordProperties(self.response_dict_json)
        for solr_rec in solr_recs:
            item = rec_props_obj.get_solr_record_uuid_type(solr_rec)
            if item is not False:
                uuid = item['uuid']
                if item['item_type'] != 'media':
                    not_media_uuids.append(uuid)
                else:
                    media_uuids.append(uuid)
                thumb_results[uuid] = False
        if len(not_media_uuids) > 0:
            if self.do_media_thumbs:
                # only get media_thumbnails if needed
                rows = self.get_thumbs_for_non_media(not_media_uuids)
                for row in rows:
                    uuid = row['uuid']
                    thumb_obj = {}
                    thumb_obj['href'] = self.base_url + '/media/' + row['media_uuid']
                    thumb_obj['uri'] = settings.CANONICAL_HOST + '/media/' + row['media_uuid']
                    thumb_obj['scr'] = row['file_uri']
                    # keep only the first thumbnail found per item
                    if thumb_results[uuid] is False:
                        thumb_results[uuid] = thumb_obj
        if len(media_uuids) > 0:
            thumbs = Mediafile.objects\
                              .filter(uuid__in=media_uuids,
                                      file_type='oc-gen:thumbnail')
            for thumb in thumbs:
                uuid = thumb.uuid
                thumb_obj = {}
                thumb_obj['href'] = self.base_url + '/media/' + thumb.uuid
                thumb_obj['uri'] = settings.CANONICAL_HOST + '/media/' + thumb.uuid
                thumb_obj['scr'] = thumb.file_uri
                thumb_results[uuid] = thumb_obj
        return thumb_results

    def get_all_media_files(self, solr_recs):
        """ Gets all media file links ({file_type: file_uri}) for media
            records, keyed by uuid. Only runs when self.get_all_media.
        """
        media_file_results = {}
        if self.get_all_media:
            media_uuids = []
            rec_props_obj = RecordProperties(self.response_dict_json)
            for solr_rec in solr_recs:
                item = rec_props_obj.get_solr_record_uuid_type(solr_rec)
                if item is not False:
                    uuid = item['uuid']
                    if item['item_type'] == 'media':
                        media_uuids.append(uuid)
                    media_file_results[uuid] = False
            if len(media_uuids) > 0:
                media_files = Mediafile.objects\
                                       .filter(uuid__in=media_uuids)
                for media_file in media_files:
                    uuid = media_file.uuid
                    if uuid not in media_file_results:
                        media_file_results[uuid] = {}
                    else:
                        if media_file_results[uuid] is False:
                            media_file_results[uuid] = {}
                    media_file_results[uuid][media_file.file_type] = media_file.file_uri
        return media_file_results

    def get_thumbs_for_non_media(self, uuid_list):
        """ Raw SQL lookup of thumbnail files for non-media items via
            their media-item assertions.
        """
        # NOTE(review): uuids are quoted and concatenated directly into
        # the SQL rather than passed as query parameters; this is safe
        # only while values come from solr/the database — confirm.
        q_uuids = self.make_query_uuids(uuid_list)
        query = ('SELECT ass.uuid AS uuid, m.file_uri AS file_uri, '
                 'm.uuid AS media_uuid '
                 'FROM oc_assertions AS ass '
                 'JOIN oc_mediafiles AS m ON ass.object_uuid = m.uuid '
                 'AND m.file_type=\'oc-gen:thumbnail\'  '
                 'WHERE ass.uuid IN (' + q_uuids + ') '
                 'GROUP BY ass.uuid,  m.file_uri, m.uuid; ')
        cursor = connection.cursor()
        cursor.execute(query)
        rows = self.dictfetchall(cursor)
        return rows

    def make_query_uuids(self, uuid_list):
        """ Makes a comma-separated, single-quoted string for a SQL
            'IN (...)' clause. Values are quoted but NOT escaped.
        """
        uuid_q = []
        for uuid in uuid_list:
            uuid = '\'' + uuid + '\''
            uuid_q.append(uuid)
        return ', '.join(uuid_q)

    def dictfetchall(self, cursor):
        """ Return all rows from a cursor as a dict """
        columns = [col[0] for col in cursor.description]
        return [
            dict(zip(columns, row))
            for row in cursor.fetchall()
        ]

    def get_string_rec_attributes(self, solr_recs):
        """ Gets string record attributes from the database.
            The solr index does not keep string-fields in memory,
            so requested xsd:string attributes need a database query.
        """
        output = {}
        str_attribs = {}
        for attribute in self.rec_attributes:
            entity = self.mem_cache_obj.get_entity(attribute, False)
            if entity is not False:
                prop_slug = entity.slug
                # check to make sure we have the entity data type for linked fields
                if entity.data_type is False and entity.item_type == 'uri':
                    dtypes = self.mem_cache_obj.get_dtypes(entity.uri)
                    if isinstance(dtypes, list):
                        # set the data type from the first dtype found
                        entity.data_type = dtypes[0]
                if entity.data_type == 'xsd:string':
                    str_attribs[attribute] = entity
        if len(str_attribs) > 0:
            uuid_list = []
            for solr_rec in solr_recs:
                if 'uuid' in solr_rec:
                    uuid = str(solr_rec['uuid'])
                    uuid_list.append(uuid)
            output = self.get_string_attributes(uuid_list, str_attribs)
        return output

    def get_string_attributes(self, uuid_list, str_attribute_ent_dict):
        """ Gets string attribute data for the listed uuids.

            Returns {'pred_ents': {pred_uuid: attribute-metadata},
                     'data': {uuid: {pred_uuid: [content, ...]}}}
            or an empty dict when there is nothing to query.
        """
        output = {}
        pred_uuid_list = []
        pred_uuid_objs = {}
        for key, entity in str_attribute_ent_dict.items():
            if isinstance(entity.uuid, str):
                # add string predicate entity uuid to the list
                pred_uuid_list.append(entity.uuid)
                pred_uuid_objs[entity.uuid] = {'rec_attribute': key,
                                               'property': entity.label,
                                               'pred_uuid': entity.uuid,
                                               'slug': entity.slug}
        if len(pred_uuid_list) > 0 and len(uuid_list) > 0:
            q_rows = self. get_string_attributes_sql(uuid_list, pred_uuid_list)
            dict_rows = {}
            # group content strings by uuid, then predicate_uuid, so
            # repeated rows accumulate into value lists per predicate
            for row in q_rows:
                uuid = row['uuid']
                pred_uuid = row['predicate_uuid']
                content = row['content']
                if uuid not in dict_rows:
                    dict_rows[uuid] = {}
                if pred_uuid not in dict_rows[uuid]:
                    dict_rows[uuid][pred_uuid] = []
                if isinstance(content, str):
                    dict_rows[uuid][pred_uuid].append(content)
            output = {'pred_ents': pred_uuid_objs,
                      'data': dict_rows}
        return output

    def get_string_attributes_sql(self, uuid_list, pred_uuid_list):
        """ Executes SQL query to get strings for the solr uuids and predicates. """
        # NOTE(review): same unparameterized-SQL caveat as
        # get_thumbs_for_non_media — values are quoted but not escaped.
        q_uuids = self.make_query_uuids(uuid_list)
        p_uuids = self.make_query_uuids(pred_uuid_list)
        query = ('SELECT ass.uuid AS uuid, ass.predicate_uuid AS predicate_uuid, '
                 's.content AS content '
                 'FROM oc_assertions AS ass '
                 'JOIN oc_strings AS s ON ass.object_uuid = s.uuid '
                 'WHERE ass.uuid IN (' + q_uuids + ') AND '
                 'ass.predicate_uuid IN (' + p_uuids + ')'
                 'ORDER BY ass.uuid,  ass.predicate_uuid, s.content; ')
        cursor = connection.cursor()
        cursor.execute(query)
        rows = self.dictfetchall(cursor)
        return rows
# Exemplo n.º 2
# 0
class RecordProperties():
    """ Methods to make properties for individual record items
        useful for making geospatial feature records or
        lists of items without geospatial data
    """
    ATTRIBUTE_DELIM = '; '  # delimiter for multiple attributes

    def __init__(self, request_dict_json=False):
        """ request_dict_json: JSON-encoded string of the request dict,
            or False when no request context is available.
        """
        self.uuid = False
        self.uri = False  # cannonical uri for the item
        self.href = False  # link to the item in the current deployment
        self.cite_uri = False  # stable / persistent uri for citation
        self.label = False
        self.item_type = False
        self.updated = False
        self.published = False
        self.project_href = False  # link to the project in current deployment
        self.project_uri = False  # cannonical uri for the project
        self.project_label = False
        self.context_href = False  # link to parent context in current deployment
        self.context_uri = False  # link to parent context cannonical uri
        self.context_label = False
        self.category = False
        self.latitude = False
        self.longitude = False
        self.geojson = False
        self.early_date = False
        self.late_date = False
        self.thumbnail_href = False
        self.thumbnail_uri = False
        self.thumbnail_scr = False
        self.preview_scr = False
        self.fullfile_scr = False
        self.snippet = False  # highlighted search-text snippet
        self.other_attributes = False  # other attributes to the record
        # flatten list of an attribute values to single value
        self.flatten_rec_attributes = False
        # A list of (non-standard) attributes to include in a record
        self.rec_attributes = []
        self.attribute_hierarchies = {}
        # NOTE: the original also assigned settings.CANONICAL_HOST to
        # base_url and initialized cite_uri a second time; both were
        # dead assignments and are removed here.
        rp = RootPath()
        self.base_url = rp.get_baseurl()
        self.mem_cache_obj = MemoryCache()  # memory caching object
        self.request_dict_json = request_dict_json
        if request_dict_json is not False:
            self.request_dict = json.loads(request_dict_json)
        else:
            self.request_dict = False
        self.highlighting = False  # solr highlighting dict for snippets
        self.recursive_count = 0  # reset before hierarchy extraction
        self.min_date = False  # query date constraint (lower bound)
        self.max_date = False  # query date constraint (upper bound)
        self.thumbnail_data = {}  # uuid -> precached thumbnail info
        self.media_file_data = {}  # uuid -> {file_type: file_uri} precache
        self.string_attrib_data = {}  # precached string attribute data

    def parse_solr_record(self, solr_rec):
        """ Parses a solr rec object """
        if isinstance(solr_rec, dict):
            self.get_item_basics(solr_rec)
            self.get_citation_uri(solr_rec)
            self.get_lat_lon(solr_rec)
            self.get_category(solr_rec)
            self.get_project(solr_rec)
            self.get_context(solr_rec)
            self.get_time(solr_rec)  # get time information, limiting date ranges to query constaints
            self.get_thumbnail(solr_rec)
            self.get_media_files(solr_rec)
            self.get_snippet(solr_rec)  # get snippet of highlighted text
            self.get_attributes(solr_rec)  # get non-standard attributes
            self.get_string_attributes(solr_rec)  # get non-standard string attributes

    def get_item_basics(self, solr_rec):
        """ get basic metadata for an item """
        output = False
        if isinstance(solr_rec, dict):
            if 'uuid' in solr_rec:
                self.uuid = solr_rec['uuid']
            if 'slug_type_uri_label' in solr_rec:
                id_parts = self.parse_solr_value_parts(solr_rec['slug_type_uri_label'])
                if id_parts is not False:
                    output = True
                    self.uri = self.make_url_from_val_string(id_parts['uri'], True)
                    self.href = self.make_url_from_val_string(id_parts['uri'], False)
                    item_type_output = URImanagement.get_uuid_from_oc_uri(self.uri, True)
                    self.item_type = item_type_output['item_type']
                    self.label = id_parts['label']
            if 'updated' in solr_rec:
                self.updated = solr_rec['updated']
            if 'published' in solr_rec:
                self.published = solr_rec['published']
        return output

    def get_snippet(self, solr_rec):
        """ get a text highlighting snippet """
        if isinstance(self.highlighting, dict):
            if self.uuid is False:
                if 'uuid' in solr_rec:
                    self.uuid = solr_rec['uuid']
            if self.uuid in self.highlighting:
                if 'text' in self.highlighting[self.uuid]:
                    text_list = self.highlighting[self.uuid]['text']
                    self.snippet = ' '.join(text_list)
                    # some processing to remove fagments of HTML markup.
                    self.snippet = self.snippet.replace('<em>', '[[[[mark]]]]')
                    self.snippet = self.snippet.replace('</em>', '[[[[/mark]]]]')
                    try:
                        self.snippet = '<div>' + self.snippet + '</div>'
                        self.snippet = lxml.html.fromstring(self.snippet).text_content()
                        self.snippet = strip_tags(self.snippet)
                    except:
                        self.snippet = strip_tags(self.snippet)
                    self.snippet = self.snippet.replace('[[[[mark]]]]', '<em>')
                    self.snippet = self.snippet.replace('[[[[/mark]]]]', '</em>')

    def get_citation_uri(self, solr_rec):
        """ gets the best citation / persistent uri for the item """
        if 'persistent_uri' in solr_rec:
            for p_uri in solr_rec['persistent_uri']:
                self.cite_uri = p_uri
                if 'dx.doi.org' in p_uri:
                    break  # stop looking once we have a DOI, the best

    def get_lat_lon(self, solr_rec):
        """ gets latitute and longitude information """
        if 'discovery_geolocation' in solr_rec:
            geo_strings = solr_rec['discovery_geolocation']
            geo_coords_str = geo_strings.split(',')
            # NOT geojson ording, since solr uses lat/lon ordering
            self.latitude = float(geo_coords_str[0])
            self.longitude = float(geo_coords_str[1]) 

    def get_category(self, solr_rec):
        """ Gets the most specific category for the item """
        self.recursive_count = 0
        cat_hierarchy = self.get_category_hierarchy(solr_rec)
        if len(cat_hierarchy) > 0:
            self.category = cat_hierarchy[-1]['label']

    def get_context(self, solr_rec):
        """ Set context label / uri / href from the most specific
            parent context found for the record.
        """
        self.recursive_count = 0
        contexts = self.extract_hierarchy(solr_rec,
                                          SolrDocument.ROOT_CONTEXT_SOLR,
                                          '___context',
                                          [])
        if not contexts:
            return
        self.context_label = self.make_context_path_label(contexts)
        self.context_uri = self.make_context_link(contexts, True)
        self.context_href = self.make_context_link(contexts, False)

    def get_project(self, solr_rec):
        """ Set project label / uri / href from the most specific
            project found for the record.
        """
        self.recursive_count = 0
        projects = self.extract_hierarchy(solr_rec,
                                          SolrDocument.ROOT_PROJECT_SOLR,
                                          '___project',
                                          [])
        if not projects:
            return
        deepest = projects[-1]
        self.project_label = deepest['label']
        self.project_uri = self.make_url_from_val_string(deepest['uri'], True)
        self.project_href = self.make_url_from_val_string(deepest['uri'], False)

    def get_time(self, solr_rec):
        """ parses time information """
        early_list = False
        late_list = False
        if 'form_use_life_chrono_earliest' in solr_rec:
            early_list = solr_rec['form_use_life_chrono_earliest']
        if 'form_use_life_chrono_latest' in solr_rec:
            late_list = solr_rec['form_use_life_chrono_latest']
        if isinstance(early_list, list):
            date_list = early_list
        else:
            date_list = []
        if isinstance(late_list, list):
            date_list += late_list
        if len(date_list) > 0:
            min_max = self.get_list_min_max(date_list)
            self.early_date = min(min_max)
            self.late_date = max(min_max)

    def get_list_min_max(self, date_list):
        """ Returns the minimum and maximum dates
            from a date list, constrained by
            preset min and max dates
        """
        min_date = False
        max_date = False
        # print(str(date_list))
        if isinstance(date_list, list):
            date_list.sort()
            for date in date_list:
                if self.min_date is not False:
                    if date >= self.min_date \
                       and min_date is False:
                        min_date = date
                if self.max_date is not False:
                    if date <= self.max_date:
                        max_date = date
        if min_date is False:
            min_date = self.min_date
        if max_date is False:
            max_date = self.max_date
        return [min_date, max_date]

    def get_thumbnail(self, solr_rec):
        """ get media record and thumbnai if it exists """
        if 'uuid' in solr_rec:
            uuid = solr_rec['uuid']
            if uuid in self.thumbnail_data:
                if self.thumbnail_data[uuid] is not False:
                    self.thumbnail_href = self.thumbnail_data[uuid]['href']
                    self.thumbnail_uri = self.thumbnail_data[uuid]['uri']
                    self.thumbnail_scr = self.thumbnail_data[uuid]['scr']
                    rp = RootPath()
                    self.thumbnail_scr = rp.convert_to_https(self.thumbnail_scr)
            else:
                # did not precache thumbnail data, get an indivitual record
                self.get_thumbnail_from_database(solr_rec)

    def get_media_files(self, solr_rec):
        """ get media record and thumbnai if it exists """
        if 'uuid' in solr_rec:
            uuid = solr_rec['uuid']
            if uuid in self.media_file_data:
                if self.media_file_data[uuid] is not False:
                    rp = RootPath()
                    for file_type, file_uri in self.media_file_data[uuid].items():
                        if file_type == 'oc-gen:thumbnail':
                            self.thumbnail_scr = rp.convert_to_https(file_uri)
                        elif file_type == 'oc-gen:preview':
                            self.preview_scr = rp.convert_to_https(file_uri)
                        elif file_type == 'oc-gen:fullfile':
                            self.fullfile_scr = rp.convert_to_https(file_uri)

    def get_thumbnail_from_database(self, solr_rec):
        """ get media record and thumbnail, if it exists

            Fallback for when thumbnail data was not precached:
            runs individual ORM queries for this one record.
        """
        if 'uuid' in solr_rec:
            uuid = solr_rec['uuid']
            thumb = []
            if self.item_type != 'media':
                # non-media item: find one associated media item via an
                # assertion, then look up that media item's thumbnail
                media_item = Assertion.objects\
                                      .filter(uuid=uuid,
                                              object_type='media')[:1]
                if len(media_item) > 0:
                    muuid = media_item[0].object_uuid
                    thumb = Mediafile.objects\
                                     .filter(uuid=muuid,
                                             file_type='oc-gen:thumbnail')[:1]
            else:
                # do this for media items
                muuid = uuid
                thumb = Mediafile.objects\
                                 .filter(uuid=uuid,
                                         file_type='oc-gen:thumbnail')[:1]
            if len(thumb) > 0:
                # muuid is always bound here: a non-empty thumb implies
                # one of the branches above assigned it
                self.thumbnail_href = self.base_url + '/media/' + muuid
                self.thumbnail_uri = settings.CANONICAL_HOST + '/media/' + muuid
                self.thumbnail_scr = thumb[0].file_uri

    def get_category_hierarchy(self, solr_rec):
        """ gets the most specific category
            informtation about
            an item
        """
        cat_hierarchy = []
        if 'item_type' in solr_rec:
            item_type = solr_rec['item_type'][0]
            root_cat_field = 'oc_gen_' + item_type + '___pred_id'
            cat_hierarchy = self.extract_hierarchy(solr_rec,
                                                   root_cat_field,
                                                   '___pred',
                                                   [])
        return cat_hierarchy

    """ The following seciton of code
        processes non-default attributes for records
    """
    def get_attributes(self, solr_rec):
        """ gets attributes for a record, based on the
            predicates requested in the search
            and optional predicates passed by a client
            with a GET request with parameter 'attributes'

            Fills self.other_attributes with dicts of
            {'property': label, 'values_list': [...], 'value': joined-str}.
        """
        qm = QueryMaker()
        solr_field_entities = {}
        for attribute in self.rec_attributes:
            entity = self.mem_cache_obj.get_entity(attribute, False)
            if entity is not False:
                prop_slug = entity.slug
                # check to make sure we have the entity data type for linked fields
                if entity.data_type is False and entity.item_type == 'uri':
                    dtypes = self.mem_cache_obj.get_dtypes(entity.uri)
                    if isinstance(dtypes, list):
                        # set the data type from the first dtype found
                        entity.data_type = dtypes[0]
                field_parts = qm.make_prop_solr_field_parts(entity)
                solr_field = field_parts['prefix'] + '___pred_' + field_parts['suffix']
                # extract children of the solr_field so we know if
                # we have the most specific attributes, then we can get
                # values for the most specific attributes
                self.extract_attribute_children(solr_rec, solr_field)
        self.clean_attribute_hiearchies()
        if isinstance(self.attribute_hierarchies, dict):
            self.other_attributes = []
            for field_slug_key, values in self.attribute_hierarchies.items():
                entity = self.mem_cache_obj.get_entity(field_slug_key, False)
                if entity is not False:
                    attribute_dict = LastUpdatedOrderedDict()
                    attribute_dict['property'] = entity.label
                    attribute_dict['values_list'] = []
                    attribute_dict['value'] = ''
                    string_val = False
                    delim = ''
                    for val in values:
                        if isinstance(val, str):
                            # solr-encoded value string; use its label part
                            string_val = True
                            parsed_val = self.parse_solr_value_parts(val)
                            attribute_dict["values_list"].append(parsed_val['label'])
                            attribute_dict['value'] += delim + str(parsed_val['label'])
                        else:
                            attribute_dict["values_list"].append(val)
                            attribute_dict['value'] += delim + str(val)
                        delim = self.ATTRIBUTE_DELIM
                    # a single non-string value is kept unflattened,
                    # preserving its original type
                    if len(values) == 1 \
                       and string_val is False:
                        attribute_dict['value'] = values[0]
                    self.other_attributes.append(attribute_dict)

    def get_string_attributes(self, solr_rec):
        """ gets string attributes for a solr rec, from a previous database query
            needed because solr does not cache string field data
        """
        if isinstance(self.string_attrib_data, dict):
            # now add predicate attributes for string predicates, from the database
            if 'uuid' in solr_rec and 'data' in self.string_attrib_data:
                uuid = solr_rec['uuid']
                if uuid in self.string_attrib_data['data']:
                    item_data = self.string_attrib_data['data'][uuid]
                    for pred_uuid, values_list in item_data.items():
                        act_attribute = self.string_attrib_data['pred_ents'][pred_uuid]
                        act_attribute['values_list'] = values_list
                        act_attribute['value'] = self.ATTRIBUTE_DELIM.join(values_list)
                        self.other_attributes.append(act_attribute)

    def prevent_attribute_key_collision(self, item_prop_dict, prop_key):
        """ checks to make sure there's no collision between the prop_key
            and the dict that it will be added to
        """
        i = 2
        output_prop_key = prop_key
        while output_prop_key in item_prop_dict:
            output_prop_key = prop_key + '[' + str(i) + ']'
            i += 1
        return output_prop_key

    def clean_attribute_hiearchies(self):
        """ some post-processing to make sure
            we have clean attribute hierarchies

            Pass 1 keeps only the solr fields flagged 'most-specific'
            whose values do not themselves name another known solr field
            (i.e. truly childless fields).
            Pass 2 rebuilds self.attribute_hierarchies as an ordered dict
            keyed by predicate slug, grouping values by field type in the
            order: id, numeric, date, and alphabetically within each type.
        """
        if isinstance(self.attribute_hierarchies, dict):
            # print('check: ' + str(self.attribute_hierarchies))
            temp_attribute_hierarchies = self.attribute_hierarchies
            clean_attribute_hiearchies = {}
            for solr_field_key, field_char in self.attribute_hierarchies.items():
                if field_char['most-specific']:
                    par_field_ex = solr_field_key.split('___')
                    # last two parts make the suffix, a pred-slug[-2] and a field type [-1]
                    pred_suffix = '___' + par_field_ex[-2] + '___' + par_field_ex[-1]
                    specific_ok = True
                    for val in field_char['values']:
                        if isinstance(val, str):
                            #  print('check:' + solr_field_key + ' val: ' + val)
                            parsed_val = self.parse_solr_value_parts(val)
                            # if a value's slug names another known solr field,
                            # this field actually has children and is not at
                            # the most specific level after all
                            check_field = parsed_val['slug'].replace('-', '_')
                            check_field += '___pred_' + parsed_val['data_type']
                            if check_field in temp_attribute_hierarchies:
                                # note a field is NOT at the most specific level
                                specific_ok = False
                            else:
                                # now check a version with the predicate as part of
                                # the solr field
                                check_field = parsed_val['slug'].replace('-', '_')
                                check_field += pred_suffix
                                if check_field in temp_attribute_hierarchies:
                                    # note a field is NOT at the most specific level
                                    specific_ok = False
                    if specific_ok:
                        # ok to add
                        # print('checked OK: ' + solr_field_key)
                        clean_attribute_hiearchies[solr_field_key] = field_char
            # now that we got rid of problem fields, lets sort these for consistent
            # rendering
            self.attribute_hierarchies = LastUpdatedOrderedDict()
            keys = LastUpdatedOrderedDict()
            # order of key types, we want id fields, followed by numeric then date
            key_types = ['___pred_id',
                         '___pred_numeric',
                         '___pred_date']
            for key_type in key_types:
                keys[key_type] = []
                for solr_field_key, field_char in clean_attribute_hiearchies.items():
                    if key_type in solr_field_key:
                        keys[key_type].append(solr_field_key)
                # sort alphabetically. Slugs useful, since they will cluster predicates
                # from similar vocabularies
                keys[key_type].sort()
                for key in keys[key_type]:
                    field_char = clean_attribute_hiearchies[key]
                    field_ex = key.split('___')
                    # the penultimate part is the predicate
                    field_slug = field_ex[-2].replace('_', '-')
                    if field_slug not in self.attribute_hierarchies:
                        self.attribute_hierarchies[field_slug] = []
                    # de-duplicate while preserving order of appearance
                    for val in field_char['values']:
                        if val not in self.attribute_hierarchies[field_slug]:
                            self.attribute_hierarchies[field_slug].append(val)

    def extract_attribute_children(self,
                                   solr_rec,
                                   solr_field_key):
        """ extracts ALL children from the hiearchy of
            a solr_field_key

            Records the field's characteristics in
            self.attribute_hierarchies, noting whether it is at the most
            specific (childless) level and which values are childless.

        :param solr_rec: dict for one solr result document
        :param solr_field_key: str solr field name to inspect
        :return: True when solr_field_key exists in solr_rec, else False
        """
        is_field = False
        if solr_field_key not in self.attribute_hierarchies:
            # so we don't look at the same thing twice!
            if solr_field_key in solr_rec:
                is_field = True
                field_char = {'most-specific': False,
                              'values': []}
                if '___pred_numeric' in solr_field_key \
                   or '___pred_date' in solr_field_key:
                    # numeric and date fields never have child fields, so they
                    # are always at the most specific level.
                    # (fixed copy-paste bug: '___pred_numeric' was tested twice,
                    # so date fields never got marked most-specific here)
                    field_char['most-specific'] = True
                    field_char['values'] = solr_rec[solr_field_key]
                elif '___pred_id' in solr_field_key:
                    # make a suffix for the 
                    par_field_ex = solr_field_key.split('___')
                    # last two parts make the suffix, a pred-slug[-2] and a field type [-1]
                    pred_suffix = '___' + par_field_ex[-2] + '___' + par_field_ex[-1]
                    childless_children = []
                    for child_val in solr_rec[solr_field_key]:
                        # print('Child: ' + solr_field_key + ': ' + child_val)
                        parsed_path_item = self.parse_solr_value_parts(child_val)
                        new_field_prefix = parsed_path_item['slug'].replace('-', '_')
                        new_field_key = new_field_prefix + '___pred_' + parsed_path_item['data_type']
                        if parsed_path_item['data_type'] == 'id':
                            child_is_field = self.extract_attribute_children(solr_rec,
                                                                             new_field_key)
                            if child_is_field is False:
                                # now check an alternative combining the child
                                # slug with the predicate of the parent
                                new_field_key = new_field_prefix + pred_suffix
                                # print('check: ' + new_field_key)
                                child_is_field = self.extract_attribute_children(solr_rec,
                                                                                 new_field_key)
                                if child_is_field is False:
                                    childless_children.append(child_val)
                    if len(childless_children) > 0:
                        field_char['most-specific'] = True
                        field_char['values'] = childless_children
                else:
                    # string (and other) fields are handled elsewhere
                    pass
                self.attribute_hierarchies[solr_field_key] = field_char
        return is_field

    def extract_hierarchy(self,
                          solr_rec,
                          facet_field_key,
                          facet_suffix,
                          hierarchy=None,
                          pred_field=False):
        """ extracts a hierarchy from a solr_record.
            The output is a list starting with the most
            general parent of the hiearchy,
            then going to the most specific

            This is a recursive function and
            default / starts with the root
            of the hiearchy as the facet_field_key

            This only follows a single path (not multiple paths)

        :param solr_rec: dict for one solr result document
        :param facet_field_key: str solr field to descend from
        :param facet_suffix: str suffix used to build child field names
        :param hierarchy: list accumulator; defaults to a fresh list
        :param pred_field: False, or a predicate slug used to build an
            alternative field name to check in solr_rec
        """
        if hierarchy is None:
            # fresh accumulator per top-level call; this was previously a
            # mutable default argument ([]), which leaked appended items
            # across separate calls that relied on the default
            hierarchy = []
        alt_facet_field_key = facet_field_key
        if pred_field is not False:
            # do this to allow search of hiarchy in a named
            # predicate field
            f_parts = facet_field_key.split('___')
            if len(f_parts) == 2:
                alt_f_parts = [f_parts[0],
                               pred_field.replace('-', '_'),
                               f_parts[1]]
                alt_facet_field_key = '___'.join(alt_f_parts)
                # print('Check: ' + facet_field_key + ', ' + alt_facet_field_key)
        # recursive_count caps the descent at 20 levels to guard
        # against cycles in the solr field hierarchy
        if (facet_field_key in solr_rec or alt_facet_field_key in solr_rec)\
           and self.recursive_count < 20:
            self.recursive_count += 1
            if facet_field_key in solr_rec:
                path_item_val = solr_rec[facet_field_key][0]
            else:
                path_item_val = solr_rec[alt_facet_field_key][0]
            parsed_path_item = self.parse_solr_value_parts(path_item_val)
            if isinstance(parsed_path_item, dict):
                hierarchy.append(parsed_path_item)
                new_facet_field = parsed_path_item['slug'].replace('-', '_')
                new_facet_field += facet_suffix + '_' + parsed_path_item['data_type']
                # print('New hierarchy field: ' + new_facet_field)
                hierarchy = self.extract_hierarchy(solr_rec,
                                                   new_facet_field,
                                                   facet_suffix,
                                                   hierarchy)
        return hierarchy

    def make_context_path_label(self, contexts):
        """ Makes a '/' delimited context path for easy human
            readability; returns False for an empty context list.
        """
        if not contexts:
            return False
        return '/'.join(context['label'] for context in contexts)

    def make_context_link(self, contexts, cannonical=False):
        """ Makes a URI for the most specific (last) context in the
            list; returns False for an empty context list.
        """
        if not contexts:
            return False
        last_context = contexts[-1]
        return self.make_url_from_val_string(last_context['uri'], cannonical)

    def make_url_from_val_string(self,
                                 partial_url,
                                 use_cannonical=True):
        """ parses a solr value if it has
            '___' delimiters, to get the URI part
            string.
            if it's already a URI part, it makes
            a URL
        """
        if use_cannonical:
            base_url = settings.CANONICAL_HOST
        else:
            base_url = self.base_url
        parsed = self.parse_solr_value_parts(partial_url)
        if isinstance(parsed, dict):
            partial_url = parsed['uri']
        # already a full URL? then pass it through unchanged
        has_scheme = ('http://' in partial_url) or ('https://' in partial_url)
        if has_scheme:
            return partial_url
        return base_url + partial_url

    def add_record_fields(self):
        """ adds fields to include in the GeoJSON properties

            Reads the (comma delimited) 'rec-field' request parameter
            from self.response_dict; sets and returns self.record_fields.
        """
        raw_rec_fields = None
        if 'rec-field' in self.response_dict:
            raw_rec_fields = self.response_dict['rec-field'][0]
        if raw_rec_fields is None:
            self.record_fields = []
        else:
            # str.split(',') yields a one-item list when no comma is present
            self.record_fields = raw_rec_fields.split(',')
        return self.record_fields

    def parse_solr_value_parts(self, solr_value):
        """ parses a solr_value string into
            slug, solr-data-type, uri, and label
            parts

            Non-strings and strings without the '___' delimiter pass
            through unchanged; a delimited string with other than 4
            parts yields False.
        """
        if not isinstance(solr_value, str):
            return solr_value
        if '___' not in solr_value:
            return solr_value
        parts = solr_value.split('___')
        if len(parts) != 4:
            return False
        return {'slug': parts[0],
                'data_type': parts[1],
                'uri': parts[2],
                'label': parts[3]}

    def get_solr_record_uuid_type(self, solr_rec):
        """ get item uuid, label, and type from a solr_rec;
            returns False when solr_rec is not a dict
        """
        if not isinstance(solr_rec, dict):
            return False
        output = {'uuid': solr_rec.get('uuid', False),
                  'label': False,
                  'item_type': False}
        if 'slug_type_uri_label' in solr_rec:
            id_parts = self.parse_solr_value_parts(solr_rec['slug_type_uri_label'])
            if id_parts is not False:
                uri = self.make_url_from_val_string(id_parts['uri'], True)
                item_type_output = URImanagement.get_uuid_from_oc_uri(uri, True)
                output['item_type'] = item_type_output['item_type']
                output['label'] = id_parts['label']
        return output

    def get_key_val(self, key, dict_obj):
        """ Returns dict_obj[key] when dict_obj is a dict containing
            key; otherwise None.
        """
        if isinstance(dict_obj, dict):
            return dict_obj.get(key)
        return None
# (removed web-scrape pagination artifact: "Exemplo n.º 3" / "0" — it broke Python syntax between the two classes)
class QueryMaker():
    """ Translates Open Context search request parameters into solr
        query dicts (fq, facet.field, stats.field, etc.)
    """

    # main item-types mapped to their slugs to get solr-facet field prefix
    TYPE_MAPPINGS = {'subjects': 'oc-gen-subjects',
                     'media': 'oc-gen-media',
                     'documents': 'oc-gen-documents',
                     'persons': 'oc-gen-persons',
                     'projects': 'oc-gen-projects',
                     'types': 'oc-gen-types',
                     'predicates': 'oc-gen-predicates'}

    # main item-types mapped to their prefixed URI identifiers
    TYPE_URIS = {'subjects': 'oc-gen:subjects',
                 'media': 'oc-gen:media',
                 'documents': 'oc-gen:documents',
                 'persons': 'oc-gen:persons',
                 'projects': 'oc-gen:projects',
                 'types': 'oc-gen:types',
                 'predicates': 'oc-gen:predicates'}

    def __init__(self):
        self.error = False  # set True when a request parameter fails to validate
        self.histogram_groups = 10  # number of buckets for facet range queries
        self.mem_cache_obj = MemoryCache()  # memory caching object

    def _get_context_paths(self, spatial_context):
        '''
        Takes a context path and returns an iterator with the list of possible
        contexts. Parses the list of boolean '||' (OR) and returns a list
        of contexts.

        For example:

        >>> _get_context_paths('Turkey/Domuztepe/I||II||Stray')

        ['Turkey/Domuztepe/I', 'Turkey/Domuztepe/II', 'Turkey/Domuztepe/Stray']

        '''
        # Split the context path by '/' and then by '||'
        context_lists = (value.split('||') for value in
                         spatial_context.split('/'))
        # Create a list of the various permutations
        context_tuple_list = list(itertools.product(*context_lists))
        # Turn the lists back into URIs
        return ('/'.join(value) for value in context_tuple_list)

    def _get_context_depth(self, spatial_context):
        '''
        Takes a context path and returns its depth as an interger. For
        example, the context '/Turkey/Domuztepe'
        would have a depth of 2.
        '''
        # Remove a possible trailing slash before calculating the depth
        return len(spatial_context.rstrip('/').split('/'))

    def _get_valid_context_slugs(self, contexts):
        '''
        Takes a list of contexts and, for valid contexts, returns a list of
        slugs
        '''
        entity = Entity()
        valid_context_slugs = []
        context_list = list(contexts)
        for context in context_list:
            # Verify that the contexts are valid
            # find and save the enity to memory
            # print('check: ' + context)
            found = self.mem_cache_obj.check_entity_found(context,
                                                          True)
            # print('found: ' + str(found))
            if found:
                entity = self.mem_cache_obj.get_entity(context,
                                                       True)
                valid_context_slugs.append(entity.slug)
        return valid_context_slugs

    def _get_parent_slug(self, slug):
        '''
        Takes a slug and returns the slug of its parent. Returns 'root' if
        a slug has no parent.
        '''
        cache_key = self.mem_cache_obj.make_memory_cache_key('par-slug', slug)
        parent_slug = self.mem_cache_obj.get_cache_object(cache_key)
        if parent_slug is None:
            contain_obj = Containment()
            contain_obj.use_cache = False  # because it seems to introduce memory errors
            parent_slug = contain_obj.get_parent_slug_by_slug(slug)
            self.mem_cache_obj.save_cache_object(cache_key, parent_slug)
        if parent_slug:
            return parent_slug
        else:
            return 'root'

    def _prepare_filter_query(self, parent_child_slug):
        # TODO docstring
        parent_child_set = parent_child_slug.split('___')
        return parent_child_set[0].replace('-', '_') + '___context_id_fq:' + \
            parent_child_set[1]

    def expand_hierarchy_options(self,
                                 path_param_val,
                                 hier_delim='---',
                                 or_delim='||'):
        """ Expands a hierarchic path string into a list of
            hierarchically ordered item lists.  An 'or_delim' in any
            path segment yields one output list per alternative
            (the cartesian product across segments).
        """
        if isinstance(path_param_val, list):
            initial_paths = path_param_val
        else:
            initial_paths = [path_param_val]
        path_list = []
        for path_string in initial_paths:
            # each hier_delim segment may hold or_delim alternatives
            segment_options = [segment.split(or_delim)
                               for segment in path_string.split(hier_delim)]
            path_list += [list(combo)
                          for combo in itertools.product(*segment_options)]
        return path_list

    def get_solr_field_type(self, data_type, prefix=''):
        '''
        Defines whether our dynamic solr fields names for
        predicates end with ___pred_id, ___pred_numeric, etc.

        Raises an Exception for an unknown data_type.
        '''
        if data_type in ('@id', 'id', False):
            suffix = 'id'
        elif data_type in ('xsd:integer', 'xsd:double', 'xsd:boolean'):
            suffix = 'numeric'
        elif data_type == 'xsd:string':
            suffix = 'string'
        elif data_type == 'xsd:date':
            suffix = 'date'
        else:
            raise Exception("Error: Unknown predicate type")
        return prefix + suffix

    def make_prop_solr_field_parts(self, entity):
        """ Makes the solr field prefix (from the entity slug) and
            suffix (from the entity data type) for a property.
        """
        return {'prefix': entity.slug.replace('-', '_'),
                'suffix': self.get_solr_field_type(entity.data_type)}

    def process_proj(self, proj_path):
        """ Makes a solr query dict (fq + facet.field) from a 'proj'
            (project) path parameter, which may be hierarchic.
        """
        query_dict = {'fq': [],
                      'facet.field': []}
        fq_terms = []
        for proj_path_list in self.expand_hierarchy_options(proj_path):
            path_list_len = len(proj_path_list)
            fq_field = SolrDocument.ROOT_PROJECT_SOLR
            fq_path_terms = []
            for i, proj_slug in enumerate(proj_path_list, start=1):
                found = self.mem_cache_obj.check_entity_found(proj_slug, False)
                if found:
                    entity = self.mem_cache_obj.get_entity(proj_slug, False)
                    # fq_path_term = fq_field + ':' + self.make_solr_value_from_entity(entity)
                    # the below is a bit of a hack. We should have a query field
                    # as with ___pred_ to query just the slug. But this works for now
                    proj_slug = entity.slug
                    fq_path_terms.append(fq_field + ':' + proj_slug + '*')
                else:
                    fq_path_terms.append(fq_field + ':' + proj_slug)
                # descend: the next level queries this project's child field
                fq_field = proj_slug.replace('-', '_') + '___project_id'
                if i >= path_list_len and fq_field not in query_dict['facet.field']:
                    # deepest level reached; facet on its child field
                    query_dict['facet.field'].append(fq_field)
            fq_terms.append('(' + ' AND '.join(fq_path_terms) + ')')
        query_dict['fq'].append('(' + ' OR '.join(fq_terms) + ')')
        return query_dict

    def process_ld_object(self, objects):
        """ Makes a solr fq query dict for linked-data object URIs.
            '||' within a raw object string means OR; multiple objects
            are ANDed together.
        """
        query_dict = {'fq': []}
        if not isinstance(objects, list):
            objects = [objects]
        fq_terms = []
        for raw_obj in objects:
            or_objects = raw_obj.split('||') if '||' in raw_obj else [raw_obj]
            fq_or_terms = []
            for obj in or_objects:
                # find and save the entity to memory
                if self.mem_cache_obj.check_entity_found(obj, False):
                    entity = self.mem_cache_obj.get_entity(obj, False)
                    fq_term = 'object_uri:' + self.escape_solr_arg(entity.uri)
                    fq_term += ' OR text:"' + self.escape_solr_arg(entity.uri) + '"'
                else:
                    fq_term = 'object_uri:' + obj
                fq_or_terms.append(fq_term)
            fq_terms.append('(' + ' OR '.join(fq_or_terms) + ')')
        query_dict['fq'].append('(' + ' AND '.join(fq_terms) + ')')
        return query_dict

    def process_dc_term(self, dc_param, dc_terms, add_facet=False):
        """ Processes a Dublin Core request parameter and its terms into
            solr fq and facet.field entries.

        :param dc_param: str request parameter name; must be a key of
            DCterms.DC_META_FIELDS to have any effect
        :param dc_terms: list of raw term strings; '||' within a term
            means OR, and multiple terms are ANDed together
        :param add_facet: bool, when True also request a facet on the
            mapped solr field
        :return: dict with 'fq' and 'facet.field' lists (both may be
            empty when dc_param is unknown or all terms are empty)
        """
        query_dict = {'fq': [],
                      'facet.field': []}
        fq_terms = []
        if dc_param in DCterms.DC_META_FIELDS:
            fq_field = DCterms.DC_META_FIELDS[dc_param]
            if fq_field not in query_dict['facet.field'] and add_facet:
                query_dict['facet.field'].append(fq_field)
            # only emit an fq when at least one non-empty term was seen
            add_to_fq = False
            for raw_dc_term in dc_terms:
                if '||' in raw_dc_term:
                    use_dc_terms = raw_dc_term.split('||')
                else:
                    use_dc_terms = [raw_dc_term]
                fq_path_terms = []
                for dc_term in use_dc_terms:
                    if len(dc_term) > 0:
                        add_to_fq = True
                        # check if entity exists, and or store in memory
                        found = self.mem_cache_obj.check_entity_found(dc_term, False)
                        if found:
                            # fq_path_term = fq_field + ':' + self.make_solr_value_from_entity(entity)
                            # the below is a bit of a hack. We should have a query field
                            # as with ___pred_ to query just the slug. But this works for now
                            entity = self.mem_cache_obj.get_entity(dc_term, False)
                            fq_path_term = fq_field + '_fq:' + entity.slug
                            if dc_param == 'dc-temporal' \
                               and entity.entity_type == 'vocabulary' \
                               and 'periodo' in entity.slug:
                                # it's a temporal vocabulary from periodo
                                # so search for specific periods contained in
                                # the vocabulary
                                fq_path_term = '(' + fq_path_term +\
                                               ' OR ' + fq_path_term + '*)'
                        else:
                            # unknown term; fall back to a wildcard match
                            if dc_term[-1] != '*':
                                dc_term += '*'
                            fq_path_term = fq_field + ':' + dc_term
                        fq_path_terms.append(fq_path_term)
                final_path_term = ' AND '.join(fq_path_terms)
                final_path_term = '(' + final_path_term + ')'
                fq_terms.append(final_path_term)
            fq_final = ' OR '.join(fq_terms)
            fq_final = '(' + fq_final + ')'
            if add_to_fq:
                query_dict['fq'].append(fq_final)
        return query_dict

    def get_related_slug_field_prefix(self, slug):
        """ gets the field prefix for a related property
            if it is present in the slug, 
            then return the solr_field prefix otherwise
            return a '' string
        """
        field_prefix = SolrDocument.RELATED_SOLR_FIELD_PREFIX
        if slug.startswith(field_prefix):
            return field_prefix
        return ''

    def clean_related_slug(self, slug):
        """ removes the field_prefix for related slugs,
            returning the slug unchanged when the prefix is absent
        """
        field_prefix = SolrDocument.RELATED_SOLR_FIELD_PREFIX
        if slug.startswith(field_prefix):
            return slug[len(field_prefix):]
        return slug

    def correct_solr_prefix_for_fq(self, solr_f_prefix, act_field_fq):
        """ makes sure the solr prefix is on the fq field if needed """
        needs_prefix = solr_f_prefix != '' and solr_f_prefix not in act_field_fq
        if needs_prefix:
            return solr_f_prefix + act_field_fq
        return act_field_fq

    def process_prop(self, props):
        """ processes 'prop' (property) parameters
            property parameters are tricky because they
            can come in hierarchies
            that's why there's some complexity to this
        """
        # is the property for the item itself, or for a related item?
        query_dict = {'fq': [],
                      'facet.field': [],
                      'stats.field': [],
                      'prequery-stats': [],
                      'facet.range': [],
                      'hl-queries': [],
                      'ranges': {}}
        fq_terms = []
        prop_path_lists = self.expand_hierarchy_options(props)
        for prop_path_list in prop_path_lists:
            i = 0
            path_list_len = len(prop_path_list)
            fq_path_terms = []
            act_field_fq = SolrDocument.ROOT_PREDICATE_SOLR
            act_field_data_type = 'id'
            last_field_label = False  # needed for full text highlighting
            predicate_solr_slug = False
            for prop_slug in prop_path_list:
                field_prefix = self.get_related_slug_field_prefix(prop_slug)
                solr_f_prefix = field_prefix.replace('-', '_')
                db_prop_slug = self.clean_related_slug(prop_slug)
                l_prop_entity = False
                pred_prop_entity = False
                require_id_field = False
                if act_field_data_type == 'id':
                    # check entity exists, and save to memory
                    found = self.mem_cache_obj.check_entity_found(db_prop_slug, False)
                    if found:
                        entity = self.mem_cache_obj.get_entity(db_prop_slug, False)
                        last_field_label = entity.label
                        prop_slug = field_prefix + entity.slug
                        if entity.item_type == 'uri' and 'oc-gen' not in db_prop_slug:
                            if entity.entity_type == 'property':
                                pred_prop_entity = True
                                predicate_solr_slug = prop_slug.replace('-', '_')
                                l_prop_entity = True
                                children = self.mem_cache_obj.get_entity_children(entity.uri)
                                if len(children) > 0:
                                    # ok, this field has children. require it
                                    # to be treated as an ID field
                                    require_id_field = True
                        else:
                            if entity.item_type == 'predicates':
                                pred_prop_entity = True
                                predicate_solr_slug = prop_slug.replace('-', '_')
                                children = self.mem_cache_obj.get_entity_children(entity.uri)
                                if len(children) > 0:
                                    # ok, this field has children. require it
                                    # to be treated as an ID field
                                    require_id_field = True
                        if i == 0:
                            if 'oc-gen' in db_prop_slug:
                                # for open context categories / types
                                act_field_fq = self.get_parent_item_type_facet_field(entity.uri)
                                lr = LinkRecursion()
                                parents = lr.get_jsonldish_entity_parents(entity.uri)
                                if len(parents) > 1:
                                    try:
                                        p_slug = parents[-2]['slug']
                                        act_field_fq = p_slug.replace('-', '_') + '___pred_id'
                                        act_field_fq = self.correct_solr_prefix_for_fq(solr_f_prefix, act_field_fq)
                                    except:
                                        pass
                            elif entity.item_type == 'uri':
                                act_field_fq = SolrDocument.ROOT_LINK_DATA_SOLR
                            elif entity.item_type == 'predicates':
                                temp_field_fq = self.get_parent_item_type_facet_field(entity.uri)
                                parents = self.mem_cache_obj.get_jsonldish_entity_parents(entity.uri)
                                if len(parents) > 1:
                                    try:
                                        p_slug = parents[-2]['slug']
                                        temp_field_fq = p_slug.replace('-', '_') + '___pred_id'
                                    except:
                                        print('Predicate Parent exception: '+ str(parents))
                                        temp_field_fq = False
                                if temp_field_fq is not False:
                                    act_field_fq = temp_field_fq
                                else:
                                    act_field_fq = SolrDocument.ROOT_PREDICATE_SOLR
                            else:
                                act_field_fq = SolrDocument.ROOT_PREDICATE_SOLR
                        # ---------------------------------------------------
                        # THIS PART BUILDS THE FACET-QUERY
                        # fq_path_term = fq_field + ':' + self.make_solr_value_from_entity(entity)
                        # the below is a bit of a hack. We should have a query field
                        # as with ___pred_ to query just the slug. But this works for now
                        fq_field = act_field_fq + '_fq'
                        if path_list_len >= 2 and act_field_data_type == 'id':
                            # could be an object deeper in the hierarchy, so allow the obj_all version
                            fq_path_term = '(' + fq_field + ':' + prop_slug
                            fq_path_term += ' OR obj_all___' + fq_field + ':' + prop_slug + ')'
                        else:
                            fq_path_term = fq_field + ':' + prop_slug
                        fq_path_terms.append(fq_path_term)
                        #---------------------------------------------------
                        #
                        #---------------------------------------------------
                        # THIS PART PREPARES FOR LOOPING OR FINAL FACET-FIELDS
                        #
                        # print('pred-solr-slug: ' + predicate_solr_slug)
                        field_parts = self.make_prop_solr_field_parts(entity)
                        act_field_data_type = field_parts['suffix']
                        if require_id_field:
                            act_field_data_type = 'id'
                            field_parts['suffix'] = 'id'
                        # check if the last or penultimate field has
                        # a different data-type (for linked-data)
                        if i >= (path_list_len - 2) \
                           and l_prop_entity:
                            dtypes = self.mem_cache_obj.get_dtypes(entity.uri)
                            if isinstance(dtypes, list):
                                # set te data type and the act-field
                                found = self.mem_cache_obj.check_entity_found(db_prop_slug, False)
                                if found:
                                    entity = self.mem_cache_obj.get_entity(db_prop_slug, False)
                                    entity.date_type = dtypes[0]  # store for later use
                                    self.mem_cache_obj.entities[db_prop_slug] = entity  # store for later use
                                act_field_data_type = self.get_solr_field_type(dtypes[0])
                        if predicate_solr_slug is False or pred_prop_entity:
                            act_field_fq = field_parts['prefix'] + '___pred_' + field_parts['suffix']
                            act_field_fq = self.correct_solr_prefix_for_fq(solr_f_prefix, act_field_fq)
                            # get a facet on this field
                            if act_field_data_type != 'string':
                                # adds a prefix for related properties
                                ffield = solr_f_prefix + field_parts['prefix'] + '___pred_' + field_parts['suffix']
                                if ffield not in query_dict['facet.field'] \
                                   and i >= (path_list_len - 1):
                                    query_dict['facet.field'].append(ffield)
                        else:
                            if act_field_data_type == 'id':
                                act_field_fq = 'obj_all___' + predicate_solr_slug \
                                               + '___pred_' + field_parts['suffix']
                                # get a facet on this field
                                if predicate_solr_slug != field_parts['prefix']:
                                    # the predicate_solr_slug is not the
                                    # prefix of the current field part, meaning
                                    # the field_parts[prefix] is the type, and
                                    # we want facets for the predicate -> type
                                    ffield = field_parts['prefix'] \
                                             + '___' \
                                             + predicate_solr_slug \
                                             + '___pred_' + field_parts['suffix']
                                else:
                                    # get facets for the predicate
                                    ffield = field_parts['prefix'] \
                                             + '___pred_' \
                                             + field_parts['suffix']
                                # adds a prefix, in case of a related property
                                ffield = solr_f_prefix + ffield
                                if ffield not in query_dict['facet.field'] \
                                   and i >= (path_list_len - 1):
                                    query_dict['facet.field'].append(ffield)
                            else:
                                act_field_fq = predicate_solr_slug + '___pred_' + field_parts['suffix']
                        # -------------------------------------------
                        if act_field_data_type == 'numeric':
                            # print('Numeric field: ' + act_field)
                            act_field_fq = field_parts['prefix'] + '___pred_numeric'
                            act_field_fq = self.correct_solr_prefix_for_fq(solr_f_prefix, act_field_fq)
                            query_dict = self.add_math_facet_ranges(query_dict,
                                                                    act_field_fq,
                                                                    entity)
                        elif act_field_data_type == 'date':
                            # print('Date field: ' + act_field)
                            act_field_fq = field_parts['prefix'] + '___pred_date'
                            act_field_fq = self.correct_solr_prefix_for_fq(solr_f_prefix, act_field_fq)
                            query_dict = self.add_date_facet_ranges(query_dict,
                                                                    act_field_fq,
                                                                    entity)
                        # print('Current data type (' + str(i) + '): ' + act_field_data_type)
                        # print('Current field (' + str(i) + '): ' + act_field_fq)
                    i += 1
                elif act_field_data_type == 'string':
                    # case for a text search
                    # last_field_label = False  # turn off using the field label for highlighting
                    string_terms = self.prep_string_search_term(prop_slug)
                    for escaped_term in string_terms:
                        search_term = act_field_fq + ':' + escaped_term
                        if last_field_label is False:
                            query_dict['hl-queries'].append(escaped_term)
                        else:
                            query_dict['hl-queries'].append(last_field_label + ' ' + escaped_term)
                        fq_path_terms.append(search_term)
                elif act_field_data_type == 'numeric':
                    # numeric search. assume it's well formed solr numeric request
                    search_term = act_field_fq + ':' + prop_slug
                    fq_path_terms.append(search_term)
                    # now limit the numeric ranges from query to the range facets
                    query_dict = self.add_math_facet_ranges(query_dict,
                                                            act_field_fq,
                                                            False,
                                                            prop_slug)
                elif act_field_data_type == 'date':
                    # date search. assume it's well formed solr request
                    search_term = act_field_fq + ':' + prop_slug
                    fq_path_terms.append(search_term)
                    # now limit the date ranges from query to the range facets
                    query_dict = self.add_date_facet_ranges(query_dict,
                                                            act_field_fq,
                                                            False,
                                                            prop_slug)
            final_path_term = ' AND '.join(fq_path_terms)
            final_path_term = '(' + final_path_term + ')'
            fq_terms.append(final_path_term)
        fq_final = ' OR '.join(fq_terms)
        fq_final = '(' + fq_final + ')'
        query_dict['fq'].append(fq_final)
        return query_dict

    def add_math_facet_ranges(self,
                              query_dict,
                              act_field,
                              entity=False,
                              solr_query=False):
        """ Configures numeric facet-range parameters on query_dict
            for the solr field act_field.

            With an entity, the field has no known value limits, so the
            field is queued for a stats pre-query instead. Otherwise
            the numeric bounds are parsed out of solr_query and split
            into self.histogram_groups range buckets.
        """
        groups = self.histogram_groups
        prefix = 'f.' + act_field
        f_start = prefix + '.facet.range.start'
        f_end = prefix + '.facet.range.end'
        f_gap = prefix + '.facet.range.gap'
        f_sort = prefix + '.facet.sort'
        f_other = prefix + '.facet.range.other'
        f_include = prefix + '.facet.range.include'
        bounds = None
        if entity is not False:
            # field with no known value limits; a stats pre-query
            # must run first to discover them
            query_dict['prequery-stats'].append(act_field)
        elif solr_query is not False:
            # pull every number out of the raw solr query text
            nums = sorted(float(n) for n
                          in re.findall(r'[-+]?\d*\.\d+|\d+', solr_query))
            if len(nums) > 1:
                bounds = (nums[0], nums[-1])
        if bounds is not None:
            low, high = bounds
            if act_field not in query_dict['stats.field']:
                query_dict['stats.field'].append(act_field)
            if act_field not in query_dict['facet.range']:
                query_dict['facet.range'].append(act_field)
            query_dict['ranges'][f_other] = 'all'
            query_dict['ranges'][f_include] = 'all'
            query_dict['ranges'][f_start] = low
            query_dict['ranges'][f_end] = high
            query_dict['ranges'][f_gap] = (high - low) / groups
            query_dict['ranges'][f_sort] = 'index'  # sort by index, not by count
        return query_dict

    def add_date_facet_ranges(self,
                              query_dict,
                              act_field,
                              entity=False,
                              solr_query=False):
        """ Configures date facet-range parameters on query_dict
            for the solr field act_field.

            With an entity, the field has no known value limits, so the
            field is queued for a stats pre-query instead. Otherwise
            dates parsed out of solr_query give the range bounds,
            split into 4 range buckets.
        """
        ok = False
        groups = 4
        fstart = 'f.' + act_field + '.facet.range.start'
        fend = 'f.' + act_field + '.facet.range.end'
        fgap = 'f.' + act_field + '.facet.range.gap'
        findex = 'f.' + act_field + '.facet.sort'
        fother = 'f.' + act_field + '.facet.range.other'
        finclude = 'f.' + act_field + '.facet.range.include'
        if entity is not False:
            # this is a field with no value limits
            # we need to do a stats-prequery first
            query_dict['prequery-stats'].append(act_field)
        else:
            if solr_query is not False:
                q_dt_strs = re.findall(r'\d{4}-\d{2}-\d{2}[T:]\d{2}:\d{2}:\d{2}', solr_query)
                if len(q_dt_strs) < 2:
                    # try a less strict regular expression to get dates
                    q_dt_strs = re.findall(r'\d{4}-\d{2}-\d{2}', solr_query)
                if len(q_dt_strs) >= 2:
                    ok = True
                    # ISO date strings sort lexicographically in
                    # chronological order
                    vals = sorted(q_dt_strs)
                    min_val = vals[0]
                    # Bug fix: use the latest date (vals[-1]); the old
                    # code used vals[1], which is not the maximum when
                    # more than two dates match the query.
                    max_val = vals[-1]
        if ok:
            if act_field not in query_dict['stats.field']:
                query_dict['stats.field'].append(act_field)
            if act_field not in query_dict['facet.range']:
                query_dict['facet.range'].append(act_field)
            query_dict['ranges'][fother] = 'all'
            query_dict['ranges'][finclude] = 'all'
            query_dict['ranges'][fstart] = self.convert_date_to_solr_date(min_val)
            query_dict['ranges'][fend] = self.convert_date_to_solr_date(max_val)
            query_dict['ranges'][fgap] = self.get_date_difference_for_solr(min_val, max_val, groups)
            query_dict['ranges'][findex] = 'index'  # sort by index, not by count
        return query_dict

    def get_date_difference_for_solr(self, min_date, max_date, groups):
        """ Expresses (max_date - min_date) / groups as a solr
            date-math gap string, e.g. '+5YEAR', '+3DAY', '+4HOUR'.
        """
        span = (self.date_convert(max_date) - self.date_convert(min_date)) / groups
        days = span.days
        if days >= 366:
            return '+' + str(int(round(days / 365.25))) + 'YEAR'
        if days >= 31:
            return '+' + str(int(round(days / 30))) + 'MONTH'
        if days >= 1:
            return '+' + str(days) + 'DAY'
        hours = span.seconds // 3600
        if hours >= 1:
            return '+' + str(hours) + 'HOUR'
        minutes = (span.seconds % 3600) // 60
        if minutes >= 1:
            return '+' + str(minutes) + 'MINUTE'
        if span.seconds >= 1:
            return '+' + str(span.seconds) + 'SECOND'
        # sub-second difference: fall back to a one-year gap
        return '+1YEAR'

    def add_solr_gap_to_date(self, date_val, solr_gap):
        """ Returns date_val advanced by one solr date-math gap
            (e.g. '+5YEAR') as a python datetime.
        """
        # the digits (and dots) in the gap string give the magnitude
        magnitude = int(float(re.sub(r'[^\d.]', r'', solr_gap)))
        dt = self.date_convert(date_val)
        # unit names checked in the same precedence order as before
        unit_deltas = (
            ('YEAR', datetime.timedelta(days=int(round(magnitude * 365.25, 0)))),
            ('MONTH', datetime.timedelta(days=magnitude * 30)),
            ('DAY', datetime.timedelta(days=magnitude)),
            ('HOUR', datetime.timedelta(hours=magnitude)),
            ('MINUTE', datetime.timedelta(minutes=magnitude)),
            ('SECOND', datetime.timedelta(seconds=magnitude)),
        )
        for unit, delta in unit_deltas:
            if unit in solr_gap:
                return dt + delta
        # unrecognized unit: leave the date unchanged
        return dt

    def convert_date_to_solr_date(self, date_val):
        """ Formats a date value (string or datetime) as a solr
            UTC datetime string, e.g. '2001-02-03T04:05:06Z'.
        """
        return self.date_convert(date_val).strftime('%Y-%m-%dT%H:%M:%SZ')

    def make_human_readable_date(self, date_val):
        """ Converts a date value into something easier to read.

            Returns just 'YYYY-MM-DD' when the value falls exactly on
            midnight; otherwise the time of day is included.
        """
        dt = self.date_convert(date_val)
        # Bug fix: the old code compared dt against a second
        # date_convert() of the same input, which was always equal,
        # so the time-of-day branch was unreachable. Compare against
        # midnight of the same day instead.
        midnight = datetime.datetime(dt.year, dt.month, dt.day)
        if dt == midnight:
            return dt.strftime('%Y-%m-%d')
        else:
            return dt.strftime('%Y-%m-%d:%H:%M:%S')

    def date_convert(self, date_val):
        """ Converts a date string to a python datetime; non-string
            values are passed through (assumed already datetimes).

            Accepts both full 'YYYY-MM-DDTHH:MM:SS[Z]' strings and
            date-only 'YYYY-MM-DD' strings. The less strict regular
            expression fallback in add_date_facet_ranges yields
            date-only strings, which previously raised a ValueError
            here; they are now treated as midnight.
        """
        if not isinstance(date_val, str):
            return date_val
        date_val = date_val.replace('Z', '')
        if 'T' not in date_val:
            # date-only string; assume midnight
            date_val += 'T00:00:00'
        return datetime.datetime.strptime(date_val, '%Y-%m-%dT%H:%M:%S')

    def get_parent_item_type_facet_field(self, category_uri):
        """ Returns the solr facet field for the closest parent of
            category_uri that appears in self.TYPE_MAPPINGS, or False
            when no mapped parent exists. Assumes category_uri is an
            entity that exists in the database.
        """
        mapped_slugs = self.TYPE_MAPPINGS.values()
        for parent in LinkRecursion().get_jsonldish_entity_parents(category_uri):
            if parent['slug'] in mapped_slugs:
                # this parent exists in the Type Mappings
                return parent['slug'].replace('-', '_') + '___pred_id'
        return False

    def get_parent_entity_facet_field(self, entity_uri):
        """ Returns the solr facet field derived from the penultimate
            parent of entity_uri, or False when fewer than two parents
            exist. Assumes entity_uri is an entity that exists in the
            database.
        """
        parents = LinkRecursion().get_jsonldish_entity_parents(entity_uri)
        if not isinstance(parents, list) or len(parents) < 2:
            return False
        # use the penultimate entry in the parent hierarchy
        return parents[-2]['slug'].replace('-', '_') + '___pred_id'

    def process_item_type(self, raw_item_type):
        """ Makes a solr query dict ('fq' and 'facet.field' lists) for
            an item-type search. Multiple item types (expanded from
            '||' options) are OR-ed into a single filter query.
        """
        query_dict = {'fq': [],
                      'facet.field': []}
        fq_terms = []
        # Removed dead locals (i, path_list_len, fq_path_terms) that
        # were copy-paste leftovers and never used.
        item_type_lists = self.expand_hierarchy_options(raw_item_type)
        for item_type_list in item_type_lists:
            # no hierarchy in this field, just use the type itself
            item_type = item_type_list[0]
            fq_terms.append('item_type:' + item_type)
            if item_type in self.TYPE_MAPPINGS:
                # facet on the mapped root predicate field for this type
                act_field = self.TYPE_MAPPINGS[item_type].replace('-', '_') + '___pred_id'
                query_dict['facet.field'].append(act_field)
        fq_final = '(' + ' OR '.join(fq_terms) + ')'
        query_dict['fq'].append(fq_final)
        return query_dict

    def process_id(self, identifier):
        """ Makes a solr query dict ('fq' and 'facet.field') matching
            an identifier against persistent URIs and uuids. The bare
            identifier is also tried as a 'naked' DOI, ARK, or ORCID
            by prepending the relevant URI prefix, and as an
            Open Context URI.
        """
        query_dict = {'fq': [],
                      'facet.field': []}
        escape_id = self.escape_solr_arg(identifier)
        fq_terms = ['persistent_uri:' + escape_id]
        # URI prefixes to try, in case this is a 'naked' identifier
        uri_prefixes = ['http://dx.doi.org/',   # DOI
                        'http://n2t.net/',      # ARK
                        'http://orcid.org/']    # ORCID
        for prefix in uri_prefixes:
            fq_terms.append('persistent_uri:' + self.escape_solr_arg(prefix + identifier))
        fq_terms.append('uuid:' + escape_id)
        tcheck = URImanagement.get_uuid_from_oc_uri(identifier, True)
        if tcheck is not False:
            # the identifier is an Open Context URI; also match its uuid
            fq_terms.append('uuid:' + tcheck['uuid'])
        query_dict['fq'].append('(' + ' OR '.join(fq_terms) + ')')
        return query_dict

    def process_form_use_life_chrono(self, raw_form_use_life_chrono):
        """ Makes a solr query dict ('fq' and 'facet.field') for
            form-use-life chronological tile paths; supports '||' (OR)
            delimited paths. Removed a dead, unused local (i = 0).
        """
        query_dict = {'fq': [],
                      'facet.field': []}
        query_dict['facet.field'].append('form_use_life_chrono_tile')
        fq_terms = []
        # str.split handles both the single-path and '||' cases
        for chrono_path in raw_form_use_life_chrono.split('||'):
            if len(chrono_path) < 30:
                # short (lower precision) tile path; match descendants
                chrono_path += '*'
            fq_terms.append('form_use_life_chrono_tile:' + chrono_path)
        fq_final = '(' + ' OR '.join(fq_terms) + ')'
        query_dict['fq'].append(fq_final)
        return query_dict

    def process_form_date_chrono(self, form_use_life_date, date_type):
        """ Makes a solr query dict limiting form-use-life dates:
            date_type 'start' filters form_use_life_chrono_earliest to
            [date TO *]; any other value filters
            form_use_life_chrono_latest to [* TO date].
        """
        query_dict = {'fq': [],
                      'facet.field': []}
        if date_type == 'start':
            fquery = ('form_use_life_chrono_earliest: '
                      + '[' + str(form_use_life_date) + ' TO *]')
        else:
            fquery = ('form_use_life_chrono_latest: '
                      + '[* TO ' + str(form_use_life_date) + ']')
        query_dict['fq'].append(fquery)
        return query_dict

    def process_discovery_geo(self, raw_disc_geo):
        """ Makes a solr query dict ('fq' and 'facet.field') for
            discovery geotile paths; supports '||' (OR) delimited
            paths. Removed a dead, unused local (i = 0).
        """
        query_dict = {'fq': [],
                      'facet.field': []}
        query_dict['facet.field'].append('discovery_geotile')
        fq_terms = []
        # str.split handles both the single-path and '||' cases
        for disc_path in raw_disc_geo.split('||'):
            if len(disc_path) < 20:
                # short (lower precision) tile path; match descendants
                disc_path += '*'
            fq_terms.append('discovery_geotile:' + disc_path)
        fq_final = '(' + ' OR '.join(fq_terms) + ')'
        query_dict['fq'].append(fq_final)
        return query_dict

    def process_discovery_bbox(self, raw_disc_bbox):
        """ Makes a solr query dict ('fq' only) for bounding-box
            searches; supports '||' (OR) delimited boxes.
        """
        query_dict = {'fq': []}
        fq_terms = []
        for bbox in raw_disc_bbox.split('||'):
            if ',' not in bbox:
                continue
            # comma separated list of coordinates
            bbox_coors = bbox.split(',')
            if not self.validate_bbox_coordiantes(bbox_coors):
                continue
            # valid bounding box, now make a solr-query;
            # note how solr expects latitude / longitude order,
            # which is the reverse of GeoJSON!
            fq_term = ('discovery_geolocation:'
                       + '[' + str(bbox_coors[1]) + ',' + str(bbox_coors[0])
                       + ' TO ' + str(bbox_coors[3]) + ',' + str(bbox_coors[2])
                       + ']')
            fq_terms.append(fq_term)
        if len(fq_terms) > 0:
            query_dict['fq'].append('(' + ' OR '.join(fq_terms) + ')')
        return query_dict

    def validate_bbox_coordiantes(self, bbox_coors):
        """ Validates a bounding box given as [lon0, lat0, lon1, lat1];
            the lower-left corner must be strictly south-west of the
            top-right corner.
        """
        if len(bbox_coors) != 4:
            return False
        lower_left_ok = self.validate_geo_lon_lat(bbox_coors[0],
                                                  bbox_coors[1])
        top_right_ok = self.validate_geo_lon_lat(bbox_coors[2],
                                                 bbox_coors[3])
        if not (lower_left_ok and top_right_ok):
            return False
        # lower-left must be strictly less than top-right on both axes
        return bool(float(bbox_coors[0]) < float(bbox_coors[2])
                    and float(bbox_coors[1]) < float(bbox_coors[3]))

    def validate_geo_lon_lat(self, lon, lat):
        """ Checks that a lon, lat pair is valid. Note the GeoJSON
            (longitude first) ordering of the coordinates.
        """
        lon_ok = self.validate_geo_coordinate(lon, 'lon')
        lat_ok = self.validate_geo_coordinate(lat, 'lat')
        return bool(lon_ok and lat_ok)

    def validate_geo_coordinate(self, coordinate, coord_type):
        """ Validates a geo-spatial coordinate value.

            :param coordinate: value convertible to float
            :param coord_type: string containing 'lat' or 'lon',
                selecting the valid range (+/-90 or +/-180)
            :return: True when the coordinate parses as a number
                inside the valid range for its type, else False
        """
        try:
            fl_coord = float(coordinate)
        except (ValueError, TypeError):
            # also catch TypeError so non-numeric, non-string values
            # (e.g. None) are rejected instead of raising
            return False
        if 'lat' in coord_type:
            return -90 <= fl_coord <= 90
        elif 'lon' in coord_type:
            return -180 <= fl_coord <= 180
        # unknown coordinate type
        return False

    def make_solr_value_from_entity(self, entity, value_type='id'):
        """ Makes a solr value as indexed in SolrDocument
            (see _concat_solr_string_value):
            slug___value_type___uri-part___label

            For Open Context (non-vocabulary) URIs, only the path
            portion of the URI is used.
            Removed an unreachable 'return output' statement that
            followed the return (output was never defined).
        """
        id_part = entity.uri
        if 'http://opencontext.org' in entity.uri:
            if '/vocabularies/' not in entity.uri:
                id_part = entity.uri.split('http://opencontext.org')[1]
        return entity.slug + '___' + value_type + '___' + \
            id_part + '___' + entity.label

    def _process_spatial_context(self, spatial_context=None):
        # TODO docstring
        context = {}
        if spatial_context:
            context_paths = self._get_context_paths(spatial_context)
            context_slugs = self._get_valid_context_slugs(context_paths)
            # print('Context slugs: ' + str(context_slugs))
            # If we cannot find a valid context, raise a 404
            if not context_slugs:
                raise Http404
            # Solr 'fq' parameters
            parent_child_slugs = []
            # Solr 'facet.field' parameters
            facet_field = []
            for slug in context_slugs:
                # fq parameters
                parent_child_slugs.append(self._get_parent_slug(slug) + '___' + slug)
                # facet.field parameters
                facet_field.append(slug.replace('-', '_') + '___context_id')
            # First, handle the most likely scenario of a single context
            if len(parent_child_slugs) == 1:
                context['fq'] = self._prepare_filter_query(parent_child_slugs[0])
            # Otherwise, combine multiple contexts into an OR filter
            else:
                fq_string = ' OR '.join(
                    (self._prepare_filter_query(slug_set) for slug_set
                        in parent_child_slugs)
                    )
                context['fq'] = '(' + fq_string + ')'
            context['facet.field'] = facet_field
        # No spatial context provided
        else:
            context['fq'] = None
            context['facet.field'] = ['root___context_id']
        return context

    def prep_string_search_term(self, raw_term):
        """ Prepares a raw string search, returning a list of escaped
            search terms for AND queries. Double-quoted phrases in the
            raw term become additional exact-phrase terms.
        """
        if '"' not in raw_term:
            return [self.escape_solr_arg(raw_term)]
        # strip the quotes for the whole-term version
        terms = [self.escape_solr_arg(raw_term.replace('"', ' '))]
        for phrase in re.findall(r"\"(.*?)\"", raw_term):
            # escape the phrase, then re-quote it for exact matching
            terms.append('"' + self.escape_solr_arg(phrase) + '"')
        return terms

    def escaped_seq(self, term):
        """ Yields the characters of term one by one, replacing each
            solr special character with its backslash-escaped form.
        """
        escaperules = {'+': r'\+', '-': r'\-', '&': r'\&', '|': r'\|',
                       '!': r'\!', '(': r'\(', ')': r'\)', '{': r'\{',
                       '}': r'\}', '[': r'\[', ']': r'\]', '^': r'\^',
                       '~': r'\~', '*': r'\*', '?': r'\?', ':': r'\:',
                       '"': r'\"', ';': r'\;', ' ': r'\ '}
        for char in term:
            # characters without an escape rule pass through unchanged
            yield escaperules.get(char, char)

    def escape_solr_arg(self, term):
        """ Applies escaping to a query term for solr special
            characters like ':', '+', spaces, etc.; backslashes are
            escaped first so escape characters are not double-handled.
        """
        term = term.replace('\\', r'\\')   # escape \ first
        return ''.join(self.escaped_seq(term))