Пример #1
0
    def reindex(self, obj):
        """Build the search document for ``obj`` and send it to Elasticsearch.

        The document contains one entry per non-``None`` model field (keyed by
        ``url2id(field.rdf_name)``) plus the ``_fedora_*`` / ``_fedoralink_model``
        bookkeeping entries set below. The Elasticsearch document id is the
        base64 encoding of ``str(obj.pk)``.

        Objects whose fedoralink class is not a subclass of
        ``IndexableFedoraObject`` are skipped silently.

        Indexing is best-effort: an ``Exception`` raised by ``self.es.index`` is
        printed and mailed to the admins, never re-raised. ``KeyboardInterrupt``
        and ``SystemExit`` are deliberately *not* caught, so the process can
        still be interrupted while indexing.

        :param obj: the repository object to index
        :return: None
        """

        # get the fedoralink's original class from the obj.
        clz = fedoralink_classes(obj)[0]

        if not issubclass(clz, IndexableFedoraObject):
            # can not reindex something which does not have a mapping
            return

        doc_type = self._get_elastic_class(clz)

        indexer_data = {}
        for field in clz._meta.fields:
            data = getattr(obj, field.name)
            if data is None:
                # unset fields are left out of the document entirely
                continue

            indexer_data[url2id(field.rdf_name)] = convert(data, field)

        encoded_fedora_id = base64.b64encode(str(obj.pk).encode('utf-8')).decode('utf-8')

        indexer_data['_fedora_id'] = obj.pk

        # the parent may come back as a sequence; only its first item is indexed
        parent = obj[FEDORA.hasParent]
        if parent and isinstance(parent, (list, tuple)):
            parent = parent[0]
        indexer_data['_fedora_parent'] = convert(parent, FEDORA_PARENT_FIELD)

        # every class in the MRO is recorded under '_fedoralink_model'
        indexer_data['_fedoralink_model'] = [self._get_elastic_class(x) for x in inspect.getmro(clz)]
        indexer_data['_fedora_type'] = [convert(x, FEDORA_TYPE_FIELD) for x in obj[RDF.type]]
        indexer_data['_fedora_created'] = [convert(x, FEDORA_CREATED_FIELD) for x in obj[FEDORA.created]]
        indexer_data['_fedora_last_modified'] = [convert(x, FEDORA_LAST_MODIFIED_FIELD) for x in
                                                 obj[FEDORA.lastModified]]

        # noinspection PyBroadException
        try:
            self.es.index(index=self.index_name, doc_type=doc_type, body=indexer_data, id=encoded_fedora_id)
        except Exception:
            # format the traceback once and reuse it for both the mail and the console
            error_trace = traceback.format_exc()
            print("Exception in indexing, data", indexer_data)
            mail_admins('Exception reindexing object %s' % obj.id, error_trace)
            print("Exception reindexing object %s" % obj.id, error_trace)
            traceback.print_exc()
Пример #2
0
    def reindex(self, obj):
        """Build the search document for ``obj`` and send it to Elasticsearch.

        The document contains one entry per non-``None`` model field (keyed by
        ``url2id(field.rdf_name)``) plus the ``_fedora_*`` / ``_fedoralink_model``
        bookkeeping entries set below. The Elasticsearch document id is the
        base64 encoding of ``str(obj.pk)``.

        Objects whose fedoralink class is not a subclass of
        ``IndexableFedoraObject`` are skipped silently.

        Indexing is best-effort: an ``Exception`` raised by ``self.es.index`` is
        printed and not re-raised. The success message is printed only when the
        index call actually succeeded. ``KeyboardInterrupt`` and ``SystemExit``
        are not caught.

        :param obj: the repository object to index
        :return: None
        """

        # get the fedoralink's original class from the obj.
        clz = fedoralink_classes(obj)[0]

        if not issubclass(clz, IndexableFedoraObject):
            # can not reindex something which does not have a mapping
            return

        doc_type = self._get_elastic_class(clz)

        indexer_data = {}
        for field in clz._meta.fields:
            data = getattr(obj, field.name)
            if data is None:
                # unset fields are left out of the document entirely
                continue

            indexer_data[url2id(field.rdf_name)] = convert(data, field)

        encoded_fedora_id = base64.b64encode(str(obj.pk).encode('utf-8')).decode('utf-8')

        indexer_data['_fedora_id'] = obj.pk
        indexer_data['_fedora_parent'] = convert(obj[FEDORA.hasParent], FEDORA_PARENT_FIELD)
        # every class in the MRO is recorded under '_fedoralink_model'
        indexer_data['_fedoralink_model'] = [self._get_elastic_class(x) for x in inspect.getmro(clz)]
        indexer_data['_fedora_type'] = [convert(x, FEDORA_TYPE_FIELD) for x in obj[RDF.type]]
        indexer_data['_fedora_created'] = [convert(x, FEDORA_CREATED_FIELD) for x in obj[FEDORA.created]]
        indexer_data['_fedora_last_modified'] = [convert(x, FEDORA_LAST_MODIFIED_FIELD)
                                                 for x in obj[FEDORA.lastModified]]

        # noinspection PyBroadException
        try:
            self.es.index(index=self.index_name, doc_type=doc_type, body=indexer_data, id=encoded_fedora_id)
        except Exception:
            print("Exception in indexing, data", indexer_data)
            traceback.print_exc()
        else:
            # report success only when the index call did not fail
            print("reindexing single object ok")
Пример #3
0
    def search(self, query, model_class, start, end, facets, ordering, values):
        """Run ``query`` against Elasticsearch, restricted to ``model_class``.

        The query is normalised in place (``_de_morgan``, ``_flatten_query``),
        translated with ``_build_query`` and AND-ed with a clause on
        ``_fedoralink_model``. A falsy ``query`` becomes ``match_all``.

        :param query: the query tree, or a falsy value to match everything
        :param model_class: model whose ``_meta.fields`` define the searchable fields
        :param start: offset of the first hit; falsy means 0
        :param end: offset one past the last hit; ``None`` means a page of 10000
        :param facets: passed to ``_generate_facet_clause``
        :param ordering: passed to ``_generate_ordering_clause``
        :param values: instances are only built from hits when this is ``None``
        :return: dict with ``'count'`` (total hits), ``'data'`` (iterator over the
                 built instances) and ``'facets'`` (list of
                 ``(facet_id, [(key, doc_count), ...])`` tuples)
        """
        self._de_morgan(query)
        self._flatten_query(query)

        # lookup tables: query-side name -> ES id, ES id -> field name,
        # ES id -> name used when reporting facets
        fld2id, id2fld, id2fldlang = {}, {}, {}

        def register(query_name, es_id, field_name, facet_name):
            fld2id[query_name] = es_id
            id2fld[es_id] = field_name
            id2fldlang[es_id] = facet_name

        for fld in model_class._meta.fields:
            es_id = url2id(fld.rdf_name)

            if isinstance(fld, IndexedLanguageField):
                # one extra nested entry for every configured language
                for lang in settings.LANGUAGES:
                    lang_code = lang[0]
                    register(fld.name + '.' + lang_code,
                             es_id + '.' + lang_code,
                             fld.name,
                             fld.name + '@' + lang_code)

            register(fld.name, es_id, fld.name, fld.name)

        for extra_fld in ('_fedoralink_model', '_fedora_parent'):
            register(extra_fld, extra_fld, extra_fld, extra_fld)

        all_fields = set()
        self._get_all_fields(query, all_fields, fld2id)

        if query:
            user_clause = self._build_query(query, fld2id, None)
        else:
            user_clause = {"bool": {"must": {"match_all": {}}}}

        model_clause = self._build_query(
            Q(_fedoralink_model=self._get_elastic_class(model_class)), fld2id, None)
        query_tree = {"bool": {"must": [user_clause, model_clause]}}

        ordering_clause = self._generate_ordering_clause(fld2id, ordering)
        facets_clause = self._generate_facet_clause(facets, fld2id)

        offset = start or 0
        page_size = 10000 if end is None else end - offset

        built_query = {
            "sort": ordering_clause,
            "query": query_tree,
            "aggs": facets_clause,
            "highlight": {
                "fields": {name: {} for name in all_fields},
                "require_field_match": False,
            },
            "from": offset,
            "size": page_size,
        }

        print(json.dumps(built_query, ensure_ascii=False))

        do_profile = FedoraProfillingMiddleware.profilling_enabled()
        started = time.time() if do_profile else None

        resp = self.es.search(body=built_query)

        if do_profile:
            finished = time.time()
            FedoraProfillingMiddleware.log_time(
                json.dumps(built_query, ensure_ascii=False), finished - started)

        instances = [
            self.build_instance(doc, id2fld)
            for doc in resp['hits']['hits']
            if values is None
        ]

        exists_suffix = '__exists'
        facets = []
        for agg_name, agg in resp.get('aggregations', {}).items():
            # nested aggregations keep their buckets under a sub-aggregation called "value"
            buckets = agg['buckets'] if 'buckets' in agg else agg['value']['buckets']

            if agg_name.endswith(exists_suffix):
                facet_id = id2fldlang[agg_name[:-len(exists_suffix)]] + exists_suffix
            else:
                facet_id = id2fldlang[agg_name]

            facets.append((
                facet_id,
                [(bucket['key'], bucket['doc_count']) for bucket in buckets],
            ))

        return {
            'count': resp['hits']['total'],
            'data': iter(instances),
            'facets': facets,
        }
    def handle(self, *args, **options):
        """Create or extend the indexer mapping for each model named in ``args``.

        Each positional argument is a dotted path ``package.module.ClassName``.
        For every model the fields not yet present in the existing mapping are
        translated to mapping properties; the result is printed as JSON and
        passed to ``indexer.save_mapping``.

        :raises Exception: when a field is of a type with no mapping rule
        """
        FedoraTypeManager.populate()

        for model_name in args:
            # "a.b.Cls" -> module "a.b", class "Cls", indexer name "a_b_Cls"
            module_name, _, class_name = model_name.rpartition('.')
            indexer_model_name = model_name.replace('.', '_')

            class_for_name(module_name, class_name)
            modelclz = FedoraTypeManager.get_model_class(class_name)

            # the first field seen for a given id wins
            fields = {}
            for model_field in modelclz._meta.fields:
                fields.setdefault(url2id(model_field.rdf_name), model_field)

            indexer = connections['repository'].indexer
            existing_properties = indexer.get_mapping(indexer_model_name).get('properties', {})

            # bookkeeping fields are always mapped and override model fields of the same id
            fields.update({
                '_fedora_id': FEDORA_ID_FIELD,
                '_fedora_parent': FEDORA_PARENT_FIELD,
                '_fedora_type': FEDORA_TYPE_FIELD,
                '_fedoralink_model': FEDORALINK_TYPE_FIELD,
                '_fedora_created': FEDORA_CREATED_FIELD,
                '_fedora_last_modified': FEDORA_LAST_MODIFIED_FIELD,
            })

            new_properties = {}
            for fldname, field in fields.items():
                if fldname in existing_properties:
                    # already mapped, leave it alone
                    continue

                props = new_properties[fldname] = {}

                if isinstance(field, IndexedLanguageField):
                    props.update({
                        'type': 'nested',
                        'include_in_root': 'true',
                        'properties': self.gen_languages_mapping(fldname + "."),
                    })
                elif isinstance(field, IndexedTextField):
                    fulltext_name = fldname + "__fulltext"
                    props.update({
                        'type': 'string',
                        'index': 'not_analyzed',
                        'copy_to': fulltext_name,
                    })
                    # companion field that receives the copied value
                    new_properties[fulltext_name] = {'type': 'string'}
                elif isinstance(field, (IndexedDateTimeField, IndexedDateField)):
                    props.update({'type': 'date', 'index': 'not_analyzed'})
                elif isinstance(field, IndexedIntegerField):
                    props.update({'type': 'long', 'index': 'not_analyzed'})
                elif isinstance(field, (IndexedGPSField, IndexedLinkedField, IndexedBinaryField)):
                    props.update({'type': 'string', 'index': 'not_analyzed'})
                else:
                    raise Exception("Mapping type %s not handled yet" % type(field))

            new_mapping = {
                '_all': {"store": True},
                'properties': new_properties,
            }
            print(json.dumps(new_mapping, indent=4))

            indexer.save_mapping(indexer_model_name, new_mapping)
    def handle(self, *args, **options):
        """Create or extend the indexer mapping for each model in ``options['model_name']``.

        Each entry is a dotted path ``package.module.ClassName``. For every
        model the fields not yet present in the existing mapping are translated
        to mapping properties; the result is printed as JSON and passed to
        ``indexer.save_mapping``.

        :raises Exception: when a field is of a type with no mapping rule
        """
        FedoraTypeManager.populate()

        for model_name in options['model_name']:
            # "a.b.Cls" -> module "a.b", class "Cls", indexer name "a_b_Cls"
            module_name, _, class_name = model_name.rpartition('.')
            indexer_model_name = model_name.replace('.', '_')

            class_for_name(module_name, class_name)
            modelclz = FedoraTypeManager.get_model_class(class_name)

            # the first field seen for a given id wins
            fields = {}
            for model_field in modelclz._meta.fields:
                fields.setdefault(url2id(model_field.rdf_name), model_field)

            indexer = connections['repository'].indexer
            existing_properties = indexer.get_mapping(indexer_model_name).get('properties', {})

            # bookkeeping fields are always mapped and override model fields of the same id
            fields.update({
                '_fedora_id': FEDORA_ID_FIELD,
                '_fedora_parent': FEDORA_PARENT_FIELD,
                '_fedora_type': FEDORA_TYPE_FIELD,
                '_fedoralink_model': FEDORALINK_TYPE_FIELD,
                '_fedora_created': FEDORA_CREATED_FIELD,
                '_fedora_last_modified': FEDORA_LAST_MODIFIED_FIELD,
                '_collection_child_types': CESNET_RDF_TYPES,
            })

            print('ADD fields to mapping')

            new_properties = {}
            for fldname, field in fields.items():
                if fldname in existing_properties:
                    # already mapped, leave it alone
                    continue

                props = new_properties[fldname] = {}

                if isinstance(field, IndexedLanguageField):
                    props.update({
                        'type': 'nested',
                        'include_in_root': 'true',
                        'properties': self.gen_languages_mapping(fldname + "."),
                    })
                elif isinstance(field, IndexedTextField):
                    fulltext_name = fldname + "__fulltext"
                    props.update({'type': 'keyword', 'copy_to': fulltext_name})
                    # companion field that receives the copied value
                    new_properties[fulltext_name] = {'type': 'text'}
                elif isinstance(field, (IndexedDateTimeField, IndexedDateField)):
                    props['type'] = 'date'
                elif isinstance(field, IndexedIntegerField):
                    props['type'] = 'long'
                elif isinstance(field, (IndexedGPSField, IndexedLinkedField, IndexedBinaryField)):
                    props['type'] = 'keyword'
                else:
                    raise Exception("Mapping type %s not handled yet" % type(field))

            new_mapping = {
                '_all': {"store": True},
                'properties': new_properties,
            }
            print(json.dumps(new_mapping, indent=4))

            indexer.save_mapping(indexer_model_name, new_mapping)
Пример #6
0
    def search(self, query, model_class, start, end, facets, ordering, values):
        """Run ``query`` against Elasticsearch using a "filtered" query.

        The top-level children of ``query`` are split with ``self._is_filter``
        into filter clauses and fulltext clauses; a clause on
        ``_fedoralink_model`` is always added to the filters.

        :param query: the query tree, or a falsy value to filter by model only
        :param model_class: model whose ``_meta.fields`` define the searchable fields
        :param start: offset of the first hit; falsy means 0
        :param end: offset one past the last hit; ``None`` means a page of 10000
        :param facets: passed to ``_generate_facet_clause``
        :param ordering: passed to ``_generate_ordering_clause``
        :param values: instances are only built from hits when this is ``None``
        :return: dict with ``'count'`` (total hits), ``'data'`` (iterator over the
                 built instances) and ``'facets'`` (list of
                 ``(field, [(key, doc_count), ...])`` tuples)
        :raises NotImplementedError: when the top-level connector of ``query``
                 is not ``'AND'``
        """
        # both helpers are called for their effect on ``query``; return values are unused
        self._de_morgan(query)
        self._flatten_query(query)

        # lookup tables:
        #   fld2id     - name used in queries -> id in elasticsearch
        #   id2fld     - id in elasticsearch  -> model field name
        #   id2fldlang - id in elasticsearch  -> name reported for facets ("name@lang" for nested ids)
        fld2id = {}
        id2fld = {}
        id2fldlang = {}
        for fld in model_class._meta.fields:
            id_in_elasticsearch = url2id(fld.rdf_name)

            if isinstance(fld, IndexedLanguageField):
                # language fields get one extra nested entry per configured language
                for lang in settings.LANGUAGES:
                    nested_id_in_elasticsearch = id_in_elasticsearch + '.' + lang[0]

                    fld2id[fld.name + '.' + lang[0]] = nested_id_in_elasticsearch
                    id2fld[nested_id_in_elasticsearch] = fld.name
                    id2fldlang[nested_id_in_elasticsearch] = fld.name + '@' + lang[0]

            fld2id[fld.name] = id_in_elasticsearch
            id2fld[id_in_elasticsearch] = fld.name
            id2fldlang[id_in_elasticsearch] = fld.name

        # bookkeeping fields map to themselves
        for extra_fld in ('_fedoralink_model', '_fedora_parent'):
            fld2id[extra_fld] = extra_fld
            id2fld[extra_fld] = extra_fld
            id2fldlang[extra_fld] = extra_fld

        # filled by _get_all_fields; used below as the set of fields to highlight
        all_fields = set()
        self._get_all_fields(query, all_fields, fld2id)

        # NOTE: both names are rebound in stages below:
        # list of children -> Q object -> result of _build_filter / _build_fulltext
        filters = []
        fulltext_matches = []

        if query:
            if query.connector != 'AND':
                raise NotImplementedError("Only top-level AND connector is implemented now")

            # partition the top-level children into filters and fulltext matches
            for c in query.children:
                if self._is_filter(c):
                    filters.append(c)
                else:
                    fulltext_matches.append(c)

            # always restrict the result to the requested model
            filters.append(Q(_fedoralink_model=self._get_elastic_class(model_class)))

            # wrap the collected filter children in a fresh AND node
            f = Q()
            f.connector = Q.AND
            f.children = filters
            filters = f

            filters = self._build_filter(filters, fld2id, None)

            # same wrapping for the fulltext children (the list may be empty here)
            f = Q()
            f.connector = Q.AND
            f.children = fulltext_matches
            fulltext_matches = f

            fulltext_matches = self._build_fulltext(fulltext_matches, fld2id, None)
        else:
            # no user query: filter by model only, no fulltext part
            filters = Q(_fedoralink_model=self._get_elastic_class(model_class))
            filters = self._build_filter(filters, fld2id, None)
            fulltext_matches = {}

        ordering_clause = self._generate_ordering_clause(fld2id, ordering)

        facets_clause = self._generate_facet_clause(facets, fld2id)

        # inner part of the "filtered" query; only non-empty parts are included
        # NOTE(review): the fallback for a missing 'bool' key is a list, not a dict - confirm intent
        built_query = {}
        if filters:
            built_query['filter'] = {'bool': filters.get('bool', [])}

        if fulltext_matches:
            built_query['query'] = {
                'bool': fulltext_matches.get('bool', [])
            }

        # built_query is rebound: the inner part above becomes the value of "filtered"
        built_query = {
            "sort": ordering_clause,
            "query": {
                "filtered": built_query
            },
            "aggs": facets_clause,
            "highlight": {
                "fields": {
                    k: {} for k in all_fields
                    # '*' : {}
                },
                "require_field_match": False
            },
            "from": start if start else 0,
            # without an explicit end, ask for up to 10000 hits
            "size": (end - (start if start else 0)) if end is not None else 10000
        }

        print(json.dumps(built_query, indent=4))

        resp = self.es.search(body=built_query)

        # print(json.dumps(resp, indent=4))

        # NOTE(review): when ``values`` is not None no instances are returned at all -
        # presumably the values-only mode is not implemented here; confirm with callers
        instances = []
        for doc in resp['hits']['hits']:
            if values is None:
                instances.append(self.build_instance(doc, id2fld))

        # the ``facets`` parameter is rebound here to the list of facet results
        facets = []
        for k, v in resp.get('aggregations', {}).items():
            if 'buckets' in v:
                # normal value
                buckets = v['buckets']
            else:
                # nested value, always called "value" - defined above
                buckets = v['value']['buckets']

            # raises KeyError when the aggregation name is not a known elasticsearch id
            facets.append((
                id2fldlang[k],
                [(vv['key'], vv['doc_count']) for vv in buckets]
            ))

        return {
            'count': resp['hits']['total'],
            'data': iter(instances),
            'facets': facets
        }