Exemplo n.º 1
0
class DatasetSearch(ModelSearchAdapter):
    '''
    Search adapter for ``Dataset``.

    Bundles the index mapping, the query configuration (searched fields,
    sorts, facets, boosters) and the serialization of a dataset into the
    document that gets indexed.
    '''
    model = Dataset
    fuzzy = True

    # Index mapping of the document built by ``serialize``.
    # NOTE(review): ``resources`` declares ``license`` whereas ``serialize``
    # emits ``format`` (also used by the ``format`` facet), and
    # ``from_public_service`` is serialized and boosted but not declared
    # here -- confirm this is intended.
    mapping = {
        'properties': {
            'title': {
                'type': 'string',
                'analyzer': i18n_analyzer,
                'fields': {
                    # Non analyzed copy of the title, used for sorting.
                    'raw': {'type': 'string', 'index': 'not_analyzed'},
                },
            },
            'description': {'type': 'string', 'analyzer': i18n_analyzer},
            'license': {'type': 'string', 'index': 'not_analyzed'},
            'frequency': {'type': 'string'},
            'organization': {'type': 'string'},
            'owner': {'type': 'string'},
            'supplier': {'type': 'string'},
            'tags': {
                'type': 'string',
                'index_name': 'tag',
                'index': 'not_analyzed',
            },
            'tag_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': False,
            },
            'resources': {
                'type': 'object',
                'index_name': 'resource',
                'properties': {
                    'title': {'type': 'string'},
                    'description': {'type': 'string'},
                    'license': {'type': 'string'},
                },
            },
            'format_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': False,
            },
            'dataset_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': True,
            },
            'created': {
                'type': 'date',
                'format': 'date_hour_minute_second',
            },
            'last_modified': {
                'type': 'date',
                'format': 'date_hour_minute_second',
            },
            'metrics': metrics_mapping(Dataset),
            'featured': {'type': 'boolean'},
            # Dates are stored as ordinals to handle pre-1900 dates.
            'temporal_coverage': {
                'type': 'object',
                'properties': {
                    'start': {'type': 'long'},
                    'end': {'type': 'long'},
                },
            },
            'territories': {
                'type': 'object',
                'index_name': 'territories',
                'properties': {
                    'id': {'type': 'string'},
                    'name': {'type': 'string'},
                    'code': {'type': 'string'},
                },
            },
            'granularity': {'type': 'string', 'index': 'not_analyzed'},
            # A 'geom' mapping ('geo_shape' type, '100m' precision) is
            # currently disabled.
            'extras': {'type': 'object', 'index_name': 'extra'},
        },
    }

    # Fields used for full-text search; the ``^n`` suffix is the boost.
    # NOTE(review): 'code' does not match any top-level mapped field
    # (only ``territories.code`` exists) -- confirm.
    fields = (
        'title^6',
        'tags^3',
        'territories.name^3',
        'description',
        'code',
    )

    # Public sort key -> sort on the matching indexed field.
    sorts = {
        'title': Sort('title.raw'),
        'created': Sort('created'),
        'last_modified': Sort('last_modified'),
        'reuses': Sort('metrics.reuses'),
        'followers': Sort('metrics.followers'),
        'views': Sort('metrics.views'),
    }

    # Public facet name -> facet definition on an indexed field.
    facets = {
        'tag': TermFacet('tags'),
        'organization': ModelTermFacet('organization', Organization),
        'owner': ModelTermFacet('owner', User),
        'supplier': ModelTermFacet('supplier', Organization),
        'license': ModelTermFacet('license', License),
        'territory': ModelTermFacet('territories.id', Territory),
        'granularity': TermFacet(
            'granularity', lambda l, v: SPATIAL_GRANULARITIES[v]),
        'format': TermFacet('resources.format'),
        'reuses': RangeFacet('metrics.reuses'),
        'temporal_coverage': TemporalCoverageFacet('temporal_coverage'),
        'featured': BoolFacet('featured'),
        'extra': ExtrasFacet('extras'),
    }

    # NOTE(review): ``max_followers`` is passed twice positionally to the
    # followers ``GaussDecay`` -- confirm which parameters they map to.
    boosters = [
        BoolBooster('featured', 1.1),
        BoolBooster('from_public_service', 1.3),
        GaussDecay('metrics.reuses', max_reuses, decay=0.8),
        GaussDecay(
            'metrics.followers', max_followers, max_followers, decay=0.8),
    ]

    @classmethod
    def is_indexable(cls, dataset):
        '''
        Tell whether ``dataset`` should be indexed.

        A dataset is indexable when it is not deleted, has at least
        one resource and is not private.
        '''
        if dataset.deleted is not None:
            return False
        if len(dataset.resources) == 0:
            return False
        return not dataset.private

    @classmethod
    def serialize(cls, dataset):
        '''
        Build the document indexed for ``dataset``.

        ``temporal_coverage`` is only present when the dataset has
        a coverage with both a start and an end; ``territories`` and
        ``granularity`` are only present when ``dataset.spatial`` is set.

        :returns: the document as a ``dict``
        '''
        org_id = None
        if dataset.organization is not None:
            org_id = str(dataset.organization.id)

        # The supplier is only indexed when it differs from the organization.
        supplier_id = None
        if dataset.supplier is not None:
            supplier_id = str(dataset.supplier.id)
        if supplier_id == org_id:
            supplier_id = None

        # Organization logo first, owner avatar as a fallback.
        image_url = None
        if dataset.organization:
            image_url = dataset.organization.logo(40)
        elif dataset.owner:
            image_url = dataset.owner.avatar(40)

        license_id = None
        if dataset.license is not None:
            license_id = dataset.license.id

        resources = []
        formats = []
        for resource in dataset.resources:
            resources.append({
                'title': resource.title,
                'description': resource.description,
                'format': resource.format,
            })
            if resource.format:
                formats.append(resource.format.lower())

        owner_id = None
        if dataset.owner:
            owner_id = str(dataset.owner.id)

        # TODO: extract this into a plugin
        from_public_service = False
        if dataset.organization:
            from_public_service = dataset.organization.public_service

        timestamp_format = '%Y-%m-%dT%H:%M:%S'

        document = {
            'title': dataset.title,
            'description': dataset.description,
            'license': license_id,
            'tags': dataset.tags,
            'tag_suggest': dataset.tags,
            'resources': resources,
            'format_suggest': formats,
            'frequency': dataset.frequency,
            'organization': org_id,
            'owner': owner_id,
            'supplier': supplier_id,
            'dataset_suggest': {
                'input': cls.completer_tokenize(dataset.title),
                'output': dataset.title,
                'payload': {
                    'id': str(dataset.id),
                    'slug': dataset.slug,
                    'image_url': image_url,
                },
            },
            'created': dataset.created_at.strftime(timestamp_format),
            'last_modified': dataset.last_modified.strftime(timestamp_format),
            'metrics': dataset.metrics,
            'extras': dataset.extras,
            'featured': dataset.featured,
            'from_public_service': from_public_service,
        }

        coverage = dataset.temporal_coverage
        if coverage is not None and coverage.start and coverage.end:
            # Ordinals, as declared in the mapping.
            document['temporal_coverage'] = {
                'start': coverage.start.toordinal(),
                'end': coverage.end.toordinal(),
            }

        spatial = dataset.spatial
        if spatial is not None:
            territories = []
            for territory in spatial.territories:
                territories.append({
                    'id': str(territory.id),
                    'name': territory.name,
                    'code': territory.code,
                })
            document['territories'] = territories
            # ``spatial.geom`` is not indexed (the 'geom' mapping is disabled).
            document['granularity'] = spatial.granularity

        return document
Exemplo n.º 2
0
class DatasetSearch(ModelSearchAdapter):
    '''
    Search adapter for ``Dataset``.

    Declares the indexed fields, the query configuration (searched fields,
    sorts, facets, boosters) and how a dataset is serialized into the
    document that gets indexed.
    '''
    model = Dataset
    fuzzy = True
    exclude_fields = ['spatial.geom', 'spatial.zones.geom']

    class Meta:
        doc_type = 'Dataset'

    # -- Indexed fields -----------------------------------------------------
    title = String(analyzer=i18n_analyzer,
                   fields={'raw': String(index='not_analyzed')})
    description = String(analyzer=i18n_analyzer)
    license = String(index='not_analyzed')
    frequency = String(index='not_analyzed')
    organization = String(index='not_analyzed')
    owner = String(index='not_analyzed')
    tags = String(index='not_analyzed',
                  fields={'i18n': String(index='not_analyzed')})
    badges = String(index='not_analyzed')
    tag_suggest = Completion(analyzer=simple,
                             search_analyzer=simple,
                             payloads=False)
    resources = Object(
        properties={
            'title': String(),
            'description': String(),
            'format': String(index='not_analyzed')
        })
    format_suggest = Completion(analyzer=simple,
                                search_analyzer=simple,
                                payloads=False)
    dataset_suggest = Completion(analyzer=simple,
                                 search_analyzer=simple,
                                 payloads=True)
    created = Date(format='date_hour_minute_second')
    last_modified = Date(format='date_hour_minute_second')
    metrics = metrics_mapping_for(Dataset)
    featured = Boolean()
    # Start and end are indexed as date ordinals (see ``serialize``).
    temporal_coverage = Nested(multi=False,
                               properties={
                                   'start': Long(),
                                   'end': Long()
                               })
    # Must be a bare ``Long()``: a trailing comma would bind a one-element
    # tuple to this attribute instead of a field.
    temporal_weight = Long()
    geozones = Object(
        properties={
            'id': String(index='not_analyzed'),
            'name': String(index='not_analyzed'),
            'keys': String(index='not_analyzed')
        })
    granularity = String(index='not_analyzed')
    spatial_weight = Long()
    from_certified = Boolean()

    # -- Query configuration ------------------------------------------------
    # Fields used for full-text search; the ``^n`` suffix is the boost.
    # NOTE(review): 'acronym' is searched but ``serialize`` only emits the
    # acronym inside ``dataset_suggest`` -- confirm where it gets indexed.
    fields = (
        'geozones.keys^9',
        'geozones.name^9',
        'acronym^7',
        'title^6',
        'tags.i18n^3',
        'description',
    )
    # Public sort key -> indexed field it sorts on.
    sorts = {
        'title': 'title.raw',
        'created': 'created',
        'last_modified': 'last_modified',
        'reuses': 'metrics.reuses',
        'followers': 'metrics.followers',
        'views': 'metrics.views',
    }

    # Public facet name -> facet definition on an indexed field.
    facets = {
        'tag':
        TermsFacet(field='tags'),
        'badge':
        TermsFacet(field='badges', labelizer=dataset_badge_labelizer),
        'organization':
        ModelTermsFacet(field='organization', model=Organization),
        'owner':
        ModelTermsFacet(field='owner', model=User),
        'license':
        ModelTermsFacet(field='license', model=License),
        'geozone':
        ModelTermsFacet(field='geozones.id',
                        model=GeoZone,
                        labelizer=zone_labelizer),
        'granularity':
        TermsFacet(field='granularity', labelizer=granularity_labelizer),
        'format':
        TermsFacet(field='resources.format'),
        'reuses':
        RangeFacet(field='metrics.reuses',
                   ranges=[('none', (None, 1)), ('few', (1, 5)),
                           ('quite', (5, 10)), ('many', (10, None))],
                   labels={
                       'none': _('Never reused'),
                       'few': _('Little reused'),
                       'quite': _('Quite reused'),
                       'many': _('Heavily reused'),
                   }),
        'temporal_coverage':
        TemporalCoverageFacet(field='temporal_coverage'),
        'featured':
        BoolFacet(field='featured'),
    }
    # NOTE(review): ``max_followers`` is passed twice positionally to the
    # followers ``GaussDecay`` -- confirm which parameters they map to.
    boosters = [
        BoolBooster('featured', 1.5),
        BoolBooster('from_certified', 1.2),
        ValueFactor('spatial_weight', missing=1),
        ValueFactor('temporal_weight', missing=1),
        GaussDecay('metrics.reuses', max_reuses, decay=0.1),
        GaussDecay('metrics.followers',
                   max_followers,
                   max_followers,
                   decay=0.1),
    ]

    @classmethod
    def is_indexable(cls, dataset):
        '''
        Tell whether ``dataset`` should be indexed.

        A dataset is indexable when it is not deleted, has at least
        one resource and is not private.
        '''
        return (dataset.deleted is None and len(dataset.resources) > 0
                and not dataset.private)

    @classmethod
    def get_suggest_weight(cls, temporal_weight, spatial_weight, featured):
        '''
        Compute the weight stored on the ``dataset_suggest`` entry.

        :param temporal_weight: weight derived from the temporal coverage
        :param spatial_weight: weight derived from the spatial coverage
        :param featured: whether the dataset is featured
        :returns: the product of both weights, multiplied by
            ``FEATURED_WEIGHT`` when ``featured`` is truthy
        '''
        featured_weight = 1 if not featured else FEATURED_WEIGHT
        return temporal_weight * spatial_weight * featured_weight

    @classmethod
    def serialize(cls, dataset):
        '''
        Build the document indexed for ``dataset``.

        ``temporal_coverage`` and ``temporal_weight`` are only present when
        the dataset has a coverage with both a start and an end;
        ``geozones``, ``granularity`` and ``spatial_weight`` are only present
        when ``dataset.spatial`` is set. The suggest weight is always
        computed, falling back on the default weights.

        :returns: the document as a ``dict``
        '''
        organization = None
        owner = None
        image_url = None
        spatial_weight = DEFAULT_SPATIAL_WEIGHT
        temporal_weight = DEFAULT_TEMPORAL_WEIGHT

        # The organization (or, failing that, the owner) is fetched again
        # by id; organization logo first, owner avatar as a fallback.
        if dataset.organization:
            organization = Organization.objects(
                id=dataset.organization.id).first()
            image_url = organization.logo(40, external=True)
        elif dataset.owner:
            owner = User.objects(id=dataset.owner.id).first()
            image_url = owner.avatar(40, external=True)

        # Evaluates to the falsy ``organization`` itself (``None``) when the
        # dataset has no organization.
        certified = organization and organization.certified

        document = {
            'title':
            dataset.title,
            'description':
            dataset.description,
            'license':
            getattr(dataset.license, 'id', None),
            'tags':
            dataset.tags,
            'badges': [badge.kind for badge in dataset.badges],
            'tag_suggest':
            dataset.tags,
            'resources': [{
                'title': r.title,
                'description': r.description,
                'format': r.format,
            } for r in dataset.resources],
            'format_suggest':
            [r.format.lower() for r in dataset.resources if r.format],
            'frequency':
            dataset.frequency,
            'organization':
            str(organization.id) if organization else None,
            'owner':
            str(owner.id) if owner else None,
            'dataset_suggest': {
                # The identifier is added as an input alongside title tokens.
                'input': cls.completer_tokenize(dataset.title) + [dataset.id],
                'output': dataset.title,
                'payload': {
                    'id': str(dataset.id),
                    'slug': dataset.slug,
                    'acronym': dataset.acronym,
                    'image_url': image_url,
                },
            },
            'created':
            dataset.created_at.strftime('%Y-%m-%dT%H:%M:%S'),
            'last_modified':
            dataset.last_modified.strftime('%Y-%m-%dT%H:%M:%S'),
            'metrics':
            dataset.metrics,
            'featured':
            dataset.featured,
            'from_certified':
            certified,
        }
        if (dataset.temporal_coverage is not None
                and dataset.temporal_coverage.start
                and dataset.temporal_coverage.end):
            start = dataset.temporal_coverage.start.toordinal()
            end = dataset.temporal_coverage.end.toordinal()
            # Covered span expressed in years, capped at MAX_TEMPORAL_WEIGHT.
            # NOTE(review): ``/`` floors on Python 2 but yields a float on
            # Python 3, while the field is declared as ``Long`` -- confirm.
            temporal_weight = min((end - start) / 365, MAX_TEMPORAL_WEIGHT)
            document.update({
                'temporal_coverage': {
                    'start': start,
                    'end': end
                },
                'temporal_weight': temporal_weight,
            })

        if dataset.spatial is not None:
            # Index precise zone labels and parents zone identifiers
            # to allow fast filtering.
            zone_ids = [z.id for z in dataset.spatial.zones]
            zones = GeoZone.objects(id__in=zone_ids).exclude('geom')
            parents = set()
            geozones = []
            coverage_level = ADMIN_LEVEL_MAX
            for zone in zones:
                geozones.append({
                    'id': zone.id,
                    'name': zone.name,
                    'keys': zone.keys_values
                })
                parents |= set(zone.parents)
                # Keep the smallest admin level among the covered zones.
                coverage_level = min(coverage_level, admin_levels[zone.level])

            # Parent zones are indexed by identifier only.
            geozones.extend([{'id': p} for p in parents])

            spatial_weight = ADMIN_LEVEL_MAX / coverage_level
            document.update({
                'geozones': geozones,
                'granularity': dataset.spatial.granularity,
                'spatial_weight': spatial_weight,
            })

        document['dataset_suggest']['weight'] = cls.get_suggest_weight(
            temporal_weight, spatial_weight, dataset.featured)

        # The acronym, when there is one, is an extra completion input.
        if dataset.acronym:
            document['dataset_suggest']['input'].append(dataset.acronym)

        return document
Exemplo n.º 3
0
class DatasetSearch(ModelSearchAdapter):
    '''
    Search adapter for ``Dataset``.

    Bundles the index mapping, the query configuration (searched fields,
    sorts, facets, boosters) and the serialization of a dataset into the
    document that gets indexed.
    '''
    model = Dataset
    fuzzy = True

    # Index mapping of the document built by ``serialize``.
    # NOTE(review): ``resources`` declares ``license`` whereas ``serialize``
    # emits ``format`` (also used by the ``format`` facet) -- confirm this
    # is intended.
    mapping = {
        'properties': {
            'title': {
                'type': 'string',
                'analyzer': i18n_analyzer,
                'fields': {
                    # Non analyzed copy of the title, used for sorting.
                    'raw': {'type': 'string', 'index': 'not_analyzed'},
                },
            },
            'description': {'type': 'string', 'analyzer': i18n_analyzer},
            'license': {'type': 'string', 'index': 'not_analyzed'},
            'frequency': {'type': 'string'},
            'organization': {'type': 'string'},
            'owner': {'type': 'string'},
            'tags': {
                'type': 'string',
                'index_name': 'tag',
                'index': 'not_analyzed',
                'fields': {
                    # Analyzed copy of the tags, used for full-text search.
                    'i18n': {'type': 'string', 'analyzer': i18n_analyzer},
                },
            },
            'badges': {
                'type': 'string',
                'index_name': 'badges',
                'index': 'not_analyzed',
            },
            'tag_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': False,
            },
            'resources': {
                'type': 'object',
                'index_name': 'resource',
                'properties': {
                    'title': {'type': 'string'},
                    'description': {'type': 'string'},
                    'license': {'type': 'string'},
                },
            },
            'format_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': False,
            },
            'dataset_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': True,
            },
            'created': {
                'type': 'date',
                'format': 'date_hour_minute_second',
            },
            'last_modified': {
                'type': 'date',
                'format': 'date_hour_minute_second',
            },
            'metrics': metrics_mapping(Dataset),
            'featured': {'type': 'boolean'},
            # Store dates as ordinals to handle pre-1900 dates.
            'temporal_coverage': {
                'type': 'object',
                'properties': {
                    'start': {'type': 'long'},
                    'end': {'type': 'long'},
                },
            },
            'geozones': {
                'type': 'object',
                'index_name': 'geozones',
                'properties': {
                    'id': {'type': 'string', 'index': 'not_analyzed'},
                    'name': {'type': 'string', 'index': 'not_analyzed'},
                    'keys': {'type': 'string', 'index': 'not_analyzed'},
                },
            },
            'granularity': {'type': 'string', 'index': 'not_analyzed'},
            # A 'geom' mapping ('geo_shape' type, '100m' precision) is
            # currently disabled.
            'extras': {'type': 'object', 'index_name': 'extra'},
        },
    }

    # Fields used for full-text search; the ``^n`` suffix is the boost.
    fields = (
        'geozones.keys^9',
        'geozones.name^9',
        'title^6',
        'tags.i18n^3',
        'description',
    )

    # Public sort key -> sort on the matching indexed field.
    sorts = {
        'title': Sort('title.raw'),
        'created': Sort('created'),
        'last_modified': Sort('last_modified'),
        'reuses': Sort('metrics.reuses'),
        'followers': Sort('metrics.followers'),
        'views': Sort('metrics.views'),
    }

    # Public facet name -> facet definition on an indexed field.
    facets = {
        'tag':
        TermFacet('tags'),
        'badge':
        TermFacet('badges', labelizer=dataset_badge_labelizer),
        'organization':
        ModelTermFacet('organization', Organization),
        'owner':
        ModelTermFacet('owner', User),
        'license':
        ModelTermFacet('license', License),
        'geozone':
        ModelTermFacet('geozones.id', GeoZone, zone_labelizer),
        'granularity':
        TermFacet('granularity', granularity_labelizer),
        'format':
        TermFacet('resources.format'),
        'reuses':
        RangeFacet('metrics.reuses'),
        'temporal_coverage':
        TemporalCoverageFacet('temporal_coverage'),
        'featured':
        BoolFacet('featured'),
        'extra':
        ExtrasFacet('extras'),
    }

    # NOTE(review): ``max_followers`` is passed twice positionally to the
    # followers ``GaussDecay`` -- confirm which parameters they map to.
    boosters = [
        BoolBooster('featured', 1.1),
        GaussDecay('metrics.reuses', max_reuses, decay=0.1),
        GaussDecay(
            'metrics.followers', max_followers, max_followers, decay=0.1),
    ]

    @classmethod
    def is_indexable(cls, dataset):
        '''
        Tell whether ``dataset`` should be indexed.

        A dataset is indexable when it is not deleted, has at least
        one resource and is not private.
        '''
        if dataset.deleted is not None:
            return False
        if len(dataset.resources) == 0:
            return False
        return not dataset.private

    @classmethod
    def serialize(cls, dataset):
        '''
        Build the document indexed for ``dataset``.

        ``temporal_coverage`` is only present when the dataset has
        a coverage with both a start and an end; ``geozones`` and
        ``granularity`` are only present when ``dataset.spatial`` is set.

        :returns: the document as a ``dict``
        '''
        org_id = None
        if dataset.organization is not None:
            org_id = str(dataset.organization.id)

        # Organization logo first, owner avatar as a fallback.
        image_url = None
        if dataset.organization:
            image_url = dataset.organization.logo(40)
        elif dataset.owner:
            image_url = dataset.owner.avatar(40)

        license_id = None
        if dataset.license is not None:
            license_id = dataset.license.id

        badges = [badge.kind for badge in dataset.badges]

        resources = []
        formats = []
        for resource in dataset.resources:
            resources.append({
                'title': resource.title,
                'description': resource.description,
                'format': resource.format,
            })
            if resource.format:
                formats.append(resource.format.lower())

        owner_id = None
        if dataset.owner:
            owner_id = str(dataset.owner.id)

        timestamp_format = '%Y-%m-%dT%H:%M:%S'

        document = {
            'title': dataset.title,
            'description': dataset.description,
            'license': license_id,
            'tags': dataset.tags,
            'badges': badges,
            'tag_suggest': dataset.tags,
            'resources': resources,
            'format_suggest': formats,
            'frequency': dataset.frequency,
            'organization': org_id,
            'owner': owner_id,
            'dataset_suggest': {
                # The identifier is added as an input alongside title tokens.
                'input': cls.completer_tokenize(dataset.title) + [dataset.id],
                'output': dataset.title,
                'payload': {
                    'id': str(dataset.id),
                    'slug': dataset.slug,
                    'image_url': image_url,
                },
            },
            'created': dataset.created_at.strftime(timestamp_format),
            'last_modified': dataset.last_modified.strftime(timestamp_format),
            'metrics': dataset.metrics,
            'extras': dataset.extras,
            'featured': dataset.featured,
        }

        coverage = dataset.temporal_coverage
        if coverage is not None and coverage.start and coverage.end:
            # Ordinals, as declared in the mapping.
            document['temporal_coverage'] = {
                'start': coverage.start.toordinal(),
                'end': coverage.end.toordinal(),
            }

        spatial = dataset.spatial
        if spatial is not None:
            # Index precise zone labels and parents zone identifiers
            # to allow fast filtering.
            zone_ids = [z.id for z in spatial.zones]
            geozones = []
            parents = set()
            for zone in GeoZone.objects(id__in=zone_ids):
                geozones.append({
                    'id': zone.id,
                    'name': zone.name,
                    'keys': zone.keys_values,
                })
                parents |= set(zone.parents)

            # Parent zones are indexed by identifier only.
            for parent in parents:
                geozones.append({'id': parent})

            document['geozones'] = geozones
            # ``spatial.geom`` is not indexed (the 'geom' mapping is disabled).
            document['granularity'] = spatial.granularity

        return document