예제 #1
0
 def test_map_metrics(self):
     # The mapping returned by ``search.metrics_mapping_for(Fake)`` must be
     # an ``object`` mapping exposing one typed property per ``Fake`` metric.
     expected = {
         'type': 'object',
         'properties': {
             'fake-metric-int': {'type': 'integer'},
             'fake-metric-float': {'type': 'float'},
         },
     }
     self.assert_dict_equal(search.metrics_mapping_for(Fake), expected)
예제 #2
0
 def test_map_metrics(self):
     # The mapping returned by ``search.metrics_mapping_for(Fake)`` must be
     # an ``object`` mapping exposing one typed property per ``Fake`` metric.
     expected = {
         'type': 'object',
         'properties': {
             'fake-metric-int': {'type': 'integer'},
             'fake-metric-float': {'type': 'float'},
         },
     }
     assert_json_equal(search.metrics_mapping_for(Fake), expected)
예제 #3
0
파일: search.py 프로젝트: koumoul-dev/udata
class DatasetSearch(ModelSearchAdapter):
    """Search adapter for ``Dataset``.

    The class-level attributes declare the mapped fields, the full-text
    search fields (``fields``), the available sorts, the facets and the
    score boosters. ``serialize`` builds the document indexed for one
    dataset and ``is_indexable`` tells whether a dataset should be indexed.
    """
    model = Dataset
    fuzzy = True
    exclude_fields = ['spatial.geom', 'spatial.zones.geom']

    class Meta:
        doc_type = 'Dataset'

    # --- Mapped fields -----------------------------------------------------
    # ``title.raw`` is a not_analyzed sub-field; it is the target of the
    # ``title`` sort declared below.
    title = String(analyzer=i18n_analyzer,
                   fields={'raw': String(index='not_analyzed')})
    description = String(analyzer=i18n_analyzer)
    license = String(index='not_analyzed')
    frequency = String(index='not_analyzed')
    organization = String(index='not_analyzed')
    owner = String(index='not_analyzed')
    # ``tags.i18n`` is the sub-field referenced by the full-text ``fields``.
    tags = String(index='not_analyzed',
                  fields={'i18n': String(index='not_analyzed')})
    badges = String(index='not_analyzed')
    tag_suggest = Completion(analyzer=simple,
                             search_analyzer=simple,
                             payloads=False)
    resources = Object(
        properties={
            'title': String(),
            'description': String(),
            'format': String(index='not_analyzed')
        })
    format_suggest = Completion(analyzer=simple,
                                search_analyzer=simple,
                                payloads=False)
    dataset_suggest = Completion(analyzer=simple,
                                 search_analyzer=simple,
                                 payloads=True)
    created = Date(format='date_hour_minute_second')
    last_modified = Date(format='date_hour_minute_second')
    metrics = metrics_mapping_for(Dataset)
    featured = Boolean()
    temporal_coverage = Nested(multi=False,
                               properties={
                                   'start': Long(),
                                   'end': Long()
                               })
    # Must be a bare field instance: a trailing comma after ``Long()`` would
    # make this attribute a one-element tuple instead of a ``Long`` field.
    temporal_weight = Long()
    geozones = Object(
        properties={
            'id': String(index='not_analyzed'),
            'name': String(index='not_analyzed'),
            'keys': String(index='not_analyzed')
        })
    granularity = String(index='not_analyzed')
    spatial_weight = Long()
    from_certified = Boolean()

    # --- Full-text search fields (``^n`` suffixes are per-field boosts) ----
    # NOTE(review): 'acronym' is listed here but is neither declared as a
    # mapped field above nor emitted at the top level by ``serialize``
    # (only inside the suggest payload) -- confirm this is intended.
    fields = (
        'geozones.keys^9',
        'geozones.name^9',
        'acronym^7',
        'title^6',
        'tags.i18n^3',
        'description',
    )

    # --- Sort name -> document field path ----------------------------------
    sorts = {
        'title': 'title.raw',
        'created': 'created',
        'last_modified': 'last_modified',
        'reuses': 'metrics.reuses',
        'followers': 'metrics.followers',
        'views': 'metrics.views',
    }

    # --- Facet name -> facet definition ------------------------------------
    facets = {
        'tag':
        TermsFacet(field='tags'),
        'badge':
        TermsFacet(field='badges', labelizer=dataset_badge_labelizer),
        'organization':
        ModelTermsFacet(field='organization', model=Organization),
        'owner':
        ModelTermsFacet(field='owner', model=User),
        'license':
        ModelTermsFacet(field='license', model=License),
        'geozone':
        ModelTermsFacet(field='geozones.id',
                        model=GeoZone,
                        labelizer=zone_labelizer),
        'granularity':
        TermsFacet(field='granularity', labelizer=granularity_labelizer),
        'format':
        TermsFacet(field='resources.format'),
        'reuses':
        RangeFacet(field='metrics.reuses',
                   ranges=[('none', (None, 1)), ('few', (1, 5)),
                           ('quite', (5, 10)), ('many', (10, None))],
                   labels={
                       'none': _('Never reused'),
                       'few': _('Little reused'),
                       'quite': _('Quite reused'),
                       'many': _('Heavily reused'),
                   }),
        'temporal_coverage':
        TemporalCoverageFacet(field='temporal_coverage'),
        'featured':
        BoolFacet(field='featured'),
    }

    # --- Score boosters ----------------------------------------------------
    boosters = [
        BoolBooster('featured', 1.5),
        BoolBooster('from_certified', 1.2),
        ValueFactor('spatial_weight', missing=1),
        ValueFactor('temporal_weight', missing=1),
        GaussDecay('metrics.reuses', max_reuses, decay=0.1),
        # ``max_followers`` is deliberately passed as both the second and
        # third positional argument; their meaning is defined by
        # ``GaussDecay`` (presumably origin and scale -- TODO confirm).
        GaussDecay('metrics.followers',
                   max_followers,
                   max_followers,
                   decay=0.1),
    ]

    @classmethod
    def is_indexable(cls, dataset):
        """Tell whether ``dataset`` should be indexed.

        A dataset is indexable when it is not deleted, has at least one
        resource and is not private.
        """
        return (dataset.deleted is None and len(dataset.resources) > 0
                and not dataset.private)

    @classmethod
    def get_suggest_weight(cls, temporal_weight, spatial_weight, featured):
        """Compute the weight stored in the ``dataset_suggest`` payload.

        The weight is the product of the temporal weight, the spatial
        weight and ``FEATURED_WEIGHT`` when ``featured`` is truthy
        (a neutral factor of 1 otherwise).
        """
        featured_weight = FEATURED_WEIGHT if featured else 1
        return temporal_weight * spatial_weight * featured_weight

    @classmethod
    def serialize(cls, dataset):
        """Build the document indexed for ``dataset``.

        The base document is always emitted. ``temporal_coverage`` and
        ``temporal_weight`` are only added when the dataset has a temporal
        coverage with both bounds set; ``geozones``, ``granularity`` and
        ``spatial_weight`` are only added when the dataset has a spatial
        coverage.

        Returns the document as a ``dict``.
        """
        organization = None
        owner = None
        image_url = None
        # Defaults feed the suggest weight when the dataset has no
        # temporal and/or spatial coverage.
        spatial_weight = DEFAULT_SPATIAL_WEIGHT
        temporal_weight = DEFAULT_TEMPORAL_WEIGHT

        # The organization takes precedence over the owner: the owner is
        # only looked up when the dataset has no organization.
        if dataset.organization:
            organization = Organization.objects(
                id=dataset.organization.id).first()
            image_url = organization.logo(40, external=True)
        elif dataset.owner:
            owner = User.objects(id=dataset.owner.id).first()
            image_url = owner.avatar(40, external=True)

        # Evaluates to ``None`` (not ``False``) when there is no organization.
        certified = organization and organization.certified

        document = {
            'title':
            dataset.title,
            'description':
            dataset.description,
            'license':
            getattr(dataset.license, 'id', None),
            'tags':
            dataset.tags,
            'badges': [badge.kind for badge in dataset.badges],
            'tag_suggest':
            dataset.tags,
            'resources': [{
                'title': r.title,
                'description': r.description,
                'format': r.format,
            } for r in dataset.resources],
            # Resources without a format are skipped.
            'format_suggest':
            [r.format.lower() for r in dataset.resources if r.format],
            'frequency':
            dataset.frequency,
            'organization':
            str(organization.id) if organization else None,
            'owner':
            str(owner.id) if owner else None,
            'dataset_suggest': {
                'input': cls.completer_tokenize(dataset.title) + [dataset.id],
                'output': dataset.title,
                'payload': {
                    'id': str(dataset.id),
                    'slug': dataset.slug,
                    'acronym': dataset.acronym,
                    'image_url': image_url,
                },
            },
            'created':
            dataset.created_at.strftime('%Y-%m-%dT%H:%M:%S'),
            'last_modified':
            dataset.last_modified.strftime('%Y-%m-%dT%H:%M:%S'),
            'metrics':
            dataset.metrics,
            'featured':
            dataset.featured,
            'from_certified':
            certified,
        }
        if (dataset.temporal_coverage is not None
                and dataset.temporal_coverage.start
                and dataset.temporal_coverage.end):
            start = dataset.temporal_coverage.start.toordinal()
            end = dataset.temporal_coverage.end.toordinal()
            # Covered span in days divided by 365, capped at
            # MAX_TEMPORAL_WEIGHT.
            temporal_weight = min((end - start) / 365, MAX_TEMPORAL_WEIGHT)
            document.update({
                'temporal_coverage': {
                    'start': start,
                    'end': end
                },
                'temporal_weight': temporal_weight,
            })

        if dataset.spatial is not None:
            # Index precise zone labels and parents zone identifiers
            # to allow fast filtering.
            zone_ids = [z.id for z in dataset.spatial.zones]
            zones = GeoZone.objects(id__in=zone_ids).exclude('geom')
            parents = set()
            geozones = []
            coverage_level = ADMIN_LEVEL_MAX
            for zone in zones:
                geozones.append({
                    'id': zone.id,
                    'name': zone.name,
                    'keys': zone.keys_values
                })
                parents |= set(zone.parents)
                # Keep the smallest admin level value found among the zones.
                coverage_level = min(coverage_level, admin_levels[zone.level])

            # Parent zones are indexed by identifier only.
            geozones.extend([{'id': p} for p in parents])

            spatial_weight = ADMIN_LEVEL_MAX / coverage_level
            document.update({
                'geozones': geozones,
                'granularity': dataset.spatial.granularity,
                'spatial_weight': spatial_weight,
            })

        # Computed last so it uses the weights refined above (or the
        # defaults when the dataset has no coverage).
        document['dataset_suggest']['weight'] = cls.get_suggest_weight(
            temporal_weight, spatial_weight, dataset.featured)

        if dataset.acronym:
            document['dataset_suggest']['input'].append(dataset.acronym)

        return document
예제 #4
0
파일: search.py 프로젝트: amagovpt/udata
class UserSearch(ModelSearchAdapter):
    """Search adapter for ``User``.

    Declares the mapped fields, the available sorts, the facets and the
    score boosters, and serializes a user into the document to index.
    """
    model = User
    fuzzy = True

    class Meta:
        doc_type = 'User'

    # --- Mapped fields -----------------------------------------------------
    first_name = String()
    last_name = String()
    about = String(analyzer=i18n_analyzer)
    organizations = String(index='not_analyzed')
    visible = Boolean()
    metrics = metrics_mapping_for(User)
    created = Date(format='date_hour_minute_second')
    user_suggest = Completion(
        analyzer=simple, search_analyzer=simple, payloads=True)

    # --- Sort name -> document field path ----------------------------------
    sorts = {
        'last_name': 'last_name',
        'first_name': 'first_name',
        'datasets': 'metrics.datasets',
        'reuses': 'metrics.reuses',
        'followers': 'metrics.followers',
        'views': 'metrics.views',
        'created': 'created',
    }

    # --- Facet name -> facet definition ------------------------------------
    facets = {
        'organization': ModelTermsFacet(
            field='organizations', model=Organization),
        'datasets': RangeFacet(
            field='metrics.datasets',
            ranges=[
                ('none', (None, 1)),
                ('few', (1, 5)),
                ('many', (5, None)),
            ],
            labels={
                'none': _('No datasets'),
                'few': _('Few datasets'),
                'many': _('Many datasets'),
            }),
        'followers': RangeFacet(
            field='metrics.followers',
            ranges=[
                ('none', (None, 1)),
                ('few', (1, 5)),
                ('many', (5, None)),
            ],
            labels={
                'none': _('No followers'),
                'few': _('Few followers'),
                'many': _('Many followers'),
            }),
    }

    # --- Score boosters ----------------------------------------------------
    boosters = [
        GaussDecay('metrics.reuses', 50, decay=0.8),
        GaussDecay('metrics.datasets', 50, decay=0.8),
        GaussDecay('metrics.followers', 200, 200, decay=0.8),
    ]

    @classmethod
    def serialize(cls, user):
        """Build the document indexed for ``user``.

        Returns a ``dict`` holding the user's names, description,
        organization identifiers, metrics, creation date, visibility and
        the ``user_suggest`` completion entry.
        """
        document = {
            'first_name': user.first_name,
            'last_name': user.last_name,
            'about': user.about,
            'organizations': [str(org.id) for org in user.organizations],
            'metrics': user.metrics,
            'created': to_iso_datetime(user.created_at),
        }
        # Completion entry: tokens of the full name plus the identifier.
        document['user_suggest'] = {
            'input': cls.completer_tokenize(user.fullname) + [user.id],
            'output': str(user.id),
            'payload': {
                'avatar_url': user.avatar(40, external=True),
                'first_name': user.first_name,
                'last_name': user.last_name,
                'slug': user.slug,
            },
        }
        document['visible'] = user.visible
        return document
예제 #5
0
파일: search.py 프로젝트: seiteta/udata
class ReuseSearch(ModelSearchAdapter):
    """Search adapter for ``Reuse``.

    Declares the mapped fields, the full-text search fields, the facets,
    the available sorts and the score boosters, and serializes a reuse
    into the document to index.
    """
    model = Reuse
    fuzzy = True

    class Meta:
        doc_type = 'Reuse'

    # --- Mapped fields -----------------------------------------------------
    # ``title.raw`` is a not_analyzed sub-field; it is the target of the
    # ``title`` sort declared below.
    title = String(analyzer=i18n_analyzer, fields={
        'raw': String(index='not_analyzed')
    })
    description = String(analyzer=i18n_analyzer)
    url = String(index='not_analyzed')
    organization = String(index='not_analyzed')
    owner = String(index='not_analyzed')
    type = String(index='not_analyzed')
    tags = String(index='not_analyzed', fields={
        'i18n': String(index='not_analyzed')
    })
    badges = String(index='not_analyzed')
    tag_suggest = Completion(analyzer=simple,
                             search_analyzer=simple,
                             payloads=False)
    # Named ``dataset`` (singular) so that the mapping matches the key
    # emitted by ``serialize`` and the ``dataset.id`` path used by the
    # ``dataset`` facet. Changing a mapping requires the index to be
    # rebuilt to take effect.
    dataset = Object(
        properties={
            'id': String(index='not_analyzed'),
            'title': String(),
        }
    )
    created = Date(format='date_hour_minute_second')
    last_modified = Date(format='date_hour_minute_second')
    metrics = metrics_mapping_for(Reuse)
    featured = Boolean()
    reuse_suggest = Completion(analyzer=simple,
                               search_analyzer=simple,
                               payloads=True)
    extras = Object()

    # --- Full-text search fields (``^n`` suffixes are per-field boosts) ----
    fields = (
        'title^4',
        'description^2',
        # Must follow the key under which ``serialize`` stores datasets.
        'dataset.title',
    )

    # --- Facet name -> facet definition ------------------------------------
    facets = {
        'tag': TermsFacet(field='tags'),
        'organization': ModelTermsFacet(field='organization',
                                        model=Organization),
        'owner': ModelTermsFacet(field='owner', model=User),
        'dataset': ModelTermsFacet(field='dataset.id', model=Dataset),
        'type': TermsFacet(field='type', labelizer=reuse_type_labelizer),
        'datasets': RangeFacet(field='metrics.datasets',
                               ranges=[('none', (None, 1)),
                                       ('few', (1, 5)),
                                       ('many', (5, None))],
                               labels={
                                    'none': _('No datasets'),
                                    'few': _('Few datasets'),
                                    'many': _('Many datasets'),
                               }),
        'followers': RangeFacet(field='metrics.followers',
                                ranges=[('none', (None, 1)),
                                        ('few', (1, 5)),
                                        ('many', (5, None))],
                                labels={
                                     'none': _('No followers'),
                                     'few': _('Few followers'),
                                     'many': _('Many followers'),
                                }),
        'badge': TermsFacet(field='badges', labelizer=reuse_badge_labelizer),
        'featured': BoolFacet(field='featured'),
    }

    # --- Sort name -> document field path ----------------------------------
    sorts = {
        'title': 'title.raw',
        'created': 'created',
        'last_modified': 'last_modified',
        'datasets': 'metrics.datasets',
        'followers': 'metrics.followers',
        'views': 'metrics.views',
    }

    # --- Score boosters ----------------------------------------------------
    boosters = [
        BoolBooster('featured', 1.1),
        GaussDecay('metrics.datasets', max_datasets, decay=0.8),
        GaussDecay('metrics.followers', max_followers, decay=0.8),
    ]

    @classmethod
    def is_indexable(cls, reuse):
        """Tell whether ``reuse`` should be indexed.

        A reuse is indexable when it is not deleted, references at least
        one dataset and is not private.
        """
        return (reuse.deleted is None and
                len(reuse.datasets) > 0 and
                not reuse.private)

    @classmethod
    def serialize(cls, reuse):
        """Build the document indexed for ``reuse``.

        Returns a ``dict`` with the reuse's descriptive fields, the
        identifiers of its organization and owner (``None`` when unset),
        its datasets, metrics, extras and the ``reuse_suggest`` completion
        entry.
        """
        return {
            'title': reuse.title,
            'description': reuse.description,
            'url': reuse.url,
            'organization': (str(reuse.organization.id)
                             if reuse.organization else None),
            'owner': str(reuse.owner.id) if reuse.owner else None,
            'type': reuse.type,
            'tags': reuse.tags,
            'tag_suggest': reuse.tags,
            'badges': [badge.kind for badge in reuse.badges],
            'created': reuse.created_at.strftime('%Y-%m-%dT%H:%M:%S'),
            'last_modified': reuse.last_modified.strftime('%Y-%m-%dT%H:%M:%S'),
            # Entries that are not ``Dataset`` instances are skipped.
            'dataset': [{
                'id': str(d.id),
                'title': d.title
            } for d in reuse.datasets if isinstance(d, Dataset)],
            'metrics': reuse.metrics,
            'featured': reuse.featured,
            'extras': reuse.extras,
            'reuse_suggest': {
                'input': cls.completer_tokenize(reuse.title) + [reuse.id],
                'output': str(reuse.id),
                'payload': {
                    'title': reuse.title,
                    'slug': reuse.slug,
                    'image_url': reuse.image(40, external=True),
                },
            },
        }
예제 #6
0
class OrganizationSearch(search.ModelSearchAdapter):
    """Search adapter for ``Organization``.

    Declares the mapped fields, the available sorts, the facets and the
    score boosters, and serializes an organization into the document to
    index.
    """
    model = Organization
    fuzzy = True

    class Meta:
        doc_type = 'Organization'

    # --- Mapped fields -----------------------------------------------------
    # ``name.raw`` is a not_analyzed sub-field; it is the target of the
    # ``name`` sort declared below.
    name = String(
        analyzer=search.i18n_analyzer,
        fields={'raw': String(index='not_analyzed')})
    acronym = String(index='not_analyzed')
    description = String(analyzer=search.i18n_analyzer)
    badges = String(index='not_analyzed')
    url = String(index='not_analyzed')
    created = Date(format='date_hour_minute_second')
    metrics = search.metrics_mapping_for(Organization)
    org_suggest = Completion(
        analyzer=simple, search_analyzer=simple, payloads=True)

    # --- Sort name -> document field path ----------------------------------
    # NOTE(review): 'last_modified' is offered as a sort but no such field
    # is mapped above or emitted by ``serialize`` -- confirm whether the
    # field is missing or the sort should be dropped.
    sorts = {
        'name': 'name.raw',
        'reuses': 'metrics.reuses',
        'datasets': 'metrics.datasets',
        'followers': 'metrics.followers',
        'views': 'metrics.views',
        'created': 'created',
        'last_modified': 'last_modified',
    }

    # --- Facet name -> facet definition ------------------------------------
    facets = {
        'reuses': RangeFacet(
            field='metrics.reuses',
            ranges=[
                ('none', (None, 1)),
                ('few', (1, 5)),
                ('many', (5, None)),
            ],
            labels={
                'none': _('No reuses'),
                'few': _('Few reuses'),
                'many': _('Many reuses'),
            }),
        'badge': TermsFacet(
            field='badges', labelizer=organization_badge_labelizer),
        'datasets': RangeFacet(
            field='metrics.datasets',
            ranges=[
                ('none', (None, 1)),
                ('few', (1, 5)),
                ('many', (5, None)),
            ],
            labels={
                'none': _('No datasets'),
                'few': _('Few datasets'),
                'many': _('Many datasets'),
            }),
        'followers': RangeFacet(
            field='metrics.followers',
            ranges=[
                ('none', (None, 1)),
                ('few', (1, 5)),
                ('many', (5, None)),
            ],
            labels={
                'none': _('No followers'),
                'few': _('Few followers'),
                'many': _('Many followers'),
            }),
    }

    # --- Score boosters ----------------------------------------------------
    # The decay values are wrapped in ``lazy(...)`` rather than given as
    # literals; how they are resolved is defined by ``lazy``.
    boosters = [
        search.GaussDecay(
            'metrics.followers', max_followers,
            decay=lazy('followers_decay')),
        search.GaussDecay(
            'metrics.reuses', max_reuses,
            decay=lazy('reuses_decay')),
        search.GaussDecay(
            'metrics.datasets', max_datasets,
            decay=lazy('datasets_decay')),
    ]

    @classmethod
    def is_indexable(cls, org):
        """Tell whether ``org`` should be indexed (i.e. it is not deleted)."""
        return org.deleted is None

    @classmethod
    def serialize(cls, organization):
        """Build the document indexed for ``organization``.

        Returns a ``dict`` with the organization's descriptive fields,
        metrics, badges, creation date and the ``org_suggest`` completion
        entry.
        """
        # Completion inputs: tokens of the name, then the identifier, then
        # the acronym when there is one.
        suggest_inputs = cls.completer_tokenize(organization.name)
        suggest_inputs.append(organization.id)
        if organization.acronym:
            suggest_inputs.append(organization.acronym)

        suggest_payload = {
            'name': organization.name,
            'acronym': organization.acronym,
            'image_url': organization.logo(40, external=True),
            'slug': organization.slug,
        }
        return {
            'name': organization.name,
            'acronym': organization.acronym,
            'description': organization.description,
            'url': organization.url,
            'metrics': organization.metrics,
            'badges': [badge.kind for badge in organization.badges],
            'created': to_iso_datetime(organization.created_at),
            'org_suggest': {
                'input': suggest_inputs,
                'output': str(organization.id),
                'payload': suggest_payload,
            },
        }