Exemplo n.º 1
0
 def test_map_metrics(self):
     """``search.metrics_mapping(Fake)`` returns the expected object mapping."""
     expected_properties = {
         'fake-metric-int': {'type': 'integer'},
         'fake-metric-float': {'type': 'float'},
     }
     expected = {
         'type': 'object',
         'index_name': 'metrics',
         'properties': expected_properties,
     }
     self.assertEqual(search.metrics_mapping(Fake), expected)
Exemplo n.º 2
0
 def test_map_metrics(self):
     # Expected mapping for ``Fake``: an object exposing one integer and one
     # float property, indexed under the ``metrics`` name.
     integer_property = {'type': 'integer'}
     float_property = {'type': 'float'}
     mapping = search.metrics_mapping(Fake)
     self.assertEqual(mapping, {
         'type': 'object',
         'index_name': 'metrics',
         'properties': {
             'fake-metric-int': integer_property,
             'fake-metric-float': float_property,
         },
     })
Exemplo n.º 3
0
class ReuseSearch(ModelSearchAdapter):
    """Search adapter for ``Reuse`` objects.

    Declares the searched fields, facets, sorts, index mapping and boosters,
    and provides the hooks deciding whether a reuse is indexed
    (``is_indexable``) and which document is indexed for it (``serialize``).
    """
    model = Reuse
    fuzzy = True
    fields = (
        'title^4',
        'description^2',
        # Linked datasets are indexed under the ``dataset`` key (see
        # ``mapping`` and ``serialize``), so that is the prefix to search on.
        'dataset.title',
    )
    facets = {
        'tag': TermFacet('tags'),
        'organization': ModelTermFacet('organization', Organization),
        'owner': ModelTermFacet('owner', User),
        'dataset': ModelTermFacet('dataset.id', Dataset),
        'type': ReuseTypeFacet('type'),
        'datasets': RangeFacet('metrics.datasets'),
        'followers': RangeFacet('metrics.followers'),
        'featured': BoolFacet('featured'),
        'extra': ExtrasFacet('extras'),
        'badge': TermFacet('badges', labelizer=reuse_badge_labelizer),
    }
    sorts = {
        'title': Sort('title.raw'),
        'created': Sort('created'),
        'last_modified': Sort('last_modified'),
        'datasets': Sort('metrics.datasets'),
        'followers': Sort('metrics.followers'),
        'views': Sort('metrics.views'),
    }
    mapping = {
        'properties': {
            'title': {
                'type': 'string',
                'analyzer': i18n_analyzer,
                # Non-analyzed sub-field referenced by the ``title`` sort.
                'fields': {
                    'raw': {'type': 'string', 'index': 'not_analyzed'},
                },
            },
            'description': {'type': 'string', 'analyzer': i18n_analyzer},
            'url': {'type': 'string'},
            'organization': {'type': 'string'},
            'owner': {'type': 'string'},
            'type': {'type': 'string'},
            'tags': {
                'type': 'string',
                'index_name': 'tag',
                'index': 'not_analyzed',
            },
            'tag_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': False,
            },
            'badges': {
                'type': 'string',
                'index_name': 'badges',
                'index': 'not_analyzed',
            },
            'created': {
                'type': 'date',
                'format': 'date_hour_minute_second',
            },
            'last_modified': {
                'type': 'date',
                'format': 'date_hour_minute_second',
            },
            'dataset': {
                'type': 'object',
                'properties': {
                    'id': {'type': 'string'},
                    'title': {'type': 'string'},
                },
            },
            'metrics': metrics_mapping(Reuse),
            'featured': {'type': 'boolean'},
            'reuse_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': True,
            },
            'extras': {'type': 'object', 'index_name': 'extra'},
        }
    }
    boosters = [
        BoolBooster('featured', 1.1),
        GaussDecay('metrics.datasets', max_datasets, decay=0.8),
        GaussDecay('metrics.followers', max_followers, decay=0.8),
    ]

    @classmethod
    def is_indexable(cls, reuse):
        """Return ``True`` when ``reuse`` should be indexed.

        A reuse is indexable when it is not deleted, references at least one
        dataset and is not private.
        """
        return (reuse.deleted is None and len(reuse.datasets) > 0
                and not reuse.private)

    @classmethod
    def serialize(cls, reuse):
        """Build the document indexed for ``reuse``.

        The organization, owner and dataset identifiers are stringified,
        dates are rendered as ``%Y-%m-%dT%H:%M:%S`` and only the linked
        objects that are ``Dataset`` instances are listed under ``dataset``.

        :param reuse: the reuse to serialize.
        :returns: a ``dict`` whose keys follow ``mapping``.
        """
        date_format = '%Y-%m-%dT%H:%M:%S'
        organization = reuse.organization
        owner = reuse.owner
        return {
            'title': reuse.title,
            'description': reuse.description,
            'url': reuse.url,
            'organization': str(organization.id) if organization else None,
            'owner': str(owner.id) if owner else None,
            'type': reuse.type,
            'tags': reuse.tags,
            # Tags feed both the facet field and the completion suggester.
            'tag_suggest': reuse.tags,
            'badges': [badge.kind for badge in reuse.badges],
            'created': reuse.created_at.strftime(date_format),
            'last_modified': reuse.last_modified.strftime(date_format),
            'dataset': [
                {'id': str(d.id), 'title': d.title}
                for d in reuse.datasets if isinstance(d, Dataset)
            ],
            'metrics': reuse.metrics,
            'featured': reuse.featured,
            'extras': reuse.extras,
            'reuse_suggest': {
                # NOTE(review): the raw ``reuse.id`` is appended here while
                # ``output`` uses ``str(reuse.id)`` -- presumably handled by
                # the serializer in use; confirm.
                'input': cls.completer_tokenize(reuse.title) + [reuse.id],
                'output': str(reuse.id),
                'payload': {
                    'title': reuse.title,
                    'slug': reuse.slug,
                    'image_url': reuse.image(40),
                },
            },
        }
Exemplo n.º 4
0
class OrganizationSearch(search.ModelSearchAdapter):
    """Search adapter for ``Organization`` objects.

    Declares the searched fields, sorts, facets, index mapping and boosters,
    plus the indexability check and the document serialization.
    """
    model = Organization
    fuzzy = True
    fields = ('name^6', 'acronym^6', 'description')
    sorts = {
        'name': search.Sort('name.raw'),
        'reuses': search.Sort('metrics.reuses'),
        'datasets': search.Sort('metrics.datasets'),
        'followers': search.Sort('metrics.followers'),
        'views': search.Sort('metrics.views'),
        'created': search.Sort('created'),
    }
    facets = {
        'reuses': search.RangeFacet('metrics.reuses'),
        'badge': search.TermFacet(
            'badges', labelizer=organization_badge_labelizer),
        'permitted_reuses': search.RangeFacet('metrics.permitted_reuses'),
        'datasets': search.RangeFacet('metrics.datasets'),
        'followers': search.RangeFacet('metrics.followers'),
    }
    mapping = {
        'properties': {
            'name': {
                'type': 'string',
                'analyzer': search.i18n_analyzer,
                # Non-analyzed sub-field referenced by the ``name`` sort.
                'fields': {
                    'raw': {'type': 'string', 'index': 'not_analyzed'},
                },
            },
            'acronym': {'type': 'string', 'index': 'not_analyzed'},
            'description': {
                'type': 'string',
                'analyzer': search.i18n_analyzer,
            },
            'badges': {
                'type': 'string',
                'index_name': 'badges',
                'index': 'not_analyzed',
            },
            'url': {'type': 'string'},
            'created': {
                'type': 'date',
                'format': 'date_hour_minute_second',
            },
            'metrics': search.metrics_mapping(Organization),
            'org_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': True,
            },
        }
    }
    boosters = [
        search.GaussDecay('metrics.followers', max_followers, decay=0.8),
        search.GaussDecay('metrics.reuses', max_reuses, decay=0.9),
        search.GaussDecay('metrics.datasets', max_datasets, decay=0.9),
    ]

    @classmethod
    def is_indexable(cls, org):
        """Return ``True`` when ``org`` has no deletion marker."""
        deleted = org.deleted
        return deleted is None

    @classmethod
    def serialize(cls, organization):
        """Build the document indexed for ``organization``.

        The suggester inputs are the tokens of the name, followed by the
        identifier and, when set, the acronym.
        """
        suggest_inputs = cls.completer_tokenize(organization.name)
        suggest_inputs.append(organization.id)
        if organization.acronym:
            suggest_inputs.append(organization.acronym)

        document = {
            'name': organization.name,
            'acronym': organization.acronym,
            'description': organization.description,
            'url': organization.url,
            'metrics': organization.metrics,
        }
        document['badges'] = [b.kind for b in organization.badges]
        document['created'] = organization.created_at.strftime(
            '%Y-%m-%dT%H:%M:%S')
        document['org_suggest'] = {
            'input': suggest_inputs,
            'output': str(organization.id),
            'payload': {
                'name': organization.name,
                'acronym': organization.acronym,
                'image_url': organization.logo(40),
                'slug': organization.slug,
            },
        }
        return document
Exemplo n.º 5
0
class DatasetSearch(ModelSearchAdapter):
    """Search adapter for ``Dataset`` objects (territory-based variant).

    Declares the index mapping, searched fields, sorts, facets and boosters,
    plus the indexability check and the document serialization.
    """
    model = Dataset
    fuzzy = True
    mapping = {
        'properties': {
            'title': {
                'type': 'string',
                'analyzer': i18n_analyzer,
                # Non-analyzed sub-field referenced by the ``title`` sort.
                'fields': {
                    'raw': {'type': 'string', 'index': 'not_analyzed'},
                },
            },
            'description': {'type': 'string', 'analyzer': i18n_analyzer},
            'license': {'type': 'string', 'index': 'not_analyzed'},
            'frequency': {'type': 'string'},
            'organization': {'type': 'string'},
            'owner': {'type': 'string'},
            'supplier': {'type': 'string'},
            'tags': {
                'type': 'string',
                'index_name': 'tag',
                'index': 'not_analyzed',
            },
            'tag_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': False,
            },
            # NOTE(review): ``serialize`` writes a ``format`` key for each
            # resource and no ``license``; this mapping declares the
            # opposite -- confirm which one is intended.
            'resources': {
                'type': 'object',
                'index_name': 'resource',
                'properties': {
                    'title': {'type': 'string'},
                    'description': {'type': 'string'},
                    'license': {'type': 'string'},
                },
            },
            'format_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': False,
            },
            'dataset_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': True,
            },
            'created': {
                'type': 'date',
                'format': 'date_hour_minute_second',
            },
            'last_modified': {
                'type': 'date',
                'format': 'date_hour_minute_second',
            },
            'metrics': metrics_mapping(Dataset),
            'featured': {'type': 'boolean'},
            # Dates are stored as ordinals to handle pre-1900 dates.
            'temporal_coverage': {
                'type': 'object',
                'properties': {
                    'start': {'type': 'long'},
                    'end': {'type': 'long'},
                },
            },
            'territories': {
                'type': 'object',
                'index_name': 'territories',
                'properties': {
                    'id': {'type': 'string'},
                    'name': {'type': 'string'},
                    'code': {'type': 'string'},
                },
            },
            'granularity': {'type': 'string', 'index': 'not_analyzed'},
            # Geometry indexing is currently disabled:
            # 'geom': {'type': 'geo_shape', 'precision': '100m'},
            'extras': {'type': 'object', 'index_name': 'extra'},
        }
    }
    # NOTE(review): ``code`` has no top-level counterpart in ``mapping`` or
    # ``serialize`` (only ``territories.code`` exists) -- confirm the intent.
    fields = ('title^6', 'tags^3', 'territories.name^3', 'description', 'code')
    sorts = {
        'title': Sort('title.raw'),
        'created': Sort('created'),
        'last_modified': Sort('last_modified'),
        'reuses': Sort('metrics.reuses'),
        'followers': Sort('metrics.followers'),
        'views': Sort('metrics.views'),
    }
    facets = {
        'tag': TermFacet('tags'),
        'organization': ModelTermFacet('organization', Organization),
        'owner': ModelTermFacet('owner', User),
        'supplier': ModelTermFacet('supplier', Organization),
        'license': ModelTermFacet('license', License),
        'territory': ModelTermFacet('territories.id', Territory),
        'granularity': TermFacet(
            'granularity', lambda l, v: SPATIAL_GRANULARITIES[v]),
        'format': TermFacet('resources.format'),
        'reuses': RangeFacet('metrics.reuses'),
        'temporal_coverage': TemporalCoverageFacet('temporal_coverage'),
        'featured': BoolFacet('featured'),
        'extra': ExtrasFacet('extras'),
    }
    boosters = [
        BoolBooster('featured', 1.1),
        BoolBooster('from_public_service', 1.3),
        GaussDecay('metrics.reuses', max_reuses, decay=0.8),
        GaussDecay(
            'metrics.followers', max_followers, max_followers, decay=0.8),
    ]

    @classmethod
    def is_indexable(cls, dataset):
        """Return ``True`` for a non-deleted, non-private dataset with resources."""
        if dataset.deleted is not None:
            return False
        if len(dataset.resources) == 0:
            return False
        return not dataset.private

    @classmethod
    def serialize(cls, dataset):
        """Build the document indexed for ``dataset``.

        ``temporal_coverage`` is only added when both bounds are set, and
        ``territories``/``granularity`` only when the dataset has spatial
        information.
        """
        date_format = '%Y-%m-%dT%H:%M:%S'
        organization = dataset.organization
        supplier = dataset.supplier

        org_id = None if organization is None else str(organization.id)
        supplier_id = None if supplier is None else str(supplier.id)
        if supplier_id == org_id:
            # A supplier identical to the organization is not indexed.
            supplier_id = None

        # Suggestion thumbnail: organization logo first, owner avatar next.
        image_url = None
        if organization:
            image_url = organization.logo(40)
        elif dataset.owner:
            image_url = dataset.owner.avatar(40)

        resources = []
        formats = []
        for resource in dataset.resources:
            resources.append({
                'title': resource.title,
                'description': resource.description,
                'format': resource.format,
            })
            if resource.format:
                formats.append(resource.format.lower())

        license_id = None
        if dataset.license is not None:
            license_id = dataset.license.id

        owner_id = None
        if dataset.owner:
            owner_id = str(dataset.owner.id)

        # TODO: extract this into a plugin
        from_public_service = False
        if dataset.organization:
            from_public_service = dataset.organization.public_service

        document = {
            'title': dataset.title,
            'description': dataset.description,
            'license': license_id,
            'tags': dataset.tags,
            'tag_suggest': dataset.tags,
            'resources': resources,
            'format_suggest': formats,
            'frequency': dataset.frequency,
            'organization': org_id,
            'owner': owner_id,
            'supplier': supplier_id,
            'dataset_suggest': {
                'input': cls.completer_tokenize(dataset.title),
                'output': dataset.title,
                'payload': {
                    'id': str(dataset.id),
                    'slug': dataset.slug,
                    'image_url': image_url,
                },
            },
            'created': dataset.created_at.strftime(date_format),
            'last_modified': dataset.last_modified.strftime(date_format),
            'metrics': dataset.metrics,
            'extras': dataset.extras,
            'featured': dataset.featured,
            'from_public_service': from_public_service,
        }

        coverage = dataset.temporal_coverage
        if coverage is not None and coverage.start and coverage.end:
            document['temporal_coverage'] = {
                'start': coverage.start.toordinal(),
                'end': coverage.end.toordinal(),
            }

        spatial = dataset.spatial
        if spatial is not None:
            document['territories'] = [
                {'id': str(t.id), 'name': t.name, 'code': t.code}
                for t in spatial.territories
            ]
            # 'geom' (spatial.geom) is not indexed for now.
            document['granularity'] = spatial.granularity

        return document
Exemplo n.º 6
0
class UserSearch(ModelSearchAdapter):
    """Search adapter for ``User`` objects.

    Declares the index mapping, searched fields, sorts, facets and boosters,
    plus the document serialization.
    """
    model = User
    fuzzy = True
    # analyzer = 'not_analyzed'

    mapping = {
        'properties': {
            'first_name': {
                'type': 'string',
            },
            'last_name': {
                'type': 'string',
            },
            'about': {
                'type': 'string',
                'analyzer': i18n_analyzer,
            },
            'organizations': {
                'type': 'string',
                'index_name': 'organization',
            },
            'visible': {
                'type': 'boolean',
            },
            'metrics': metrics_mapping(User),
            'created': {
                'type': 'date',
                'format': 'date_hour_minute_second',
            },
            'user_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': True,
            },
        }
    }

    fields = ('last_name^6', 'first_name^5', 'about')
    sorts = {
        'last_name': Sort('last_name'),
        'first_name': Sort('first_name'),
        'datasets': Sort('metrics.datasets'),
        'reuses': Sort('metrics.reuses'),
        'followers': Sort('metrics.followers'),
        'views': Sort('metrics.views'),
        'created': Sort('created'),
    }
    facets = {
        'organization': ModelTermFacet('organizations', Organization),
        'reuses': RangeFacet('metrics.reuses'),
        'datasets': RangeFacet('metrics.datasets'),
    }
    boosters = [
        GaussDecay('metrics.reuses', 50, decay=0.8),
        GaussDecay('metrics.datasets', 50, decay=0.8),
        GaussDecay('metrics.followers', 200, 200, decay=0.8),
    ]

    @classmethod
    def serialize(cls, user):
        """Build the document indexed for ``user``.

        The suggester inputs are the tokens of the full name followed by the
        user identifier.
        """
        organization_ids = []
        for org in user.organizations:
            organization_ids.append(str(org.id))

        document = {
            'first_name': user.first_name,
            'last_name': user.last_name,
            'about': user.about,
            'organizations': organization_ids,
            'metrics': user.metrics,
            'created': user.created_at.strftime('%Y-%m-%dT%H:%M:%S'),
        }
        document['user_suggest'] = {
            'input': cls.completer_tokenize(user.fullname) + [user.id],
            'output': str(user.id),
            'payload': {
                'avatar_url': user.avatar(40),
                'first_name': user.first_name,
                'last_name': user.last_name,
                'slug': user.slug,
            },
        }
        document['visible'] = user.visible
        return document
Exemplo n.º 7
0
class DatasetSearch(ModelSearchAdapter):
    """Search adapter for ``Dataset`` objects (geozone-based variant).

    Declares the index mapping, searched fields, sorts, facets and boosters,
    plus the indexability check and the document serialization.
    """
    model = Dataset
    fuzzy = True
    mapping = {
        'properties': {
            'title': {
                'type': 'string',
                'analyzer': i18n_analyzer,
                # Non-analyzed sub-field referenced by the ``title`` sort.
                'fields': {
                    'raw': {'type': 'string', 'index': 'not_analyzed'},
                },
            },
            'description': {'type': 'string', 'analyzer': i18n_analyzer},
            'license': {'type': 'string', 'index': 'not_analyzed'},
            'frequency': {'type': 'string'},
            'organization': {'type': 'string'},
            'owner': {'type': 'string'},
            'tags': {
                'type': 'string',
                'index_name': 'tag',
                'index': 'not_analyzed',
                # Analyzed sub-field referenced by the ``tags.i18n`` search
                # field.
                'fields': {
                    'i18n': {'type': 'string', 'analyzer': i18n_analyzer},
                },
            },
            'badges': {
                'type': 'string',
                'index_name': 'badges',
                'index': 'not_analyzed',
            },
            'tag_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': False,
            },
            'resources': {
                'type': 'object',
                'index_name': 'resource',
                'properties': {
                    'title': {'type': 'string'},
                    'description': {'type': 'string'},
                    'license': {'type': 'string'},
                },
            },
            'format_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': False,
            },
            'dataset_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': True,
            },
            'created': {
                'type': 'date',
                'format': 'date_hour_minute_second',
            },
            'last_modified': {
                'type': 'date',
                'format': 'date_hour_minute_second',
            },
            'metrics': metrics_mapping(Dataset),
            'featured': {'type': 'boolean'},
            # Store dates as ordinals to handle pre-1900 dates.
            'temporal_coverage': {
                'type': 'object',
                'properties': {
                    'start': {'type': 'long'},
                    'end': {'type': 'long'},
                },
            },
            'geozones': {
                'type': 'object',
                'index_name': 'geozones',
                'properties': {
                    'id': {'type': 'string', 'index': 'not_analyzed'},
                    'name': {'type': 'string', 'index': 'not_analyzed'},
                    'keys': {'type': 'string', 'index': 'not_analyzed'},
                },
            },
            'granularity': {'type': 'string', 'index': 'not_analyzed'},
            # Geometry indexing is currently disabled:
            # 'geom': {'type': 'geo_shape', 'precision': '100m'},
            'extras': {'type': 'object', 'index_name': 'extra'},
        }
    }
    fields = (
        'geozones.keys^9',
        'geozones.name^9',
        'title^6',
        'tags.i18n^3',
        'description',
    )
    sorts = {
        'title': Sort('title.raw'),
        'created': Sort('created'),
        'last_modified': Sort('last_modified'),
        'reuses': Sort('metrics.reuses'),
        'followers': Sort('metrics.followers'),
        'views': Sort('metrics.views'),
    }
    facets = {
        'tag': TermFacet('tags'),
        'badge': TermFacet('badges', labelizer=dataset_badge_labelizer),
        'organization': ModelTermFacet('organization', Organization),
        'owner': ModelTermFacet('owner', User),
        'license': ModelTermFacet('license', License),
        'geozone': ModelTermFacet('geozones.id', GeoZone, zone_labelizer),
        'granularity': TermFacet('granularity', granularity_labelizer),
        'format': TermFacet('resources.format'),
        'reuses': RangeFacet('metrics.reuses'),
        'temporal_coverage': TemporalCoverageFacet('temporal_coverage'),
        'featured': BoolFacet('featured'),
        'extra': ExtrasFacet('extras'),
    }
    boosters = [
        BoolBooster('featured', 1.1),
        GaussDecay('metrics.reuses', max_reuses, decay=0.1),
        GaussDecay(
            'metrics.followers', max_followers, max_followers, decay=0.1),
    ]

    @classmethod
    def is_indexable(cls, dataset):
        """Return ``True`` for a non-deleted, non-private dataset with resources."""
        return not (dataset.deleted is not None
                    or len(dataset.resources) == 0
                    or dataset.private)

    @classmethod
    def serialize(cls, dataset):
        """Build the document indexed for ``dataset``.

        ``temporal_coverage`` is only added when both bounds are set, and
        ``geozones``/``granularity`` only when the dataset has spatial
        information.
        """
        date_format = '%Y-%m-%dT%H:%M:%S'
        organization = dataset.organization

        org_id = None
        if organization is not None:
            org_id = str(organization.id)

        # Suggestion thumbnail: organization logo first, owner avatar next.
        image_url = None
        if organization:
            image_url = organization.logo(40)
        elif dataset.owner:
            image_url = dataset.owner.avatar(40)

        license_id = None
        if dataset.license is not None:
            license_id = dataset.license.id

        resources = []
        formats = []
        for resource in dataset.resources:
            resources.append({
                'title': resource.title,
                'description': resource.description,
                'format': resource.format,
            })
            if resource.format:
                formats.append(resource.format.lower())

        owner_id = None
        if dataset.owner:
            owner_id = str(dataset.owner.id)

        document = {
            'title': dataset.title,
            'description': dataset.description,
            'license': license_id,
            'tags': dataset.tags,
            'badges': [badge.kind for badge in dataset.badges],
            'tag_suggest': dataset.tags,
            'resources': resources,
            'format_suggest': formats,
            'frequency': dataset.frequency,
            'organization': org_id,
            'owner': owner_id,
            'dataset_suggest': {
                'input': cls.completer_tokenize(dataset.title) + [dataset.id],
                'output': dataset.title,
                'payload': {
                    'id': str(dataset.id),
                    'slug': dataset.slug,
                    'image_url': image_url,
                },
            },
            'created': dataset.created_at.strftime(date_format),
            'last_modified': dataset.last_modified.strftime(date_format),
            'metrics': dataset.metrics,
            'extras': dataset.extras,
            'featured': dataset.featured,
        }

        coverage = dataset.temporal_coverage
        if coverage is not None and coverage.start and coverage.end:
            document['temporal_coverage'] = {
                'start': coverage.start.toordinal(),
                'end': coverage.end.toordinal(),
            }

        spatial = dataset.spatial
        if spatial is not None:
            # Index precise zone labels and parents zone identifiers
            # to allow fast filtering.
            zone_ids = [z.id for z in spatial.zones]
            geozones = []
            parent_ids = set()
            for zone in GeoZone.objects(id__in=zone_ids):
                geozones.append({
                    'id': zone.id,
                    'name': zone.name,
                    'keys': zone.keys_values,
                })
                parent_ids |= set(zone.parents)

            # Parent zones are listed by identifier only.
            geozones.extend({'id': parent_id} for parent_id in parent_ids)

            document['geozones'] = geozones
            # 'geom' (spatial.geom) is not indexed for now.
            document['granularity'] = spatial.granularity

        return document