def test_map_metrics(self):
    # The mapping built by ``search.metrics_mapping`` for ``Fake`` must be
    # an object indexed as ``metrics`` with one typed property per metric.
    expected_properties = {
        'fake-metric-int': {'type': 'integer'},
        'fake-metric-float': {'type': 'float'},
    }
    expected = {
        'type': 'object',
        'index_name': 'metrics',
        'properties': expected_properties,
    }

    self.assertEqual(search.metrics_mapping(Fake), expected)
def test_map_metrics(self):
    # Expected per-metric properties of the mapping generated for ``Fake``.
    int_metric = {'type': 'integer'}
    float_metric = {'type': 'float'}

    result = search.metrics_mapping(Fake)

    self.assertEqual(result, {
        'type': 'object',
        'index_name': 'metrics',
        'properties': {
            'fake-metric-int': int_metric,
            'fake-metric-float': float_metric,
        },
    })
class ReuseSearch(ModelSearchAdapter):
    """Search adapter for ``Reuse`` documents.

    Declares the searched fields, facets, sorts, index mapping and score
    boosters, and provides the hooks deciding whether a reuse is indexed
    (``is_indexable``) and what document is indexed for it (``serialize``).
    """
    model = Reuse
    fuzzy = True
    fields = (
        'title^4',
        'description^2',
        # Linked datasets are indexed under the ``dataset`` key (see
        # ``mapping``, the ``dataset`` facet and ``serialize``), so their
        # titles have to be searched as ``dataset.title``; the former
        # ``datasets.title`` did not match any indexed field.
        'dataset.title',
    )
    facets = {
        'tag': TermFacet('tags'),
        'organization': ModelTermFacet('organization', Organization),
        'owner': ModelTermFacet('owner', User),
        'dataset': ModelTermFacet('dataset.id', Dataset),
        'type': ReuseTypeFacet('type'),
        'datasets': RangeFacet('metrics.datasets'),
        'followers': RangeFacet('metrics.followers'),
        'featured': BoolFacet('featured'),
        'extra': ExtrasFacet('extras'),
        'badge': TermFacet('badges', labelizer=reuse_badge_labelizer),
    }
    sorts = {
        'title': Sort('title.raw'),
        'created': Sort('created'),
        'last_modified': Sort('last_modified'),
        'datasets': Sort('metrics.datasets'),
        'followers': Sort('metrics.followers'),
        'views': Sort('metrics.views'),
    }
    mapping = {
        'properties': {
            'title': {
                'type': 'string',
                'analyzer': i18n_analyzer,
                'fields': {
                    # Non-analyzed copy of the title, used by the
                    # ``title`` sort (``title.raw``).
                    'raw': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    }
                }
            },
            'description': {
                'type': 'string',
                'analyzer': i18n_analyzer
            },
            'url': {
                'type': 'string'
            },
            'organization': {
                'type': 'string'
            },
            'owner': {
                'type': 'string'
            },
            'type': {
                'type': 'string'
            },
            'tags': {
                'type': 'string',
                'index_name': 'tag',
                'index': 'not_analyzed'
            },
            'tag_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': False,
            },
            'badges': {
                'type': 'string',
                'index_name': 'badges',
                'index': 'not_analyzed'
            },
            'created': {
                'type': 'date',
                'format': 'date_hour_minute_second'
            },
            'last_modified': {
                'type': 'date',
                'format': 'date_hour_minute_second'
            },
            'dataset': {
                'type': 'object',
                'properties': {
                    'id': {
                        'type': 'string'
                    },
                    'title': {
                        'type': 'string'
                    }
                }
            },
            'metrics': metrics_mapping(Reuse),
            'featured': {
                'type': 'boolean'
            },
            'reuse_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': True,
            },
            'extras': {
                'type': 'object',
                'index_name': 'extra',
            },
        }
    }
    boosters = [
        BoolBooster('featured', 1.1),
        GaussDecay('metrics.datasets', max_datasets, decay=0.8),
        GaussDecay('metrics.followers', max_followers, decay=0.8),
    ]

    @classmethod
    def is_indexable(cls, reuse):
        """Tell whether ``reuse`` should be indexed.

        A reuse is indexable when it is not deleted, references at least
        one dataset and is not private.
        """
        return (reuse.deleted is None and
                len(reuse.datasets) > 0 and
                not reuse.private)

    @classmethod
    def serialize(cls, reuse):
        """Build the document indexed for ``reuse``.

        The returned dict uses the keys declared in ``mapping``. Dates are
        formatted as ``%Y-%m-%dT%H:%M:%S`` to match the
        ``date_hour_minute_second`` format declared for them.
        """
        return {
            'title': reuse.title,
            'description': reuse.description,
            'url': reuse.url,
            'organization': (str(reuse.organization.id)
                             if reuse.organization else None),
            'owner': str(reuse.owner.id) if reuse.owner else None,
            'type': reuse.type,
            'tags': reuse.tags,
            'tag_suggest': reuse.tags,
            'badges': [badge.kind for badge in reuse.badges],
            'created': reuse.created_at.strftime('%Y-%m-%dT%H:%M:%S'),
            'last_modified': reuse.last_modified.strftime(
                '%Y-%m-%dT%H:%M:%S'),
            # Only entries that really are ``Dataset`` instances are
            # indexed (presumably to skip references that could not be
            # resolved -- confirm against the model layer).
            'dataset': [{
                'id': str(d.id),
                'title': d.title
            } for d in reuse.datasets if isinstance(d, Dataset)],
            'metrics': reuse.metrics,
            'featured': reuse.featured,
            'extras': reuse.extras,
            'reuse_suggest': {
                # The identifier is added to the tokenized title so the
                # reuse can also be suggested from its id.
                'input': cls.completer_tokenize(reuse.title) + [reuse.id],
                'output': str(reuse.id),
                'payload': {
                    'title': reuse.title,
                    'slug': reuse.slug,
                    'image_url': reuse.image(40),
                },
            },
        }
class OrganizationSearch(search.ModelSearchAdapter):
    """Search adapter for ``Organization`` documents.

    Declares the searched fields, sorts, facets, index mapping and score
    boosters, plus the indexing hooks ``is_indexable`` and ``serialize``.
    """
    model = Organization
    fuzzy = True
    fields = ('name^6', 'acronym^6', 'description')

    sorts = {
        'name': search.Sort('name.raw'),
        'reuses': search.Sort('metrics.reuses'),
        'datasets': search.Sort('metrics.datasets'),
        'followers': search.Sort('metrics.followers'),
        'views': search.Sort('metrics.views'),
        'created': search.Sort('created'),
    }

    facets = {
        'reuses': search.RangeFacet('metrics.reuses'),
        'badge': search.TermFacet(
            'badges', labelizer=organization_badge_labelizer),
        'permitted_reuses': search.RangeFacet('metrics.permitted_reuses'),
        'datasets': search.RangeFacet('metrics.datasets'),
        'followers': search.RangeFacet('metrics.followers'),
    }

    mapping = {
        'properties': {
            'name': {
                'type': 'string',
                'analyzer': search.i18n_analyzer,
                # ``name.raw`` is the non-analyzed copy used for sorting.
                'fields': {
                    'raw': {'type': 'string', 'index': 'not_analyzed'},
                },
            },
            'acronym': {'type': 'string', 'index': 'not_analyzed'},
            'description': {
                'type': 'string',
                'analyzer': search.i18n_analyzer,
            },
            'badges': {
                'type': 'string',
                'index_name': 'badges',
                'index': 'not_analyzed',
            },
            'url': {'type': 'string'},
            'created': {
                'type': 'date',
                'format': 'date_hour_minute_second',
            },
            'metrics': search.metrics_mapping(Organization),
            'org_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': True,
            },
        },
    }

    boosters = [
        search.GaussDecay('metrics.followers', max_followers, decay=0.8),
        search.GaussDecay('metrics.reuses', max_reuses, decay=0.9),
        search.GaussDecay('metrics.datasets', max_datasets, decay=0.9),
    ]

    @classmethod
    def is_indexable(cls, org):
        """Return ``True`` unless the organization has a deletion marker."""
        return org.deleted is None

    @classmethod
    def serialize(cls, organization):
        """Build the document indexed for ``organization``."""
        # Suggestion inputs: tokenized name, then the identifier, then the
        # acronym when there is one.
        suggest_inputs = cls.completer_tokenize(organization.name)
        suggest_inputs.append(organization.id)
        if organization.acronym:
            suggest_inputs.append(organization.acronym)

        badge_kinds = [badge.kind for badge in organization.badges]
        created = organization.created_at.strftime('%Y-%m-%dT%H:%M:%S')

        suggestion = {
            'input': suggest_inputs,
            'output': str(organization.id),
            'payload': {
                'name': organization.name,
                'acronym': organization.acronym,
                'image_url': organization.logo(40),
                'slug': organization.slug,
            },
        }

        return {
            'name': organization.name,
            'acronym': organization.acronym,
            'description': organization.description,
            'url': organization.url,
            'metrics': organization.metrics,
            'badges': badge_kinds,
            'created': created,
            'org_suggest': suggestion,
        }
class DatasetSearch(ModelSearchAdapter):
    """Search adapter for ``Dataset`` documents.

    Declares the index mapping, searched fields, sorts, facets and score
    boosters, plus the indexing hooks ``is_indexable`` and ``serialize``.
    """
    model = Dataset
    fuzzy = True

    mapping = {
        'properties': {
            'title': {
                'type': 'string',
                'analyzer': i18n_analyzer,
                # ``title.raw`` is the non-analyzed copy used for sorting.
                'fields': {
                    'raw': {'type': 'string', 'index': 'not_analyzed'},
                },
            },
            'description': {'type': 'string', 'analyzer': i18n_analyzer},
            'license': {'type': 'string', 'index': 'not_analyzed'},
            'frequency': {'type': 'string'},
            'organization': {'type': 'string'},
            'owner': {'type': 'string'},
            'supplier': {'type': 'string'},
            'tags': {
                'type': 'string',
                'index_name': 'tag',
                'index': 'not_analyzed',
            },
            'tag_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': False,
            },
            'resources': {
                'type': 'object',
                'index_name': 'resource',
                'properties': {
                    'title': {'type': 'string'},
                    'description': {'type': 'string'},
                    'license': {'type': 'string'},
                },
            },
            'format_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': False,
            },
            'dataset_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': True,
            },
            'created': {
                'type': 'date',
                'format': 'date_hour_minute_second',
            },
            'last_modified': {
                'type': 'date',
                'format': 'date_hour_minute_second',
            },
            'metrics': metrics_mapping(Dataset),
            'featured': {'type': 'boolean'},
            # Store dates as ordinals to handle pre-1900 dates
            'temporal_coverage': {
                'type': 'object',
                'properties': {
                    'start': {'type': 'long'},
                    'end': {'type': 'long'},
                },
            },
            'territories': {
                'type': 'object',
                'index_name': 'territories',
                'properties': {
                    'id': {'type': 'string'},
                    'name': {'type': 'string'},
                    'code': {'type': 'string'},
                },
            },
            'granularity': {'type': 'string', 'index': 'not_analyzed'},
            # 'geom': {
            #     'type': 'geo_shape',
            #     'precision': '100m',
            # },
            'extras': {'type': 'object', 'index_name': 'extra'},
        },
    }

    fields = (
        'title^6',
        'tags^3',
        'territories.name^3',
        'description',
        'code',
    )

    sorts = {
        'title': Sort('title.raw'),
        'created': Sort('created'),
        'last_modified': Sort('last_modified'),
        'reuses': Sort('metrics.reuses'),
        'followers': Sort('metrics.followers'),
        'views': Sort('metrics.views'),
    }

    facets = {
        'tag': TermFacet('tags'),
        'organization': ModelTermFacet('organization', Organization),
        'owner': ModelTermFacet('owner', User),
        'supplier': ModelTermFacet('supplier', Organization),
        'license': ModelTermFacet('license', License),
        'territory': ModelTermFacet('territories.id', Territory),
        'granularity': TermFacet(
            'granularity', lambda l, v: SPATIAL_GRANULARITIES[v]),
        'format': TermFacet('resources.format'),
        'reuses': RangeFacet('metrics.reuses'),
        'temporal_coverage': TemporalCoverageFacet('temporal_coverage'),
        'featured': BoolFacet('featured'),
        'extra': ExtrasFacet('extras'),
    }

    boosters = [
        BoolBooster('featured', 1.1),
        BoolBooster('from_public_service', 1.3),
        GaussDecay('metrics.reuses', max_reuses, decay=0.8),
        GaussDecay(
            'metrics.followers', max_followers, max_followers, decay=0.8),
    ]

    @classmethod
    def is_indexable(cls, dataset):
        """Tell whether ``dataset`` should be indexed.

        A dataset is indexable when it is not deleted, has at least one
        resource and is not private.
        """
        if dataset.deleted is not None:
            return False
        if len(dataset.resources) == 0:
            return False
        return not dataset.private

    @classmethod
    def serialize(cls, dataset):
        """Build the document indexed for ``dataset``."""
        date_format = '%Y-%m-%dT%H:%M:%S'

        org_id = None
        if dataset.organization is not None:
            org_id = str(dataset.organization.id)

        # The supplier is only indexed when it differs from the
        # owning organization.
        supplier_id = None
        if dataset.supplier is not None:
            supplier_id = str(dataset.supplier.id)
        if supplier_id == org_id:
            supplier_id = None

        # Suggestion picture: organization logo first, owner avatar next.
        image_url = None
        if dataset.organization:
            image_url = dataset.organization.logo(40)
        elif dataset.owner:
            image_url = dataset.owner.avatar(40)

        license_id = None
        if dataset.license is not None:
            license_id = dataset.license.id

        resources = [
            {
                'title': r.title,
                'description': r.description,
                'format': r.format,
            }
            for r in dataset.resources
        ]
        formats = [r.format.lower() for r in dataset.resources if r.format]

        owner_id = str(dataset.owner.id) if dataset.owner else None

        suggestion = {
            'input': cls.completer_tokenize(dataset.title),
            'output': dataset.title,
            'payload': {
                'id': str(dataset.id),
                'slug': dataset.slug,
                'image_url': image_url,
            },
        }

        # TODO: extract this into plugin
        if dataset.organization:
            from_public_service = dataset.organization.public_service
        else:
            from_public_service = False

        document = {
            'title': dataset.title,
            'description': dataset.description,
            'license': license_id,
            'tags': dataset.tags,
            'tag_suggest': dataset.tags,
            'resources': resources,
            'format_suggest': formats,
            'frequency': dataset.frequency,
            'organization': org_id,
            'owner': owner_id,
            'supplier': supplier_id,
            'dataset_suggest': suggestion,
            'created': dataset.created_at.strftime(date_format),
            'last_modified': dataset.last_modified.strftime(date_format),
            'metrics': dataset.metrics,
            'extras': dataset.extras,
            'featured': dataset.featured,
            'from_public_service': from_public_service,
        }

        coverage = dataset.temporal_coverage
        if coverage is not None and coverage.start and coverage.end:
            # Both bounds are required; they are stored as ordinals.
            document['temporal_coverage'] = {
                'start': coverage.start.toordinal(),
                'end': coverage.end.toordinal(),
            }

        if dataset.spatial is not None:
            document['territories'] = [
                {'id': str(t.id), 'name': t.name, 'code': t.code}
                for t in dataset.spatial.territories
            ]
            # 'geom': dataset.spatial.geom,
            document['granularity'] = dataset.spatial.granularity

        return document
class UserSearch(ModelSearchAdapter):
    """Search adapter for ``User`` documents.

    Declares the index mapping, searched fields, sorts, facets and score
    boosters, plus the ``serialize`` hook building the indexed document.
    """
    model = User
    fuzzy = True
    # analyzer = 'not_analyzed'

    mapping = {
        'properties': {
            'first_name': {
                'type': 'string',
            },
            'last_name': {
                'type': 'string',
            },
            'about': {
                'type': 'string',
                'analyzer': i18n_analyzer,
            },
            'organizations': {
                'type': 'string',
                'index_name': 'organization',
            },
            'visible': {
                'type': 'boolean',
            },
            'metrics': metrics_mapping(User),
            'created': {
                'type': 'date',
                'format': 'date_hour_minute_second',
            },
            'user_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': True,
            },
        },
    }

    fields = ('last_name^6', 'first_name^5', 'about')

    sorts = {
        'last_name': Sort('last_name'),
        'first_name': Sort('first_name'),
        'datasets': Sort('metrics.datasets'),
        'reuses': Sort('metrics.reuses'),
        'followers': Sort('metrics.followers'),
        'views': Sort('metrics.views'),
        'created': Sort('created'),
    }

    facets = {
        'organization': ModelTermFacet('organizations', Organization),
        'reuses': RangeFacet('metrics.reuses'),
        'datasets': RangeFacet('metrics.datasets'),
    }

    boosters = [
        GaussDecay('metrics.reuses', 50, decay=0.8),
        GaussDecay('metrics.datasets', 50, decay=0.8),
        GaussDecay('metrics.followers', 200, 200, decay=0.8),
    ]

    @classmethod
    def serialize(cls, user):
        """Build the document indexed for ``user``."""
        organization_ids = [str(o.id) for o in user.organizations]
        created = user.created_at.strftime('%Y-%m-%dT%H:%M:%S')

        # Suggestion inputs: tokenized full name followed by the identifier.
        suggest_inputs = cls.completer_tokenize(user.fullname) + [user.id]
        suggestion = {
            'input': suggest_inputs,
            'output': str(user.id),
            'payload': {
                'avatar_url': user.avatar(40),
                'first_name': user.first_name,
                'last_name': user.last_name,
                'slug': user.slug,
            },
        }

        return {
            'first_name': user.first_name,
            'last_name': user.last_name,
            'about': user.about,
            'organizations': organization_ids,
            'metrics': user.metrics,
            'created': created,
            'user_suggest': suggestion,
            'visible': user.visible,
        }
class DatasetSearch(ModelSearchAdapter):
    """Search adapter for ``Dataset`` documents.

    Declares the index mapping, searched fields, sorts, facets and score
    boosters, plus the indexing hooks ``is_indexable`` and ``serialize``.
    """
    model = Dataset
    fuzzy = True

    mapping = {
        'properties': {
            'title': {
                'type': 'string',
                'analyzer': i18n_analyzer,
                # ``title.raw`` is the non-analyzed copy used for sorting.
                'fields': {
                    'raw': {'type': 'string', 'index': 'not_analyzed'},
                },
            },
            'description': {'type': 'string', 'analyzer': i18n_analyzer},
            'license': {'type': 'string', 'index': 'not_analyzed'},
            'frequency': {'type': 'string'},
            'organization': {'type': 'string'},
            'owner': {'type': 'string'},
            'tags': {
                'type': 'string',
                'index_name': 'tag',
                'index': 'not_analyzed',
                # ``tags.i18n`` is the analyzed copy used by full-text search.
                'fields': {
                    'i18n': {'type': 'string', 'analyzer': i18n_analyzer},
                },
            },
            'badges': {
                'type': 'string',
                'index_name': 'badges',
                'index': 'not_analyzed',
            },
            'tag_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': False,
            },
            'resources': {
                'type': 'object',
                'index_name': 'resource',
                'properties': {
                    'title': {'type': 'string'},
                    'description': {'type': 'string'},
                    'license': {'type': 'string'},
                },
            },
            'format_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': False,
            },
            'dataset_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': True,
            },
            'created': {
                'type': 'date',
                'format': 'date_hour_minute_second',
            },
            'last_modified': {
                'type': 'date',
                'format': 'date_hour_minute_second',
            },
            'metrics': metrics_mapping(Dataset),
            'featured': {'type': 'boolean'},
            # Store dates as ordinals to handle pre-1900 dates.
            'temporal_coverage': {
                'type': 'object',
                'properties': {
                    'start': {'type': 'long'},
                    'end': {'type': 'long'},
                },
            },
            'geozones': {
                'type': 'object',
                'index_name': 'geozones',
                'properties': {
                    'id': {'type': 'string', 'index': 'not_analyzed'},
                    'name': {'type': 'string', 'index': 'not_analyzed'},
                    'keys': {'type': 'string', 'index': 'not_analyzed'},
                },
            },
            'granularity': {'type': 'string', 'index': 'not_analyzed'},
            # 'geom': {
            #     'type': 'geo_shape',
            #     'precision': '100m',
            # },
            'extras': {'type': 'object', 'index_name': 'extra'},
        },
    }

    fields = (
        'geozones.keys^9',
        'geozones.name^9',
        'title^6',
        'tags.i18n^3',
        'description',
    )

    sorts = {
        'title': Sort('title.raw'),
        'created': Sort('created'),
        'last_modified': Sort('last_modified'),
        'reuses': Sort('metrics.reuses'),
        'followers': Sort('metrics.followers'),
        'views': Sort('metrics.views'),
    }

    facets = {
        'tag': TermFacet('tags'),
        'badge': TermFacet('badges', labelizer=dataset_badge_labelizer),
        'organization': ModelTermFacet('organization', Organization),
        'owner': ModelTermFacet('owner', User),
        'license': ModelTermFacet('license', License),
        'geozone': ModelTermFacet('geozones.id', GeoZone, zone_labelizer),
        'granularity': TermFacet('granularity', granularity_labelizer),
        'format': TermFacet('resources.format'),
        'reuses': RangeFacet('metrics.reuses'),
        'temporal_coverage': TemporalCoverageFacet('temporal_coverage'),
        'featured': BoolFacet('featured'),
        'extra': ExtrasFacet('extras'),
    }

    boosters = [
        BoolBooster('featured', 1.1),
        GaussDecay('metrics.reuses', max_reuses, decay=0.1),
        GaussDecay(
            'metrics.followers', max_followers, max_followers, decay=0.1),
    ]

    @classmethod
    def is_indexable(cls, dataset):
        """Tell whether ``dataset`` should be indexed.

        A dataset is indexable when it is not deleted, has at least one
        resource and is not private.
        """
        if dataset.deleted is not None:
            return False
        if len(dataset.resources) == 0:
            return False
        return not dataset.private

    @classmethod
    def serialize(cls, dataset):
        """Build the document indexed for ``dataset``."""
        date_format = '%Y-%m-%dT%H:%M:%S'

        org_id = None
        if dataset.organization is not None:
            org_id = str(dataset.organization.id)

        # Suggestion picture: organization logo first, owner avatar next.
        image_url = None
        if dataset.organization:
            image_url = dataset.organization.logo(40)
        elif dataset.owner:
            image_url = dataset.owner.avatar(40)

        license_id = None
        if dataset.license is not None:
            license_id = dataset.license.id

        badge_kinds = [badge.kind for badge in dataset.badges]
        resources = [
            {
                'title': r.title,
                'description': r.description,
                'format': r.format,
            }
            for r in dataset.resources
        ]
        formats = [r.format.lower() for r in dataset.resources if r.format]

        owner_id = str(dataset.owner.id) if dataset.owner else None

        # Suggestion inputs: tokenized title followed by the identifier.
        suggestion = {
            'input': cls.completer_tokenize(dataset.title) + [dataset.id],
            'output': dataset.title,
            'payload': {
                'id': str(dataset.id),
                'slug': dataset.slug,
                'image_url': image_url,
            },
        }

        document = {
            'title': dataset.title,
            'description': dataset.description,
            'license': license_id,
            'tags': dataset.tags,
            'badges': badge_kinds,
            'tag_suggest': dataset.tags,
            'resources': resources,
            'format_suggest': formats,
            'frequency': dataset.frequency,
            'organization': org_id,
            'owner': owner_id,
            'dataset_suggest': suggestion,
            'created': dataset.created_at.strftime(date_format),
            'last_modified': dataset.last_modified.strftime(date_format),
            'metrics': dataset.metrics,
            'extras': dataset.extras,
            'featured': dataset.featured,
        }

        coverage = dataset.temporal_coverage
        if coverage is not None and coverage.start and coverage.end:
            # Both bounds are required; they are stored as ordinals.
            document['temporal_coverage'] = {
                'start': coverage.start.toordinal(),
                'end': coverage.end.toordinal(),
            }

        if dataset.spatial is not None:
            # Index precise zone labels and parents zone identifiers
            # to allow fast filtering.
            zone_ids = [z.id for z in dataset.spatial.zones]
            parent_ids = set()
            geozones = []
            for zone in GeoZone.objects(id__in=zone_ids):
                entry = {
                    'id': zone.id,
                    'name': zone.name,
                    'keys': zone.keys_values,
                }
                geozones.append(entry)
                parent_ids.update(zone.parents)
            # Parent zones are indexed by identifier only.
            geozones += [{'id': parent_id} for parent_id in parent_ids]

            document['geozones'] = geozones
            # 'geom': dataset.spatial.geom,
            document['granularity'] = dataset.spatial.granularity

        return document