class InitiativeDoc(SerializedDoc):
    """Search mapping for an initiative, its agent, countries and scope."""

    identifier = field.String()
    name = field.String()
    principal_agent = field.Nested(multi=False, properties={'name': field.String()})
    member_countries = field.Nested(doc_class=CountryDoc)
    # 'raw' not_analyzed multifield supports exact-match aggregation on name.
    geographic_scope = field.Nested(
        doc_class=CountryDoc,
        properties={
            'name': field.String(fields={'raw': field.String(index='not_analyzed')}),
        },
    )
    initiative_type = field.Object(
        properties={
            'name': field.String(fields={'raw': field.String(index='not_analyzed')}),
        },
    )
    start_year = field.Integer()

    def get_display_name(self):
        """Human-readable label for this document."""
        return self.name
def prepare_doc(self):
    """Build a dynamic elasticsearch-dsl ``Document`` subclass for this schema.

    Each schema field becomes a generic ``colN`` document field; the original
    field names are kept in a header map stored under the index ``_meta`` so
    they can be recovered later. Geo fields are added only when the instance
    reports geographic data.
    """
    _fields, _map = {}, {}
    # Enumerate from 1 so columns are named col1, col2, ...
    for idx, _f in enumerate(self.schema['fields'], 1):
        alias_name = _f['name']
        field_name = 'col{}'.format(idx)
        _field = self._schema2doc_map[_f['type']]
        _map[field_name] = alias_name  # colN -> original header name
        _fields[field_name] = _field
    if self.has_geo_data:
        _fields['shape'] = dsl_field.GeoShape()
        _fields['point'] = dsl_field.GeoPoint()
        _fields['label'] = dsl_field.Text()
        _fields['shape_type'] = dsl_field.Integer()
    # Back-reference to the owning resource; raw keyword enables exact match.
    _fields['resource'] = dsl_field.Nested(
        properties={
            'id': dsl_field.Integer(),
            'title': dsl_field.Text(analyzer=polish_analyzer,
                                    fields={'raw': dsl_field.Keyword()})
        })
    _fields['updated_at'] = dsl_field.Date()
    _fields['row_no'] = dsl_field.Long()
    # NOTE(review): the inner Index class is created with (type,) as its base,
    # matching the sibling prepare_doc implementation — confirm this is the
    # intended way to attach the index name for this dsl version.
    _fields['Index'] = type('Index', (type, ), {'name': self.idx_name})
    doc = type(self.idx_name, (Document, ), _fields)
    # Persist the colN -> header mapping in the mapping's _meta section.
    doc._doc_type.mapping._meta['_meta'] = {'headers': _map}
    return doc
def doc(self):
    """Lazily build and cache a dynamic ``DocType`` subclass for this schema.

    Schema fields are exposed as generic ``colN`` document fields; the
    original (alias) names are recorded in a header map stored under the
    mapping ``_meta`` so they can be recovered at query time.

    Returns:
        The cached dynamically-created DocType subclass.
    """
    if not self._doc_cache:
        _fields, _map = {}, {}
        # Enumerate from 1 so columns are named col1, col2, ...
        for idx, _f in enumerate(self.schema['fields'], 1):
            alias_name = _f['name']
            field_name = 'col{}'.format(idx)
            # FIX: was a bare `_schema2doc_map` lookup; the type map lives on
            # the instance, as in the sibling prepare_doc implementations.
            _field = self._schema2doc_map[_f['type']]
            _map[field_name] = alias_name  # colN -> original header name
            _fields[field_name] = _field
        _fields['resource'] = dsl_field.Nested(
            properties={
                'id': dsl_field.Integer(),
                'title': dsl_field.Text(
                    analyzer=polish_analyzer,
                    fields={'raw': dsl_field.Keyword()}),
            }
        )
        _fields['updated_at'] = dsl_field.Date()
        _fields['row_no'] = dsl_field.Long()
        doc = type(self.idx_name, (DocType,), _fields)
        doc._doc_type.index = self.idx_name
        # Persist the colN -> header mapping; the original also had a stray
        # no-op read of this attribute here, which has been removed.
        doc._doc_type.mapping._meta['_meta'] = {'headers': _map}
        self._doc_cache = doc
    return self._doc_cache
class DocWithNested(document.DocType):
    """Document with a nested ``comments`` field (title text, multi-keyword tags)."""

    comments = field.Nested(
        properties={
            'title': field.Text(),
            'tags': field.Keyword(multi=True),
        },
    )
class Mapping(Content.Mapping):
    """Orphaned content mapping exposing only the nested ``tags`` field."""

    # contributions = ContributionField()
    tags = field.Nested()

    class Meta:
        orphaned = True
        includes = ('tags',)
def test_nested_provides_direct_access_to_its_fields():
    """A Nested field built from raw property dicts exposes typed sub-fields."""
    nested = field.Nested(
        properties={
            'name': {'type': 'text', 'index': 'not_analyzed'},
        },
    )

    assert 'name' in nested
    assert nested['name'] == field.Text(index='not_analyzed')
def test_nested_provides_direct_access_to_its_fields():
    """A Nested field built from raw property dicts exposes typed sub-fields."""
    nested = field.Nested(
        properties={
            "name": {"type": "text", "index": "not_analyzed"},
        },
    )

    assert "name" in nested
    assert nested["name"] == field.Text(index="not_analyzed")
class Entry(document.Document):
    """An entry holding nested forms and a creation timestamp."""

    forms = field.Nested(Form)
    created = field.Date()
    superentry = field.Text()

    def save(self, **kwargs):
        """Persist the entry; delegates straight to the base implementation."""
        return super(Entry, self).save(**kwargs)

    def is_published(self):
        """Return True once the creation time lies in the past."""
        return datetime.now() > self.created
def test_modifying_nested():
    """Adding a sub-field via .field() shows up in the serialized mapping."""
    nested = field.Nested()
    nested = nested.field('name', 'string', index='not_analyzed')

    expected = {
        'type': 'nested',
        'properties': {
            'name': {'type': 'string', 'index': 'not_analyzed'},
        },
    }
    assert nested.to_dict() == expected
class ProjectDoc(SerializedDoc):
    """Search mapping for a project: descriptive text plus related entities."""

    identifier = field.String()
    name = field.String()
    alternate_name = field.String()
    description = field.String(
        analyzer=html_strip,
        fields={'raw': field.String(index='not_analyzed')},
    )
    status = field.String(fields={'raw': field.String(index='not_analyzed')})
    start_year = field.Integer()
    # project_location aggregation/facet uses the raw multifield
    countries = field.Nested(
        doc_class=CountryDoc,
        properties={
            'name': field.String(fields={'raw': field.String(index='not_analyzed')}),
        },
    )
    infrastructure_type = field.Object(
        properties={
            'name': field.String(fields={'raw': field.String(index='not_analyzed')}),
        },
    )
    # Providing a doc_class for initiatives produced errors, so keep it simple!
    initiatives = field.Nested(properties={'name': field.String()})
    funding = field.Object(
        multi=True,
        properties={
            'sources': field.Object(
                multi=True,
                properties={
                    'name': field.String(
                        fields={'raw': field.String(index='not_analyzed')}),
                },
            ),
        },
    )
    regions = field.Nested(
        doc_class=RegionDoc,
        properties={
            'name': field.String(fields={'raw': field.String(index='not_analyzed')}),
        },
    )

    def get_display_name(self):
        """Human-readable label for this document."""
        return self.name
class PersonDoc(SerializedDoc):
    """Search mapping for a person: names, citizenships, positions, events."""

    identifier = field.String()
    given_name = field.String()
    additional_name = field.String()
    family_name = field.String()
    description = field.String(
        analyzer=html_strip,
        fields={'raw': field.String(index='not_analyzed')},
    )
    citizenships = field.Nested(doc_class=CountryDoc)
    position_set = field.Nested(
        doc_class=PositionDoc,
        properties={
            'title': field.String(),
            'organization': field.Object(properties={'name': field.String()}),
        },
    )
    events = field.Nested(properties={'name': field.String()})

    def get_display_name(self):
        """Given name and family name joined by a single space."""
        return " ".join((self.given_name, self.family_name))
class CompanyDocument(Document):
    """Company search document with address sub-fields and profile helpers."""

    # Address parts are stored but not indexed (display-only).
    address = field.Nested(
        properties={
            'care_of': field.Keyword(index=False, store=True),
            'po_box': field.Keyword(index=False, store=True),
            'address_line_1': field.Keyword(index=False, store=True),
            'address_line_2': field.Keyword(index=False, store=True),
            'locality': field.Keyword(index=False, store=True),
            'region': field.Keyword(index=False, store=True),
            'country': field.Keyword(index=False, store=True),
            'postal_code': field.Keyword(index=False, store=True)
        })
    country_of_origin = field.Keyword(index=False, store=True)
    address_snippet = field.Keyword(index=False, store=True)
    company_name = field.Text()
    company_number = field.Text()
    company_status = field.Keyword(index=False, store=True)
    type = field.Keyword(index=False, store=True)
    date_of_cessation = field.Date(index=False, format='yyyy-MM-dd')
    date_of_creation = field.Date(index=False, format='yyyy-MM-dd')
    sic_codes = field.Keyword(index=False, store=True)

    class Meta:
        index = settings.ELASTICSEARCH_COMPANY_INDEX_ALIAS

    def to_dict(self, include_meta=False):
        """Serialize the document, enriching the _source payload (if present)
        with derived keys and ISO-formatted dates."""
        meta = super().to_dict(include_meta)
        if '_source' in meta:
            company = meta['_source']
            # Derived aliases expected by consumers of the serialized form.
            company['title'] = company['company_name']
            company['address']['country'] = company['country_of_origin']
            company['company_type'] = company['type']
            meta['_source'] = self.reformat_date(company)
        return meta

    def to_profile_dict(self):
        """Flat profile representation: address duplicated under
        'registered_office_address', dates ISO-formatted."""
        company = self.to_dict()
        company['registered_office_address'] = company['address']
        return self.reformat_date(company)

    @staticmethod
    def reformat_date(company):
        """Format creation/cessation dates as YYYY-MM-DD strings, in place.

        NOTE(review): assumes the values support strftime (date objects) —
        confirm callers never pass already-formatted strings.
        """
        if 'date_of_creation' in company:
            company['date_of_creation'] = (
                company['date_of_creation'].strftime('%Y-%m-%d'))
        if 'date_of_cessation' in company:
            company['date_of_cessation'] = (
                company['date_of_cessation'].strftime('%Y-%m-%d'))
        return company
class Profile(Document):
    """Profile document holding an SSN trace and attached sub-profiles."""

    created_at = field.Date()
    ssn_trace = field.Nested(Ssn_trace)
    meta_sub_profiles = field.Nested(Inner_sub_profile)

    class Index:
        name = "profile"

    @property
    def pk(self):
        # Primary key is the underlying Elasticsearch document id.
        return self.meta.id

    @property
    def sub_profiles(self):
        # Unwrap the nested entries to their sub_profile payloads.
        return [p.sub_profile for p in self.meta_sub_profiles]

    def validate_sub_profiles(self, *sub_profiles_ids):
        """Raise ValueError for any id not attached to this profile."""
        valid_sub_profiles_ids = {
            s.sub_profile_id
            for s in self.meta_sub_profiles
        }
        for sub_profile_id in sub_profiles_ids:
            if sub_profile_id not in valid_sub_profiles_ids:
                raise ValueError(
                    "the profile: '{}' dont have the sub profile: '{}'".format(
                        self.meta.id, sub_profile_id))

    def attach_sub_profiles(self, sub_profiles):
        """Append the given documents as sub-profile links with 'unknown' status."""
        # Nested field may not be materialized as a list yet; normalize first.
        if not isinstance(self.meta_sub_profiles, list):
            self.meta_sub_profiles = []
        for sub_profile in sub_profiles:
            self.meta_sub_profiles.append({
                'sub_profile_id': sub_profile.meta.id,
                'status': 'unknown'
            })
class EventDoc(SerializedDoc):
    """Search mapping for an event with its type and nested places."""

    name = field.String()
    description = field.String(
        analyzer=html_strip,
        fields={'raw': field.String(index='not_analyzed')},
    )
    event_type = field.Object(
        properties={
            'name': field.String(fields={'raw': field.String(index='not_analyzed')}),
        },
    )
    start_year = field.Integer()
    places = field.Nested(
        doc_class=PlaceDoc,
        properties={
            'location_display': field.String(
                fields={'raw': field.String(index='not_analyzed')}),
        },
    )

    def get_display_name(self):
        """Human-readable label for this document."""
        return self.name
class OrganizationDoc(SerializedDoc):
    """Search mapping for an organization and its operating footprint."""

    name = field.String()
    description = field.String(
        analyzer=html_strip,
        fields={'raw': field.String(index='not_analyzed')},
    )
    mission = field.String()
    countries = field.Nested(doc_class=CountryDoc)
    headquarters_location = field.String(
        fields={'raw': field.String(index='not_analyzed')})
    scope_of_operations = field.String(
        multi=True,
        fields={'raw': field.String(index='not_analyzed')},
    )
    start_year = field.Integer()

    def get_display_name(self):
        """Human-readable label for this document."""
        return self.name
class EntryDoc(SerializedDoc):
    """Search mapping for a publication entry with nested categories."""

    title = field.String()
    author = field.String()
    content = field.String(
        analyzer=html_strip,
        fields={'raw': field.String(index='not_analyzed')},
    )
    description = field.String(
        analyzer=html_strip,
        fields={'raw': field.String(index='not_analyzed')},
    )
    publication_date = field.Date()
    categories = field.Nested(
        doc_class=CategoryDoc,
        properties={
            'name': field.String(fields={'raw': field.String(index='not_analyzed')}),
        },
    )

    def get_display_name(self):
        """Human-readable label for this document."""
        return self.title
def prepare_doc(self):
    """Build a dynamic elasticsearch-dsl ``Document`` subclass for this schema.

    Unlike the dict-based variant, the schema here is a sequence of objects
    with ``type``/``name`` attributes; fields whose type has no mapping are
    skipped. Geo and bookkeeping fields are always included.
    """
    _fields = {
        'shape': dsl_field.GeoShape(),
        'point': dsl_field.GeoPoint(),
        'shape_type': dsl_field.Integer(),
        'label': dsl_field.Text(),
        # Back-reference to the owning resource; raw keyword for exact match.
        'resource': dsl_field.Nested(
            properties={
                'id': dsl_field.Integer(),
                'title': dsl_field.Text(analyzer=polish_analyzer,
                                        fields={'raw': dsl_field.Keyword()})
            }),
        'updated_at': dsl_field.Date(),
        'row_no': dsl_field.Long()
    }
    _map = {}
    # Enumerate from 1 so columns are named col1, col2, ...; note a skipped
    # field still consumes its index, keeping column numbers aligned with
    # the schema positions.
    for idx, _f in enumerate(self.schema, 1):
        if _f.type not in self._schema2doc_map:
            continue
        alias_name = _f.name
        field_name = f'col{idx}'
        _field = self._schema2doc_map[_f.type]
        _map[field_name] = alias_name  # colN -> original header name
        _fields[field_name] = _field
    # NOTE(review): the inner Index class is created with (type,) as its base,
    # matching the sibling prepare_doc implementation — confirm this is the
    # intended way to attach the index name for this dsl version.
    _fields['Index'] = type('Index', (type, ), {'name': self.idx_name})
    doc = type(self.idx_name, (Document, ), _fields)
    # Persist the colN -> header mapping in the mapping's _meta section.
    doc._doc_type.mapping._meta['_meta'] = {'headers': _map}
    return doc
class CompanyDocType(DocType):
    """Company document with formatted dates and nested case studies."""

    date_of_creation = FormattedDate(date_format='%Y-%m-%d')
    description = field.Text()
    employees = field.Text()
    facebook_url = field.Text()
    pk = field.Integer()
    keywords = field.Text()
    linkedin_url = field.Text()
    logo = field.Text()
    has_single_sector = field.Boolean()
    modified = FormattedDate(date_format='%Y-%m-%dT%H:%M:%S.%fZ')
    name = field.Text()
    number = field.Text()
    sectors = field.Text(multi=True)
    sectors_label = field.Text(multi=True)
    slug = field.Text()
    summary = field.Text()
    twitter_url = field.Text()
    website = field.Text()
    supplier_case_studies = field.Nested(
        properties={
            'pk': field.Integer(),
            'title': field.Text(),
            'short_summary': field.Text(),
            'description': field.Text(),
            'sector': field.Text(),
            'keywords': field.Text(),
            'image_one_caption': field.Text(),
            'image_two_caption': field.Text(),
            'image_three_caption': field.Text(),
            'testimonial': field.Text(),
            'slug': field.Text(),
        })

    class Meta:
        index = 'company'
class DocWithNested(document.DocType):
    """Document with a nested ``comments`` field carrying a string title."""

    comments = field.Nested(properties={'title': field.String()})
class OptionalObjectWithRequiredField(document.Document):
    """Optional nested ``comments`` whose ``title`` keyword is required."""

    comments = field.Nested(properties={'title': field.Keyword(required=True)})
class CompanyDocument(Document):
    """Company search document.

    Text fields copy into ``wildcard``/``casestudy_wildcard`` and keywords
    into ``keyword_wildcard`` so free-text queries can hit one field each.
    """

    wildcard = field.Text(analyzer=american_english_analyzer)
    casestudy_wildcard = field.Text(analyzer=american_english_analyzer)
    keyword_wildcard = field.Keyword()

    case_study_count = field.Integer()
    date_of_creation = field.Date(index=False)
    description = field.Text(
        copy_to='wildcard',
        analyzer=american_english_analyzer,
    )
    has_description = field.Boolean()
    employees = field.Keyword(index=False, store=True)
    facebook_url = field.Keyword(index=False, store=True)
    pk = field.Integer(index=False)
    keywords = field.Text(copy_to='wildcard')
    linkedin_url = field.Keyword(index=False, store=True)
    logo = field.Keyword(index=False, store=True)
    has_single_sector = field.Boolean()
    modified = field.Date(index=False)
    ordering_name = field.Keyword()
    name = field.Text(copy_to=['wildcard', 'ordering_name'])
    number = field.Keyword(copy_to='keyword_wildcard',)
    sectors = field.Keyword(multi=True, copy_to='keyword_wildcard', store=True)
    sectors_label = field.Keyword(
        multi=True, copy_to='keyword_wildcard', store=True,
    )
    expertise_industries = field.Keyword(
        multi=True, copy_to='keyword_wildcard', store=True,
    )
    expertise_regions = field.Keyword(
        multi=True, copy_to='keyword_wildcard', store=True,
    )
    expertise_languages = field.Keyword(
        multi=True, copy_to='keyword_wildcard', store=True,
    )
    expertise_countries = field.Keyword(
        multi=True, copy_to='keyword_wildcard', store=True,
    )
    # Represents Dict as it's the primitive datatype for this field
    expertise_products_services = field.Object()
    expertise_products_services_labels = field.Keyword(
        multi=True, copy_to='keyword_wildcard', store=True,
    )
    expertise_labels = field.Keyword(
        multi=True, copy_to='keyword_wildcard', store=True,
    )
    slug = field.Keyword(copy_to='keyword_wildcard', store=True)
    summary = field.Text(
        copy_to='wildcard',
        analyzer=american_english_analyzer,
    )
    twitter_url = field.Keyword(index=False, store=True)
    website = field.Keyword(copy_to='keyword_wildcard', store=True)
    supplier_case_studies = field.Nested(
        properties={
            'pk': field.Integer(index=False),
            'title': field.Text(copy_to='casestudy_wildcard'),
            'short_summary': field.Text(copy_to='casestudy_wildcard'),
            'description': field.Text(copy_to='casestudy_wildcard'),
            'sector': field.Keyword(copy_to='keyword_wildcard', store=True),
            'keywords': field.Text(copy_to='casestudy_wildcard'),
            'image_one_caption': field.Text(copy_to='casestudy_wildcard'),
            'image_two_caption': field.Text(copy_to='casestudy_wildcard'),
            'image_three_caption': field.Text(copy_to='casestudy_wildcard'),
            'testimonial': field.Text(copy_to='casestudy_wildcard'),
            'website': field.Keyword(copy_to='casestudy_wildcard', store=True),
            'slug': field.Keyword(copy_to='keyword_wildcard', store=True),
            'testimonial_name': field.Keyword(
                copy_to='casestudy_wildcard', store=True,
            ),
            'testimonial_company': field.Text(copy_to='casestudy_wildcard'),
            'testimonial_job_title': field.Text(copy_to='casestudy_wildcard'),
        },
    )
    is_showcase_company = field.Boolean()
    is_published_investment_support_directory = field.Boolean()
    is_published_find_a_supplier = field.Boolean()

    class Meta:
        index = settings.ELASTICSEARCH_COMPANY_INDEX_ALIAS
class DocWithNested(document.Document):
    """Document whose ``comments`` nest the ``Comment`` inner doc."""

    comments = field.Nested(Comment)
class NestedSecret(document.Document):
    """Document whose ``secrets`` nest the ``SecretDoc`` inner doc."""

    secrets = field.Nested(SecretDoc)
class MySubDocWithNested(MyDoc):
    """``MyDoc`` subclass adding a nested ``MyInner`` field."""

    nested_inner = field.Nested(MyInner)
def test_nested_provides_direct_access_to_its_fields():
    """Sub-fields added via .field() are reachable by name on the Nested field."""
    nested = field.Nested()
    nested.field('name', 'string', index='not_analyzed')

    assert 'name' in nested
    assert nested['name'] == field.String(index='not_analyzed')
class DocWithNested(document.Document):
    """Document with nested ``Comment`` entries, bound to its test index."""

    comments = field.Nested(Comment)

    class Index:
        name = "test-doc-with-nested"
class OptionalObjectWithRequiredField(document.Document):
    """Optional nested ``comments`` with a required ``title``, on its test index."""

    comments = field.Nested(properties={"title": field.Keyword(required=True)})

    class Index:
        name = "test-required"
class NestedSecret(document.Document):
    """Document nesting ``SecretDoc`` entries, bound to its test index."""

    secrets = field.Nested(SecretDoc)

    class Index:
        name = "test-nested-secret"
class WikiDocumentType(document.DocType):
    """Elasticsearch document type for wiki documents, plus indexing helpers."""

    excerpt_fields = ['summary', 'content']
    # FIX: this list arrived corrupted ("'User:'******'User_talk:'", a
    # redaction artifact that is not valid Python); restored as the
    # comma-separated slug prefixes excluded from indexing.
    exclude_slugs = [
        'Talk:',
        'User:',
        'User_talk:',
        'Template_talk:',
        'Project_talk:',
    ]

    boost = field.Float(null_value=1.0)
    content = field.String(analyzer='kuma_content',
                           term_vector='with_positions_offsets')
    css_classnames = field.String(analyzer='case_insensitive_keyword')
    html_attributes = field.String(analyzer='case_insensitive_keyword')
    id = field.Long()
    kumascript_macros = field.String(analyzer='case_insensitive_keyword')
    locale = field.String(index='not_analyzed')
    modified = field.Date()
    parent = field.Nested(
        properties={
            'id': field.Long(),
            'title': field.String(analyzer='kuma_title'),
            'slug': field.String(index='not_analyzed'),
            'locale': field.String(index='not_analyzed'),
        })
    slug = field.String(index='not_analyzed')
    summary = field.String(analyzer='kuma_content',
                           term_vector='with_positions_offsets')
    tags = field.String(analyzer='case_sensitive')
    title = field.String(analyzer='kuma_title', boost=1.2)

    class Meta(object):
        mapping = Mapping('wiki_document')
        # FIX: the keyword was misspelled 'enalbed', so the _all field was
        # never actually disabled on the mapping.
        mapping.meta('_all', enabled=False)

    @classmethod
    def get_connection(cls, alias='default'):
        """Return the configured Elasticsearch connection."""
        return connections.get_connection(alias)

    @classmethod
    def get_doc_type(cls):
        """Name of the mapped doc type."""
        return cls._doc_type.name

    @classmethod
    def from_django(cls, obj):
        """Convert a Django wiki Document instance into an index payload."""
        doc = {
            'id': obj.id,
            'title': obj.title,
            'slug': obj.slug,
            'summary': obj.get_summary(strip_markup=True),
            'locale': obj.locale,
            'modified': obj.modified,
            'content': strip_tags(obj.rendered_html),
            'tags': list(obj.tags.values_list('name', flat=True)),
            'kumascript_macros': obj.extract_kumascript_macro_names(),
            'css_classnames': obj.extract_css_classnames(),
            'html_attributes': obj.extract_html_attributes(),
        }

        # Check if the document has a document zone attached
        try:
            is_zone = bool(obj.zone)
        except ObjectDoesNotExist:
            is_zone = False

        if is_zone:
            # boost all documents that are a zone
            doc['boost'] = 8.0
        elif obj.slug.count('/') == 1:
            # a little boost if no zone but still first level
            doc['boost'] = 4.0
        else:
            doc['boost'] = 1.0

        if obj.parent:
            doc['parent'] = {
                'id': obj.parent.id,
                'title': obj.parent.title,
                'locale': obj.parent.locale,
                'slug': obj.parent.slug,
            }
        else:
            doc['parent'] = {}

        return doc

    @classmethod
    def get_mapping(cls):
        """Serialized mapping for this doc type."""
        return cls._doc_type.mapping.to_dict()

    @classmethod
    def get_analysis(cls):
        """Index analysis settings (filters and analyzers)."""
        return {
            'filter': {
                'kuma_word_delimiter': {
                    'type': 'word_delimiter',
                    'preserve_original': True,  # hi-fi -> hifi, hi-fi
                    'catenate_words': True,  # hi-fi -> hifi
                    'catenate_numbers': True,  # 90-210 -> 90210
                }
            },
            'analyzer': {
                'default': {
                    'tokenizer': 'standard',
                    'filter': ['standard', 'elision']
                },
                # a custom analyzer that strips html and uses our own
                # word delimiter filter and the elision filter
                # (e.g. L'attribut -> attribut). The rest is the same as
                # the snowball analyzer
                'kuma_content': {
                    'type': 'custom',
                    'tokenizer': 'standard',
                    'char_filter': ['html_strip'],
                    'filter': [
                        'elision',
                        'kuma_word_delimiter',
                        'lowercase',
                        'standard',
                        'stop',
                        'snowball',
                    ],
                },
                'kuma_title': {
                    'type': 'custom',
                    'tokenizer': 'standard',
                    'filter': [
                        'elision',
                        'kuma_word_delimiter',
                        'lowercase',
                        'standard',
                        'snowball',
                    ],
                },
                'case_sensitive': {
                    'type': 'custom',
                    'tokenizer': 'keyword'
                },
                'case_insensitive_keyword': {
                    'type': 'custom',
                    'tokenizer': 'keyword',
                    'filter': 'lowercase'
                }
            },
        }

    @classmethod
    def get_settings(cls):
        """Full index settings: mapping plus analysis and shard config."""
        return {
            'mappings': cls.get_mapping(),
            'settings': {
                'analysis': cls.get_analysis(),
                'number_of_replicas': settings.ES_DEFAULT_NUM_REPLICAS,
                'number_of_shards': settings.ES_DEFAULT_NUM_SHARDS,
            }
        }

    @classmethod
    def bulk_index(cls, documents, id_field='id', es=None, index=None):
        """Index of a bunch of documents."""
        es = es or cls.get_connection()
        index = index or cls.get_index()
        type = cls.get_doc_type()
        actions = [{
            '_index': index,
            '_type': type,
            '_id': d['id'],
            '_source': d
        } for d in documents]
        bulk(es, actions)

    @classmethod
    def bulk_delete(cls, ids, es=None, index=None):
        """Delete a bunch of documents by id."""
        es = es or cls.get_connection()
        index = index or cls.get_index()
        type = cls.get_doc_type()
        actions = [{
            '_op_type': 'delete',
            '_index': index,
            '_type': type,
            '_id': _id
        } for _id in ids]
        bulk(es, actions)

    @classmethod
    def get_index(cls):
        """Name of the currently promoted search index."""
        from kuma.search.models import Index
        return Index.objects.get_current().prefixed_name

    @classmethod
    def search(cls, **kwargs):
        """Build a Search over the current index, mapping hits via from_es."""
        options = {
            'using': connections.get_connection(),
            'index': cls.get_index(),
            'doc_type': {cls._doc_type.name: cls.from_es},
        }
        options.update(kwargs)
        sq = Search(**options)
        return sq

    @classmethod
    def get_model(cls):
        """The Django model this doc type indexes."""
        from kuma.wiki.models import Document
        return Document

    @classmethod
    def get_indexable(cls, percent=100):
        """
        For this mapping type return a list of model IDs that should be
        indexed with the management command, in a full reindex.

        WARNING: When changing this code make sure to update the
                 ``should_update`` method below, too!
        """
        model = cls.get_model()
        excludes = []
        for exclude in cls.exclude_slugs:
            excludes.append(Q(slug__icontains=exclude))
        qs = (model.objects
                   .filter(is_template=False, is_redirect=False, deleted=False)
                   .exclude(reduce(operator.or_, excludes)))
        # FIX: use true division — under Python 2 integer division,
        # e.g. 50 / 100 truncates to 0 and would index an empty slice.
        percent = percent / 100.0
        if percent < 1:
            qs = qs[:int(qs.count() * percent)]
        return qs.values_list('id', flat=True)

    @classmethod
    def should_update(cls, obj):
        """
        Given a Document instance should return boolean value
        whether the instance should be indexed or not.

        WARNING: This *must* mirror the logic of the ``get_indexable``
                 method above!
        """
        return (not obj.is_template and
                not obj.is_redirect and
                not obj.deleted and
                not any([exclude in obj.slug
                         for exclude in cls.exclude_slugs]))

    def get_excerpt(self):
        """Highlighted excerpt if available, otherwise the stored summary."""
        if getattr(self, 'highlight', False):
            for excerpt_field in self.excerpt_fields:
                if excerpt_field in self.highlight:
                    return u'…'.join(self.highlight[excerpt_field])
        return self.summary

    @classmethod
    def reindex_all(cls, chunk_size=500, index=None, percent=100):
        """Rebuild ElasticSearch indexes.

        :arg chunk_size: how many documents to bulk index as a single chunk.
        :arg index: the `Index` object to reindex into. Uses the current
            promoted index if none provided.
        :arg percent: 1 to 100--the percentage of the db to index.
        """
        from kuma.search.models import Index
        from kuma.search.tasks import prepare_index, finalize_index
        from kuma.wiki.tasks import index_documents

        index = index or Index.objects.get_current()

        # Get the list of document IDs to index.
        indexable = WikiDocumentType.get_indexable(percent)

        total = len(indexable)
        total_chunks = int(ceil(total / chunk_size))

        pre_task = prepare_index.si(index.pk)
        post_task = finalize_index.si(index.pk)

        if not total:
            # If there's no data we still create the index and finalize it.
            chain(pre_task, post_task).apply_async()
        else:
            index_tasks = [
                index_documents.si(chunk, index.pk)
                for chunk in chunked(indexable, chunk_size)
            ]
            chord_flow(pre_task, index_tasks, post_task).apply_async()

        message = _(
            'Indexing {total} documents into {n} chunks of size {size} into '
            'index {index}.'.format(total=total, n=total_chunks,
                                    size=chunk_size,
                                    index=index.prefixed_name))
        return message
class NestedSecret(document.DocType):
    """Document nesting entries whose ``title`` is a custom SecretField."""

    secrets = field.Nested(properties={'title': SecretField()})