예제 #1
0
    class Mapping:
        """Declarative mapping with a multi-field ``bar`` and a ``status`` field."""

        class Meta:
            # Options consumed by whatever framework reads this mapping
            # (not visible in this snippet).
            doc_type = "super_manual_mapping"
            excludes = ("garbage", )

        # ``bar`` carries an additional ``raw`` sub-field declared with
        # index="not_analyzed".
        bar = field.String(
            fields={
                "raw": field.String(index="not_analyzed"),
            },
        )
        status = field.String(index="not_analyzed")
예제 #2
0
class PositionDoc(field.InnerObjectWrapper):
    """Inner-object wrapper holding a ``title`` and an ``organization`` object."""

    title = field.String()
    # The organization object is wrapped by ``OrganizationDoc`` and declares
    # a single ``name`` property.
    organization = field.Object(
        properties={'name': field.String()},
        doc_class=OrganizationDoc,
    )
예제 #3
0
class Document(DocType):
    """Document type describing a text document and its metadata.

    Every attribute must be bound directly to a ``field`` instance. A
    trailing comma after the call would bind a one-element tuple instead,
    and the attribute would no longer be a field object.
    """
    id = field.Integer()
    title = field.String(analyzer='snowball')
    author = field.String(analyzer='snowball')
    creation_date = field.Date()
    pages = field.Integer()
    content = field.String(analyzer='snowball')
    lang = field.String()
    size = field.Integer()
    tags = field.String(index='not_analyzed')
    # ``ngram_analyzer`` is defined outside this snippet.
    autocomplete = field.Text(analyzer=ngram_analyzer)
예제 #4
0
class SerializedDoc(DocType):
    """Base document exposing model metadata, highlight data and a display name."""

    # ``_meta.model`` is a string with an additional ``raw`` sub-field.
    _meta = field.Object(
        properties={
            'model': field.String(
                fields={'raw': field.String(index='not_analyzed')},
            ),
        },
    )

    def get_model_meta(self):
        """Return the ``_meta`` attribute, or ``None`` when it is not set."""
        return getattr(self, '_meta', None)

    def get_result_highlight(self):
        """Return ``self.meta.highlight._d_``, or ``None`` if either is missing."""
        hit_highlight = getattr(self.meta, 'highlight', None)
        if not hit_highlight:
            return None
        return getattr(hit_highlight, '_d_', None)

    def get_display_name(self):
        """Return ``None``; this base implementation has no display name."""
        return None
예제 #5
0
def test_field_supports_multiple_analyzers():
    """A String field serializes distinct index and search analyzers."""
    expected = {
        'index_analyzer': 'snowball',
        'search_analyzer': 'keyword',
        'type': 'string',
    }
    string_field = field.String(
        index_analyzer='snowball',
        search_analyzer='keyword',
    )
    assert expected == string_field.to_dict()
예제 #6
0
 class User(document.DocType):
     """Document type with a ``username`` field and ``MetaField`` options in ``Meta``."""
     username = field.String()
     class Meta:
         # Each option is a MetaField built from the literal arguments shown;
         # how they are interpreted is decided by ``document`` (not visible
         # in this snippet).
         all = document.MetaField(enabled=False)
         _index = document.MetaField(enabled=True)
         dynamic = document.MetaField('strict')
         dynamic_templates = document.MetaField([42])
예제 #7
0
def test_multifield_supports_multiple_analyzers():
    """Sub-fields of a String field keep their own analyzer settings."""
    sub_fields = {
        'f1': field.String(search_analyzer='keyword', analyzer='snowball'),
        'f2': field.String(analyzer='keyword'),
    }
    string_field = field.String(fields=sub_fields)

    expected = {
        'type': 'string',
        'fields': {
            'f1': {
                'type': 'string',
                'analyzer': 'snowball',
                'search_analyzer': 'keyword',
            },
            'f2': {
                'type': 'string',
                'analyzer': 'keyword',
            },
        },
    }
    assert expected == string_field.to_dict()
예제 #8
0
    class User(document.DocType):
        """User document that stores only an MD5 hex digest of the password.

        ``password`` is write-only: assigning to it updates ``pwd_hash``,
        while reading it raises ``AttributeError``.
        """
        pwd_hash = field.String()

        @staticmethod
        def _digest(pwd):
            """Return the MD5 hex digest of ``pwd``.

            Bytes-like values are hashed as-is; anything else is treated as
            text and UTF-8 encoded first, because hashlib rejects text input
            on Python 3.
            """
            if not isinstance(pwd, (bytes, bytearray, memoryview)):
                pwd = pwd.encode('utf-8')
            # NOTE(review): MD5 is not suitable for real password storage;
            # kept because the stored ``pwd_hash`` format depends on it.
            return md5(pwd).hexdigest()

        def check_password(self, pwd):
            """Return True when ``pwd`` hashes to the stored ``pwd_hash``."""
            return self._digest(pwd) == self.pwd_hash

        @property
        def password(self):
            """Write-only attribute; reading always raises ``AttributeError``."""
            raise AttributeError('readonly')

        @password.setter
        def password(self, pwd):
            self.pwd_hash = self._digest(pwd)
예제 #9
0
class OrganizationDoc(SerializedDoc):
    """Search document for an organization."""

    name = field.String()
    # Description uses the ``html_strip`` analyzer and has a ``raw`` sub-field.
    description = field.String(
        fields={'raw': field.String(index='not_analyzed')},
        analyzer=html_strip,
    )
    mission = field.String()
    countries = field.Nested(doc_class=CountryDoc)
    headquarters_location = field.String(
        fields={'raw': field.String(index='not_analyzed')},
    )
    scope_of_operations = field.String(
        fields={'raw': field.String(index='not_analyzed')},
        multi=True,
    )
    start_year = field.Integer()

    def get_display_name(self):
        """Return the organization's ``name``."""
        return self.name
예제 #10
0
class PersonDoc(SerializedDoc):
    """Search document for a person."""

    identifier = field.String()
    given_name = field.String()
    additional_name = field.String()
    family_name = field.String()
    # Description uses the ``html_strip`` analyzer and has a ``raw`` sub-field.
    description = field.String(
        analyzer=html_strip,
        fields={'raw': field.String(index='not_analyzed')}
    )
    citizenships = field.Nested(doc_class=CountryDoc)
    position_set = field.Nested(
        doc_class=PositionDoc,
        properties={
            'title': field.String(),
            'organization': field.Object(properties={'name': field.String()})
        }
    )
    events = field.Nested(properties={'name': field.String()})

    def get_display_name(self):
        """Return the given and family names joined by a single space.

        Name parts that are ``None`` or empty are skipped, so a document
        missing one of them yields the other part alone (or an empty string
        when both are missing) instead of raising ``TypeError``.
        """
        name_parts = (self.given_name, self.family_name)
        return " ".join(part for part in name_parts if part)
예제 #11
0
class EventDoc(SerializedDoc):
    """Search document for an event."""

    name = field.String()
    # Description uses the ``html_strip`` analyzer and has a ``raw`` sub-field.
    description = field.String(
        fields={'raw': field.String(index='not_analyzed')},
        analyzer=html_strip,
    )
    event_type = field.Object(
        properties={
            'name': field.String(
                fields={'raw': field.String(index='not_analyzed')},
            ),
        },
    )
    start_year = field.Integer()
    places = field.Nested(
        properties={
            'location_display': field.String(
                fields={'raw': field.String(index='not_analyzed')},
            ),
        },
        doc_class=PlaceDoc,
    )

    def get_display_name(self):
        """Return the event's ``name``."""
        return self.name
예제 #12
0
def test_multi_fields_are_accepted_and_parsed():
    """``construct_field`` accepts sub-fields given as dicts or field objects."""
    sub_fields = {
        'raw': {'type': 'string', 'index': 'not_analyzed'},
        'eng': field.String(analyzer='english'),
    }
    built = field.construct_field('string', fields=sub_fields)

    assert isinstance(built, field.String)

    expected = {
        'type': 'string',
        'fields': {
            'raw': {'type': 'string', 'index': 'not_analyzed'},
            'eng': {'type': 'string', 'analyzer': 'english'},
        },
    }
    assert expected == built.to_dict()
예제 #13
0
class EntryDoc(SerializedDoc):
    """Search document for an entry with a title, author and content."""

    title = field.String()
    author = field.String()
    # Both text bodies use the ``html_strip`` analyzer and have a ``raw``
    # sub-field.
    content = field.String(
        fields={'raw': field.String(index='not_analyzed')},
        analyzer=html_strip,
    )
    description = field.String(
        fields={'raw': field.String(index='not_analyzed')},
        analyzer=html_strip,
    )
    publication_date = field.Date()
    categories = field.Nested(
        properties={
            'name': field.String(
                fields={'raw': field.String(index='not_analyzed')},
            ),
        },
        doc_class=CategoryDoc,
    )

    def get_display_name(self):
        """Return the entry's ``title``."""
        return self.title
예제 #14
0
class InitiativeDoc(SerializedDoc):
    """Search document for an initiative."""

    identifier = field.String()
    name = field.String()
    principal_agent = field.Nested(
        properties={'name': field.String()},
        multi=False,
    )
    member_countries = field.Nested(doc_class=CountryDoc)
    geographic_scope = field.Nested(
        properties={
            'name': field.String(
                fields={'raw': field.String(index='not_analyzed')},
            ),
        },
        doc_class=CountryDoc,
    )
    initiative_type = field.Object(
        properties={
            'name': field.String(
                fields={'raw': field.String(index='not_analyzed')},
            ),
        },
    )
    start_year = field.Integer()

    def get_display_name(self):
        """Return the initiative's ``name``."""
        return self.name
예제 #15
0
class MySubDoc(MyDoc):
    """Subclass of ``MyDoc`` declaring ``name`` with ``index='not_analyzed'``."""
    name = field.String(index='not_analyzed')

    class Meta:
        # Explicit doc type name and index name for this subclass.
        doc_type = 'my_custom_doc'
        index = 'default-index'
예제 #16
0
class CountryDoc(field.InnerObjectWrapper):
    """Inner-object wrapper with a single ``name`` string field."""
    name = field.String()
예제 #17
0
class MyDoc(document.DocType):
    """Document type with title, name, creation date and an ``inner`` object."""

    title = field.String(index='not_analyzed')
    name = field.String()
    created_at = field.Date()
    # ``inner`` is wrapped by ``MyInner`` and declares one ``old_field``
    # property.
    inner = field.Object(
        doc_class=MyInner,
        properties={'old_field': field.String()},
    )
예제 #18
0
 class Blog(document.DocType):
     """Document type with a ``tags`` field declared as multi-valued."""
     tags = field.String(multi=True, index='not_analyzed')
예제 #19
0
    class User(document.DocType):
        """Document type with a ``username`` field and two ``MetaField`` options."""
        username = field.String()

        class Meta:
            # MetaField options declared with the literal arguments shown.
            all = document.MetaField(enabled=False)
            _index = document.MetaField(enabled=True)
예제 #20
0
class RegionDoc(field.InnerObjectWrapper):
    """Inner-object wrapper with a ``name`` field that has a ``raw`` sub-field."""
    name = field.String(fields={'raw': field.String(index='not_analyzed')})
예제 #21
0
 def builtin_type(self):
     """Build a ``field.String`` from this object's ``_params`` mapping."""
     params = self._params
     return field.String(**params)
예제 #22
0
 class Mapping:
     """Declarative mapping with name, slug, section logo and query fields."""

     # ``name`` uses the "autocomplete" analyzer and has a ``raw`` sub-field.
     name = field.String(
         fields={"raw": field.String(index="not_analyzed")},
         analyzer="autocomplete",
     )
     slug = field.String(index="not_analyzed")
     section_logo = ElasticsearchImageField()
     query = field.Object(enabled=False)
예제 #23
0
class WikiDocumentType(document.DocType):
    """Elasticsearch document type for wiki documents.

    Declares the field mapping and provides class-level helpers to build
    the index settings, turn Django objects into indexable dicts, bulk
    index or delete documents and schedule a full reindex.
    """
    # Highlighted fields consulted, in order, by ``get_excerpt``.
    excerpt_fields = ['summary', 'content']
    # Documents whose slug contains one of these fragments are not indexed.
    exclude_slugs = [
        'Talk:', 'User:', 'User_talk:', 'Template_talk:', 'Project_talk:'
    ]

    boost = field.Float(null_value=1.0)
    content = field.String(analyzer='kuma_content',
                           term_vector='with_positions_offsets')
    css_classnames = field.String(analyzer='case_insensitive_keyword')
    html_attributes = field.String(analyzer='case_insensitive_keyword')
    id = field.Long()
    kumascript_macros = field.String(analyzer='case_insensitive_keyword')
    locale = field.String(index='not_analyzed')
    modified = field.Date()
    parent = field.Nested(
        properties={
            'id': field.Long(),
            'title': field.String(analyzer='kuma_title'),
            'slug': field.String(index='not_analyzed'),
            'locale': field.String(index='not_analyzed'),
        })
    slug = field.String(index='not_analyzed')
    summary = field.String(analyzer='kuma_content',
                           term_vector='with_positions_offsets')
    tags = field.String(analyzer='case_sensitive')
    title = field.String(analyzer='kuma_title', boost=1.2)

    class Meta(object):
        mapping = Mapping('wiki_document')
        mapping.meta('_all', enabled=False)

    @classmethod
    def get_connection(cls, alias='default'):
        """Return the connection registered under ``alias``."""
        return connections.get_connection(alias)

    @classmethod
    def get_doc_type(cls):
        """Return the name of this document type."""
        return cls._doc_type.name

    @classmethod
    def from_django(cls, obj):
        """Build the dict to index from the Django object ``obj``.

        The result holds the document's own fields, a ``boost`` value and a
        ``parent`` dict (empty when ``obj`` has no parent).
        """
        doc = {
            'id': obj.id,
            'title': obj.title,
            'slug': obj.slug,
            'summary': obj.get_summary(strip_markup=True),
            'locale': obj.locale,
            'modified': obj.modified,
            'content': strip_tags(obj.rendered_html),
            'tags': list(obj.tags.values_list('name', flat=True)),
            'kumascript_macros': obj.extract_kumascript_macro_names(),
            'css_classnames': obj.extract_css_classnames(),
            'html_attributes': obj.extract_html_attributes(),
        }

        # Check if the document has a document zone attached
        try:
            is_zone = bool(obj.zone)
        except ObjectDoesNotExist:
            is_zone = False

        if is_zone:
            # boost all documents that are a zone
            doc['boost'] = 8.0
        elif obj.slug.count('/') == 1:
            # a little boost if no zone but still first level
            doc['boost'] = 4.0
        else:
            doc['boost'] = 1.0
        if obj.parent:
            doc['parent'] = {
                'id': obj.parent.id,
                'title': obj.parent.title,
                'locale': obj.parent.locale,
                'slug': obj.parent.slug,
            }
        else:
            doc['parent'] = {}

        return doc

    @classmethod
    def get_mapping(cls):
        """Return this document type's mapping as a dict."""
        return cls._doc_type.mapping.to_dict()

    @classmethod
    def get_analysis(cls):
        """Return the analysis settings (filters and analyzers) for the index."""
        return {
            'filter': {
                'kuma_word_delimiter': {
                    'type': 'word_delimiter',
                    'preserve_original': True,  # hi-fi -> hifi, hi-fi
                    'catenate_words': True,  # hi-fi -> hifi
                    'catenate_numbers': True,  # 90-210 -> 90210
                }
            },
            'analyzer': {
                'default': {
                    'tokenizer': 'standard',
                    'filter': ['standard', 'elision']
                },
                # a custom analyzer that strips html and uses our own
                # word delimiter filter and the elision filter
                # (e.g. L'attribut -> attribut). The rest is the same as
                # the snowball analyzer
                'kuma_content': {
                    'type': 'custom',
                    'tokenizer': 'standard',
                    'char_filter': ['html_strip'],
                    'filter': [
                        'elision',
                        'kuma_word_delimiter',
                        'lowercase',
                        'standard',
                        'stop',
                        'snowball',
                    ],
                },
                'kuma_title': {
                    'type': 'custom',
                    'tokenizer': 'standard',
                    'filter': [
                        'elision',
                        'kuma_word_delimiter',
                        'lowercase',
                        'standard',
                        'snowball',
                    ],
                },
                'case_sensitive': {
                    'type': 'custom',
                    'tokenizer': 'keyword'
                },
                'case_insensitive_keyword': {
                    'type': 'custom',
                    'tokenizer': 'keyword',
                    'filter': 'lowercase'
                }
            },
        }

    @classmethod
    def get_settings(cls):
        """Return the index body: mappings plus analysis, replica and shard settings."""
        return {
            'mappings': cls.get_mapping(),
            'settings': {
                'analysis': cls.get_analysis(),
                'number_of_replicas': settings.ES_DEFAULT_NUM_REPLICAS,
                'number_of_shards': settings.ES_DEFAULT_NUM_SHARDS,
            }
        }

    @classmethod
    def bulk_index(cls, documents, id_field='id', es=None, index=None):
        """Index a bunch of documents.

        :arg documents: iterable of dicts to index.
        :arg id_field: key of each dict whose value is used as ``_id``.
        :arg es: connection to use; defaults to ``get_connection()``.
        :arg index: index name; defaults to ``get_index()``.
        """
        es = es or cls.get_connection()
        index = index or cls.get_index()
        doc_type = cls.get_doc_type()

        actions = [{
            '_index': index,
            '_type': doc_type,
            '_id': d[id_field],
            '_source': d
        } for d in documents]

        bulk(es, actions)

    @classmethod
    def bulk_delete(cls, ids, es=None, index=None):
        """Delete a bunch of documents by ID.

        :arg ids: iterable of document IDs to delete.
        :arg es: connection to use; defaults to ``get_connection()``.
        :arg index: index name; defaults to ``get_index()``.
        """
        es = es or cls.get_connection()
        index = index or cls.get_index()
        doc_type = cls.get_doc_type()

        actions = [{
            '_op_type': 'delete',
            '_index': index,
            '_type': doc_type,
            '_id': _id
        } for _id in ids]

        bulk(es, actions)

    @classmethod
    def get_index(cls):
        """Return the prefixed name of the current ``Index``."""
        # Imported here rather than at module level; presumably to avoid a
        # circular import -- TODO confirm.
        from kuma.search.models import Index
        return Index.objects.get_current().prefixed_name

    @classmethod
    def search(cls, **kwargs):
        """Return a ``Search`` bound to this type; ``kwargs`` override the defaults."""
        options = {
            'using': connections.get_connection(),
            'index': cls.get_index(),
            'doc_type': {
                cls._doc_type.name: cls.from_es
            },
        }
        options.update(kwargs)
        sq = Search(**options)

        return sq

    @classmethod
    def get_model(cls):
        """Return the Django ``Document`` model class."""
        from kuma.wiki.models import Document
        return Document

    @classmethod
    def get_indexable(cls, percent=100):
        """
        For this mapping type return a list of model IDs that should be
        indexed with the management command, in a full reindex.

        WARNING: When changing this code make sure to update the
                 ``should_update`` method below, too!

        """
        model = cls.get_model()

        excludes = [Q(slug__icontains=exclude)
                    for exclude in cls.exclude_slugs]

        qs = model.objects.filter(is_template=False,
                                  is_redirect=False,
                                  deleted=False)
        # ``reduce`` without an initial value raises on an empty sequence.
        if excludes:
            qs = qs.exclude(reduce(operator.or_, excludes))

        # Float division so an integer percent is not truncated to 0 under
        # Python 2 integer division.
        percent = percent / 100.0
        if percent < 1:
            qs = qs[:int(qs.count() * percent)]

        return qs.values_list('id', flat=True)

    @classmethod
    def should_update(cls, obj):
        """
        Given a Document instance should return boolean value
        whether the instance should be indexed or not.

        WARNING: This *must* mirror the logic of the ``get_indexable``
                 method above!
        """
        if obj.is_template or obj.is_redirect or obj.deleted:
            return False
        # ``get_indexable`` excludes with ``icontains`` (case-insensitive),
        # so compare case-insensitively here as well.
        slug = obj.slug.lower()
        return not any(exclude.lower() in slug
                       for exclude in cls.exclude_slugs)

    def get_excerpt(self):
        """Return the joined highlight of the first excerpt field, else the summary."""
        if getattr(self, 'highlight', False):
            for excerpt_field in self.excerpt_fields:
                if excerpt_field in self.highlight:
                    return u'…'.join(self.highlight[excerpt_field])
        return self.summary

    @classmethod
    def reindex_all(cls, chunk_size=500, index=None, percent=100):
        """Rebuild ElasticSearch indexes.

        :arg chunk_size: how many documents to bulk index as a single chunk.
        :arg index: the `Index` object to reindex into. Uses the current
            promoted index if none provided.
        :arg percent: 1 to 100--the percentage of the db to index.

        Returns a message describing the scheduled work.
        """
        from kuma.search.models import Index
        from kuma.search.tasks import prepare_index, finalize_index
        from kuma.wiki.tasks import index_documents

        index = index or Index.objects.get_current()

        # Get the list of document IDs to index.
        indexable = WikiDocumentType.get_indexable(percent)

        total = len(indexable)
        # Float division so a partial last chunk is counted under Python 2.
        total_chunks = int(ceil(total / float(chunk_size)))

        pre_task = prepare_index.si(index.pk)
        post_task = finalize_index.si(index.pk)

        if not total:
            # If there's no data we still create the index and finalize it.
            chain(pre_task, post_task).apply_async()
        else:
            index_tasks = [
                index_documents.si(chunk, index.pk)
                for chunk in chunked(indexable, chunk_size)
            ]
            chord_flow(pre_task, index_tasks, post_task).apply_async()

        # Translate the template first and fill in the values afterwards;
        # formatting before the lookup would never match a catalog entry.
        message = _(
            'Indexing {total} documents into {n} chunks of size {size} into '
            'index {index}.').format(total=total,
                                     n=total_chunks,
                                     size=chunk_size,
                                     index=index.prefixed_name)
        return message
예제 #24
0
class ProjectDoc(SerializedDoc):
    """Search document for a project."""

    identifier = field.String()
    name = field.String()
    alternate_name = field.String()
    # Description uses the ``html_strip`` analyzer and has a ``raw`` sub-field.
    description = field.String(
        fields={'raw': field.String(index='not_analyzed')},
        analyzer=html_strip,
    )
    status = field.String(
        fields={'raw': field.String(index='not_analyzed')},
    )
    start_year = field.Integer()
    # project_location aggregation/facet uses the raw multifield
    countries = field.Nested(
        doc_class=CountryDoc,
        properties={
            'name': field.String(
                fields={'raw': field.String(index='not_analyzed')},
            ),
        },
    )
    infrastructure_type = field.Object(
        properties={
            'name': field.String(
                fields={'raw': field.String(index='not_analyzed')},
            ),
        },
    )
    # Providing a doc_class for initiatives produced errors, so keep it simple!
    initiatives = field.Nested(properties={'name': field.String()})
    funding = field.Object(
        multi=True,
        properties={
            'sources': field.Object(
                multi=True,
                properties={
                    'name': field.String(
                        fields={'raw': field.String(index='not_analyzed')},
                    ),
                },
            ),
        },
    )
    regions = field.Nested(
        doc_class=RegionDoc,
        properties={
            'name': field.String(
                fields={'raw': field.String(index='not_analyzed')},
            ),
        },
    )

    def get_display_name(self):
        """Return the project's ``name``."""
        return self.name
예제 #25
0
class SimpleCommit(document.DocType):
    """Document type with a single multi-valued ``files`` string field."""
    files = field.String(multi=True)

    class Meta:
        # Index name declared for this document type.
        index = 'test-git'
예제 #26
0
class PlaceDoc(field.InnerObjectWrapper):
    """Inner-object wrapper for a place: city, country, label and display location."""
    # The string fields below each declare a ``raw`` sub-field with
    # index='not_analyzed'; ``country`` is an object wrapped by CountryDoc.
    city = field.String(fields={'raw': field.String(index='not_analyzed')})
    country = field.Object(doc_class=CountryDoc)
    label = field.String(fields={'raw': field.String(index='not_analyzed')})
    location_display = field.String(fields={'raw': field.String(index='not_analyzed')})
예제 #27
0
    class MyD(document.DocType):
        """Document type with a ``title`` field and an explicitly built mapping."""
        title = field.String()

        class Meta:
            # Build the 'my_d' mapping by hand and set its ``_all`` meta
            # option to enabled=False.
            mapping = Mapping('my_d')
            mapping.meta('_all', enabled=False)
예제 #28
0
def test_nested_provides_direct_access_to_its_fields():
    """Fields added with ``Nested.field`` are reachable via ``in`` and indexing."""
    nested = field.Nested()
    nested.field('name', 'string', index='not_analyzed')

    assert 'name' in nested

    name_field = nested['name']
    assert name_field == field.String(index='not_analyzed')
예제 #29
0
class DocWithNested(document.DocType):
    """Document type with a nested ``comments`` field that has a ``title`` property."""
    comments = field.Nested(properties={'title': field.String()})
예제 #30
0
class CategoryDoc(field.InnerObjectWrapper):
    """Inner-object wrapper with a single ``name`` string field."""
    name = field.String()