Example #1
def es_reindex_with_progress(doctypes=None, percent=100):
    """Rebuild Elastic indexes as you iterate over yielded progress ratios.

    :arg doctypes: Defaults to None which will index all doctypes.
        Otherwise indexes the doctypes specified. See
        :py:func:`.get_doctype_stats()` for what doctypes look like.
    :arg percent: Defaults to 100.  Allows you to specify how much of
        each doctype you want to index.  This is useful for
        development where doing a full reindex takes an hour.

    """
    from search.models import get_search_models

    es = elasticutils.get_es()

    search_models = get_search_models()
    if doctypes:
        search_models = [cls for cls in search_models
                         if cls._meta.db_table in doctypes]

    if len(search_models) == len(get_search_models()):
        index = settings.ES_INDEXES.get('default')
        if index is not None:
            # If we're indexing everything and there's a default index
            # specified in settings, then we delete and recreate it.
            es.delete_index_if_exists(index)
            es.create_index(index)

    total = sum([cls.objects.count() for cls in search_models])

    to_index = [cls.index_all(percent) for cls in search_models]

    return (float(done) / total for done, _ in
            izip(count(1), chain(*to_index)))
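
A minimal caller sketch for the generator above (the surrounding management-command wiring is assumed, not shown in these examples): drain the yielded ratios and report progress every so often.

import sys

def report_reindex_progress(percent=100):
    # Hypothetical consumer: each yielded value is done/total, so
    # multiplying by 100 gives the percent complete so far.
    for i, ratio in enumerate(es_reindex_with_progress(percent=percent)):
        if i % 1000 == 0:
            sys.stderr.write('%1.0f%% done\n' % (ratio * 100))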
Example #2
def recreate_index(es=None):
    """Deletes index if it's there and creates a new one"""
    if es is None:
        es = get_es()

    from search.models import get_search_models

    search_models = get_search_models()
    merged_mapping = {
        SUMO_DOCTYPE: {
            'properties':
            merge_mappings([(cls._meta.db_table, cls.get_mapping())
                            for cls in search_models])
        }
    }

    index = WRITE_INDEX
    delete_index(index)

    # There should be no mapping-conflict race here since the index doesn't
    # exist. Live indexing should just fail.

    # Simultaneously create the index and the mappings, so live
    # indexing doesn't get a chance to index anything between the two
    # and infer a bogus mapping (which ES then freaks out over when we
    # try to lay in an incompatible explicit mapping).

    es.create_index(index, settings={'mappings': merged_mapping})
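
merge_mappings() is not among these examples; a plausible sketch, assuming it folds the per-model (name, mapping) pairs into one properties dict and refuses conflicting field definitions (the real helper in es_utils.py may differ):

def merge_mappings(mappings):
    """Fold (name, mapping) pairs into one dict, rejecting conflicts."""
    merged = {}
    for name, mapping in mappings:
        for key, value in mapping.items():
            if key in merged and merged[key] != value:
                raise ValueError('%s redefines %r incompatibly' % (name, key))
            merged[key] = value
    return merged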
Example #3
File: es_utils.py Project: ibai/kitsune
def get_doctype_stats(index):
    """Returns a dict of name -> count for documents indexed.

    For example:

    >>> get_doctype_stats()
    {'questions_question': 14216, 'forums_thread': 419, 'wiki_document': 759}

    :throws pyes.urllib3.MaxRetryError: if it can't connect to elasticsearch
    :throws pyes.exceptions.IndexMissingException: if the index doesn't exist

    """
    from search.models import get_search_models

    conn = get_indexing_es()

    stats = {}
    for cls in get_search_models():
        query = pyes.query.TermQuery('model', cls.get_model_name())
        results = conn.count(query=query,
                             indexes=[index],
                             doc_types=[SUMO_DOCTYPE])
        stats[cls.get_model_name()] = results[u'count']

    return stats
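
A caller sketch for the documented failure modes, assuming a READ_INDEX constant that mirrors the WRITE_INDEX used elsewhere in these examples:

def safe_doctype_stats():
    # Hypothetical wrapper: treat an unreachable cluster or a missing
    # index as "no stats" instead of propagating the exception.
    try:
        return get_doctype_stats(READ_INDEX)
    except (pyes.urllib3.MaxRetryError,
            pyes.exceptions.IndexMissingException):
        return None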
Example #4
def es_reindex_with_progress(percent=100):
    """Rebuild Elastic indexes as you iterate over yielded progress ratios.

    :arg percent: Defaults to 100.  Allows you to specify how much of
        each doctype you want to index.  This is useful for
        development where doing a full reindex takes an hour.

    """
    from search.models import get_search_models

    search_models = get_search_models()

    es = elasticutils.get_es()
    index = settings.ES_INDEXES['default']
    es.delete_index_if_exists(index)
    # There should be no mapping-conflict race here since the index doesn't
    # exist. Live indexing should just fail.

    # Simultaneously create the index and the mappings, so live indexing
    # doesn't get a chance to index anything between the two and infer a bogus
    # mapping (which ES then freaks out over when we try to lay in an
    # incompatible explicit mapping).
    mappings = dict((cls._meta.db_table, {'properties': cls.get_mapping()})
                    for cls in search_models)
    es.create_index(index, settings={'mappings': mappings})

    total = sum([cls.objects.count() for cls in search_models])
    to_index = [cls.index_all(percent) for cls in search_models]
    return (float(done) / total for done, _ in
            izip(count(1), chain(*to_index)))
Example #5
def recreate_index(es=None):
    """Deletes index if it's there and creates a new one"""
    if es is None:
        es = get_indexing_es()

    from search.models import get_search_models

    search_models = get_search_models()
    merged_mapping = {
        SUMO_DOCTYPE: {
            'properties': merge_mappings(
                [(cls._meta.db_table, cls.get_mapping())
                 for cls in search_models])
            }
        }

    index = WRITE_INDEX
    delete_index(index)

    # There should be no mapping-conflict race here since the index doesn't
    # exist. Live indexing should just fail.

    # Simultaneously create the index and the mappings, so live
    # indexing doesn't get a chance to index anything between the two
    # and infer a bogus mapping (which ES then freaks out over when we
    # try to lay in an incompatible explicit mapping).

    es.create_index(index, settings={'mappings': merged_mapping})
Example #6
def get_doctype_stats(index):
    """Returns a dict of name -> count for documents indexed.

    For example:

    >>> get_doctype_stats()
    {'questions_question': 14216, 'forums_thread': 419, 'wiki_document': 759}

    :throws pyelasticsearch.exceptions.Timeout: if the request
        times out
    :throws pyelasticsearch.exceptions.ConnectionError: if there's a
        connection error
    :throws pyelasticsearch.exceptions.ElasticHttpNotFound: if the
        index doesn't exist

    """
    from search.models import get_search_models

    s = Sphilastic(object)

    stats = {}
    for cls in get_search_models():
        model_name = cls.get_model_name()
        stats[model_name] = s.filter(model=model_name).count()

    return stats
Example #7
File: admin.py Project: LASarkar/kitsune
def index_view(request):
    requested_bucket = request.GET.get('bucket', '')
    requested_id = request.GET.get('id', '')
    last_20_by_bucket = None
    data = None

    bucket_to_model = dict(
        [(cls._meta.db_table, cls) for cls in get_search_models()])

    if requested_bucket and requested_id:
        # Nix whitespace because I keep accidentally picking up spaces
        # when I copy and paste.
        requested_id = requested_id.strip()

        # The user wants to see a specific item in the index, so we
        # attempt to fetch it from the index and show that
        # specifically.
        if requested_bucket not in bucket_to_model:
            raise Http404

        cls = bucket_to_model[requested_bucket]
        data = list(cls.search().filter(id=requested_id).values_dict())
        if not data:
            raise Http404
        data = _fix_value_dicts(data)[0]

    else:
        # Create a list of (class, list-of-dicts) showing us the most
        # recently indexed items for each bucket. We only display the
        # id, title and indexed_on fields, so only pull those back from
        # ES.
        last_20_by_bucket = [
            (cls_name,
             _fix_value_dicts(cls.search()
                                 .values_dict()
                                 .order_by('-indexed_on')[:20]))
            for cls_name, cls in bucket_to_model.items()]

    return render(
        request,
        'admin/search_index.html',
        {'title': 'Index Browsing',
         'buckets': [cls_name for cls_name, cls in bucket_to_model.items()],
         'last_20_by_bucket': last_20_by_bucket,
         'requested_bucket': requested_bucket,
         'requested_id': requested_id,
         'requested_data': data
         })
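
_fix_value_dicts() is referenced by these admin views but not shown; a plausible sketch, assuming it only converts the indexed_on epoch seconds in each hit back into datetimes for display:

from datetime import datetime

def _fix_value_dicts(values_dicts):
    # Assumed behavior: ES hands back indexed_on as epoch seconds and
    # the template wants datetimes. The real helper may normalize more.
    for result in values_dicts:
        result['indexed_on'] = datetime.fromtimestamp(
            float(result['indexed_on']))
    return values_dicts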
Example #8
File: admin.py Project: ibai/kitsune
def index_view(request):
    requested_bucket = request.GET.get('bucket', '')
    requested_id = request.GET.get('id', '')
    last_20_by_bucket = None
    data = None

    bucket_to_model = dict([(cls._meta.db_table, cls)
                            for cls in get_search_models()])

    if requested_bucket and requested_id:
        # Nix whitespace because I keep accidentally picking up spaces
        # when I copy and paste.
        requested_id = requested_id.strip()

        # The user wants to see a specific item in the index, so we
        # attempt to fetch it from the index and show that
        # specifically.
        if requested_bucket not in bucket_to_model:
            raise Http404

        cls = bucket_to_model[requested_bucket]
        data = list(cls.search().filter(id=requested_id).values_dict())
        if not data:
            raise Http404
        data = _fix_value_dicts(data)[0]

    else:
        # Create a list of (class, list-of-dicts) showing us the most
        # recently indexed items for each bucket. We only display the
        # id, title and indexed_on fields, so only pull those back from
        # ES.
        last_20_by_bucket = [
            (cls_name,
             _fix_value_dicts(
                 cls.search().values_dict().order_by('-indexed_on')[:20]))
            for cls_name, cls in bucket_to_model.items()
        ]

    return render_to_response(
        'search/admin/index.html', {
            'title': 'Index Browsing',
            'buckets': [cls_name for cls_name, cls in bucket_to_model.items()],
            'last_20_by_bucket': last_20_by_bucket,
            'requested_bucket': requested_bucket,
            'requested_id': requested_id,
            'requested_data': data
        }, RequestContext(request, {}))
Example #9
def index_view(request):
    requested_bucket = request.GET.get("bucket", "")
    requested_id = request.GET.get("id", "")
    last_20_by_bucket = None
    data = None

    bucket_to_model = dict([(cls._meta.db_table, cls) for cls in get_search_models()])

    if requested_bucket and requested_id:
        # Nix whitespace because I keep accidentally picking up spaces
        # when I copy and paste.
        requested_id = requested_id.strip()

        # The user wants to see a specific item in the index, so we
        # attempt to fetch it from the index and show that
        # specifically.
        if requested_bucket not in bucket_to_model:
            raise Http404

        cls = bucket_to_model[requested_bucket]
        data = list(cls.search().filter(id=requested_id).values_dict())
        if not data:
            raise Http404
        data = _fix_value_dicts(data)[0]

    else:
        # Create a list of (class, list-of-dicts) showing us the most
        # recently indexed items for each bucket. We only display the
        # id, title and indexed_on fields, so only pull those back from
        # ES.
        last_20_by_bucket = [
            (cls_name, _fix_value_dicts(cls.search().values_dict().order_by("-indexed_on")[:20]))
            for cls_name, cls in bucket_to_model.items()
        ]

    return render_to_response(
        "search/admin/index.html",
        {
            "title": "Index Browsing",
            "buckets": [cls_name for cls_name, cls in bucket_to_model.items()],
            "last_20_by_bucket": last_20_by_bucket,
            "requested_bucket": requested_bucket,
            "requested_id": requested_id,
            "requested_data": data,
        },
        RequestContext(request, {}),
    )
Example #10
    def handle(self, *args, **options):
        logging.basicConfig(level=logging.INFO)
        percent = options['percent']
        if percent > 100 or percent < 1:
            raise CommandError('percent should be between 1 and 100')

        if args:
            search_models = get_search_models()
            possible_doctypes = dict((cls._meta.db_table, cls)
                                     for cls in search_models)
            for mem in args:
                if mem not in possible_doctypes:
                    raise CommandError('"%s" is not a valid doctype (%s)' %
                                       (mem, possible_doctypes.keys()))

        # args are the list of doctypes to index.
        es_reindex(args, percent)
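
This handle() implies a command class that declares a --percent option and takes doctypes as positional args. A sketch of that wiring under the old optparse-based Django management API (the help text and default are assumptions):

from optparse import make_option

from django.core.management.base import BaseCommand

class Command(BaseCommand):
    help = 'Reindex the database for Elastic.'
    option_list = BaseCommand.option_list + (
        make_option('--percent', type='int', dest='percent', default=100,
                    help='Reindex only this percent of each doctype.'),
    )
    # handle() is the method shown above.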
Example #11
def mapping_view(request):
    search_models = get_search_models()
    merged_mapping = {
        SUMO_DOCTYPE: {"properties": merge_mappings([(cls._meta.db_table, cls.get_mapping()) for cls in search_models])}
    }

    # TODO: This indents poorly and the results are hard to read.  I
    # think to do it better, we'd need to write our own pretty-printer
    # which isn't hard, but I'm pushing it off until we decide it's
    # necessary.
    merged_mapping = pformat(merged_mapping, indent=4)

    return render_to_response(
        "search/admin/mapping.html",
        {"title": "Mapping Browsing", "mapping": merged_mapping},
        RequestContext(request, {}),
    )
Example #12
def get_doctype_stats():
    """Returns a dict of name -> count for documents indexed.

    For example:

    >>> get_doctype_stats()
    {'questions_question': 14216, 'forums_thread': 419, 'wiki_document': 759}

    :throws pyes.urllib3.MaxRetryError: if it can't connect to elasticsearch
    :throws pyes.exceptions.IndexMissingException: if the index doesn't exist

    """
    from search.models import get_search_models

    stats = {}

    for cls in get_search_models():
        stats[cls._meta.db_table] = elasticutils.S(cls).count()

    return stats
Example #13
File: admin.py Project: ibai/kitsune
def mapping_view(request):
    search_models = get_search_models()
    merged_mapping = {
        SUMO_DOCTYPE: {
            'properties':
            merge_mappings([(cls._meta.db_table, cls.get_mapping())
                            for cls in search_models])
        }
    }

    # TODO: This indents poorly and the results are hard to read.  I
    # think to do it better, we'd need to write our own pretty-printer
    # which isn't hard, but I'm pushing it off until we decide it's
    # necessary.
    merged_mapping = pformat(merged_mapping, indent=4)

    return render_to_response('search/admin/mapping.html', {
        'title': 'Mapping Browsing',
        'mapping': merged_mapping
    }, RequestContext(request, {}))
Example #14
File: es_utils.py Project: klrmn/kitsune
def get_indexable(percent=100):
    """Returns a list of (class, iterable) for all the things to index

    :arg percent: Defaults to 100.  Allows you to specify how much of
        each doctype you want to index.  This is useful for
        development where doing a full reindex takes an hour.

    """
    from search.models import get_search_models

    search_models = get_search_models()
    to_index = []
    percent = float(percent) / 100
    for cls in search_models:
        indexable = cls.get_indexable()
        if percent < 1:
            indexable = indexable[:int(indexable.count() * percent)]
        to_index.append((cls, indexable))

    return to_index
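
A consumer sketch for the (class, iterable) pairs; index() and extract_document() are assumed names for the per-model helpers the reindexing code would call, and the iterable is assumed to yield model instances (if it yields bare ids, drop the .id):

def index_all_doctypes(percent=100):
    # Hypothetical driver: index the extracted document for everything
    # get_indexable() says is indexable.
    for cls, indexable in get_indexable(percent):
        for obj in indexable:
            cls.index(cls.extract_document(obj.id))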
Example #15
File: admin.py Project: LASarkar/kitsune
def mapping_view(request):
    search_models = get_search_models()
    merged_mapping = {
        SUMO_DOCTYPE: {
            'properties': merge_mappings(
                [(cls._meta.db_table, cls.get_mapping())
                 for cls in search_models])
            }
        }

    # TODO: This indents poorly and the results are hard to read.  I
    # think to do it better, we'd need to write our own pretty-printer
    # which isn't hard, but I'm pushing it off until we decide it's
    # necessary.
    merged_mapping = pformat(merged_mapping, indent=4)

    return render(
        request,
        'admin/search_mapping.html',
        {'title': 'Mapping Browsing',
         'mapping': merged_mapping
         })
Example #16
File: es_utils.py Project: ibai/kitsune
def get_indexable(percent=100, search_models=None):
    """Returns a list of (class, iterable) for all the things to index

    :arg percent: Defaults to 100.  Allows you to specify how much of
        each doctype you want to index.  This is useful for
        development where doing a full reindex takes an hour.

    """
    from search.models import get_search_models

    # Note: Passing in None will get all the models.
    search_models = get_search_models(search_models)

    to_index = []
    percent = float(percent) / 100
    for cls in search_models:
        indexable = cls.get_indexable()
        if percent < 1:
            indexable = indexable[:int(indexable.count() * percent)]
        to_index.append((cls, indexable))

    return to_index
Example #17
def get_doctype_stats(index):
    """Returns a dict of name -> count for documents indexed.

    For example:

    >>> get_doctype_stats()
    {'questions_question': 14216, 'forums_thread': 419, 'wiki_document': 759}

    :throws pyes.urllib3.MaxRetryError: if it can't connect to elasticsearch
    :throws pyes.exceptions.IndexMissingException: if the index doesn't exist

    """
    from search.models import get_search_models

    conn = get_indexing_es()

    stats = {}
    for cls in get_search_models():
        query = pyes.query.TermQuery("model", cls.get_model_name())
        results = conn.count(query=query, indexes=[index], doc_types=[SUMO_DOCTYPE])
        stats[cls.get_model_name()] = results[u"count"]

    return stats
Example #18
def get_doctype_stats(index):
    """Returns a dict of name -> count for documents indexed.

    For example:

    >>> get_doctype_stats()
    {'questions_question': 14216, 'forums_thread': 419, 'wiki_document': 759}

    :throws pyes.urllib3.MaxRetryError: if it can't connect to elasticsearch
    :throws pyes.exceptions.IndexMissingException: if the index doesn't exist

    """
    from search.models import get_search_models

    es = elasticutils.get_es()
    query = pyes.query.MatchAllQuery()

    stats = {}

    for cls in get_search_models():
        stats[cls._meta.db_table] = es.count(query, indexes=[index], doc_types=[cls._meta.db_table])["count"]

    return stats
Example #19
def search(request, template=None):
    """ES-specific search view"""

    # JSON-specific variables
    is_json = (request.GET.get('format') == 'json')
    callback = request.GET.get('callback', '').strip()
    mimetype = 'application/x-javascript' if callback else 'application/json'

    # Search "Expires" header format
    expires_fmt = '%A, %d %B %Y %H:%M:%S GMT'

    # Check callback is valid
    if is_json and callback and not jsonp_is_valid(callback):
        return HttpResponse(
            json.dumps({'error': _('Invalid callback function.')}),
            mimetype=mimetype, status=400)

    language = locale_or_default(request.GET.get('language', request.locale))
    r = request.GET.copy()
    a = request.GET.get('a', '0')

    # Search default values
    try:
        category = (map(int, r.getlist('category')) or
                    settings.SEARCH_DEFAULT_CATEGORIES)
    except ValueError:
        category = settings.SEARCH_DEFAULT_CATEGORIES
    r.setlist('category', category)

    # Basic form
    if a == '0':
        r['w'] = r.get('w', constants.WHERE_BASIC)
    # Advanced form
    if a == '2':
        r['language'] = language
        r['a'] = '1'

    # TODO: Rewrite so SearchForm is unbound initially and we can use
    # `initial` on the form fields.
    if 'include_archived' not in r:
        r['include_archived'] = False

    search_form = SearchForm(r)

    if not search_form.is_valid() or a == '2':
        if is_json:
            return HttpResponse(
                json.dumps({'error': _('Invalid search data.')}),
                mimetype=mimetype,
                status=400)

        t = template if request.MOBILE else 'search/form.html'
        search_ = jingo.render(request, t,
                               {'advanced': a, 'request': request,
                                'search_form': search_form})
        search_['Cache-Control'] = 'max-age=%s' % \
                                   (settings.SEARCH_CACHE_PERIOD * 60)
        search_['Expires'] = (datetime.utcnow() +
                              timedelta(
                                minutes=settings.SEARCH_CACHE_PERIOD)) \
                              .strftime(expires_fmt)
        return search_

    cleaned = search_form.cleaned_data

    if request.MOBILE and cleaned['w'] == constants.WHERE_BASIC:
        cleaned['w'] = constants.WHERE_WIKI

    page = max(smart_int(request.GET.get('page')), 1)
    offset = (page - 1) * settings.SEARCH_RESULTS_PER_PAGE

    lang = language.lower()
    if settings.LANGUAGES.get(lang):
        lang_name = settings.LANGUAGES[lang]
    else:
        lang_name = ''

    # Woah! object?! Yeah, so what happens is that Sphilastic is
    # really an elasticutils.S and that requires a Django ORM model
    # argument. That argument only gets used if you want object
    # results--for every hit it gets back from ES, it creates an
    # object of the type of the Django ORM model you passed in. We use
    # object here to satisfy the need for a type in the constructor
    # and make sure we don't ever ask for object results.
    searcher = Sphilastic(object)

    wiki_f = F(model='wiki_document')
    question_f = F(model='questions_question')
    discussion_f = F(model='forums_thread')

    # Start - wiki filters

    if cleaned['w'] & constants.WHERE_WIKI:
        # Category filter
        if cleaned['category']:
            wiki_f &= F(document_category__in=cleaned['category'])

        # Locale filter
        wiki_f &= F(document_locale=language)

        # Product filter
        products = cleaned['product']
        for p in products:
            wiki_f &= F(document_product=p)

        # Topics filter
        topics = cleaned['topics']
        for t in topics:
            wiki_f &= F(document_topic=t)

        # Archived bit
        if a == '0' and not cleaned['include_archived']:
            # Default to NO for basic search:
            cleaned['include_archived'] = False
        if not cleaned['include_archived']:
            wiki_f &= F(document_is_archived=False)

    # End - wiki filters

    # Start - support questions filters

    if cleaned['w'] & constants.WHERE_SUPPORT:

        # Solved is set by default if using basic search
        if a == '0' and not cleaned['has_helpful']:
            cleaned['has_helpful'] = constants.TERNARY_YES

        # These filters are ternary, they can be either YES, NO, or OFF
        ternary_filters = ('is_locked', 'is_solved', 'has_answers',
                           'has_helpful')
        d = dict(('question_%s' % filter_name,
                  _ternary_filter(cleaned[filter_name]))
                 for filter_name in ternary_filters if cleaned[filter_name])
        if d:
            question_f &= F(**d)

        if cleaned['asked_by']:
            question_f &= F(question_creator=cleaned['asked_by'])

        if cleaned['answered_by']:
            question_f &= F(question_answer_creator=cleaned['answered_by'])

        q_tags = [t.strip() for t in cleaned['q_tags'].split(',')]
        for t in q_tags:
            if t:
                question_f &= F(question_tag=t)

    # End - support questions filters

    # Start - discussion forum filters

    if cleaned['w'] & constants.WHERE_DISCUSSION:
        if cleaned['author']:
            discussion_f &= F(post_author_ord=cleaned['author'])

        if cleaned['thread_type']:
            if constants.DISCUSSION_STICKY in cleaned['thread_type']:
                discussion_f &= F(post_is_sticky=1)

            if constants.DISCUSSION_LOCKED in cleaned['thread_type']:
                discussion_f &= F(post_is_locked=1)

        if cleaned['forum']:
            discussion_f &= F(post_forum_id__in=cleaned['forum'])

    # End - discussion forum filters

    # Created filter
    unix_now = int(time.time())
    interval_filters = (
        ('created', cleaned['created'], cleaned['created_date']),
        ('updated', cleaned['updated'], cleaned['updated_date']))
    for filter_name, filter_option, filter_date in interval_filters:
        if filter_option == constants.INTERVAL_BEFORE:
            before = {filter_name + '__gte': 0,
                      filter_name + '__lte': max(filter_date, 0)}

            discussion_f &= F(**before)
            question_f &= F(**before)
        elif filter_option == constants.INTERVAL_AFTER:
            after = {filter_name + '__gte': min(filter_date, unix_now),
                     filter_name + '__lte': unix_now}

            discussion_f &= F(**after)
            question_f &= F(**after)

    # In basic search, we limit questions from the last
    # SEARCH_DEFAULT_MAX_QUESTION_AGE seconds.
    if a == '0':
        start_date = unix_now - settings.SEARCH_DEFAULT_MAX_QUESTION_AGE
        question_f &= F(created__gte=start_date)

    # Note: num_voted (with a d) is a different field than num_votes
    # (with an s). The former is a dropdown and the latter is an
    # integer value.
    if cleaned['num_voted'] == constants.INTERVAL_BEFORE:
        question_f &= F(question_num_votes__lte=max(cleaned['num_votes'], 0))
    elif cleaned['num_voted'] == constants.INTERVAL_AFTER:
        question_f &= F(question_num_votes__gte=cleaned['num_votes'])

    # Done with all the filtery stuff--time to generate results

    # Combine all the filters and add to the searcher
    final_filter = F()
    if cleaned['w'] & constants.WHERE_WIKI:
        final_filter |= wiki_f

    if cleaned['w'] & constants.WHERE_SUPPORT:
        final_filter |= question_f

    if cleaned['w'] & constants.WHERE_DISCUSSION:
        final_filter |= discussion_f

    searcher = searcher.filter(final_filter)

    if 'explain' in request.GET and request.GET['explain'] == '1':
        searcher = searcher.explain()

    documents = ComposedList()

    try:
        cleaned_q = cleaned['q']

        # Set up the highlights
        # First 500 characters of content in one big fragment
        searcher = searcher.highlight(
            'question_content', 'discussion_content', 'document_summary',
            pre_tags=['<b>'],
            post_tags=['</b>'],
            number_of_fragments=0,
            fragment_size=500)

        # Set up boosts
        searcher = searcher.boost(
            question_title=4.0,
            question_content=3.0,
            question_answer_content=3.0,
            post_title=2.0,
            post_content=1.0,
            document_title=6.0,
            document_content=1.0,
            document_keywords=4.0,
            document_summary=2.0,

            # Text phrases in document titles and content get an extra
            # boost.
            document_title__text_phrase=10.0,
            document_content__text_phrase=8.0)

        # Apply sortby for advanced search of questions
        if cleaned['w'] == constants.WHERE_SUPPORT:
            sortby = cleaned['sortby']
            try:
                searcher = searcher.order_by(
                    *constants.SORT_QUESTIONS[sortby])
            except IndexError:
                # Skip index errors because they imply the user is
                # sending us sortby values that aren't valid.
                pass

        # Apply sortby for advanced search of kb documents
        if cleaned['w'] == constants.WHERE_WIKI:
            sortby = cleaned['sortby_documents']
            try:
                searcher = searcher.order_by(
                    *constants.SORT_DOCUMENTS[sortby])
            except IndexError:
                # Skip index errors because they imply the user is
                # sending us sortby values that aren't valid.
                pass

        # Build the query
        if cleaned_q:
            query_fields = chain(*[cls.get_query_fields()
                                   for cls in get_search_models()])

            query = {}
            # Create text and text_phrase queries for every field
            # we want to search.
            for field in query_fields:
                for query_type in ['text', 'text_phrase']:
                    query['%s__%s' % (field, query_type)] = cleaned_q

            searcher = searcher.query(or_=query)

        num_results = min(searcher.count(), settings.SEARCH_MAX_RESULTS)

        # TODO - Can ditch the ComposedList here, but we need
        # something that paginate can use to figure out the paging.
        documents = ComposedList()
        documents.set_count(('results', searcher), num_results)

        results_per_page = settings.SEARCH_RESULTS_PER_PAGE
        pages = paginate(request, documents, results_per_page)

        # If we know there aren't any results, let's cheat and in
        # doing that, not hit ES again.
        if num_results == 0:
            searcher = []
        else:
            # Get the documents we want to show and add them to
            # docs_for_page
            documents = documents[offset:offset + results_per_page]

            if len(documents) == 0:
                # If the user requested a page that's beyond the
                # pagination, then documents is an empty list and
                # there are no results to show.
                searcher = []
            else:
                bounds = documents[0][1]
                searcher = searcher.values_dict()[bounds[0]:bounds[1]]

        results = []
        for i, doc in enumerate(searcher):
            rank = i + offset

            if doc['model'] == 'wiki_document':
                summary = _build_es_excerpt(doc)
                if not summary:
                    summary = doc['document_summary']
                result = {
                    'title': doc['document_title'],
                    'type': 'document'}

            elif doc['model'] == 'questions_question':
                summary = _build_es_excerpt(doc)
                if not summary:
                    # We're excerpting only question_content, so if
                    # the query matched question_title or
                    # question_answer_content, then there won't be any
                    # question_content excerpts. In that case, just
                    # show the question--but only the first 500
                    # characters.
                    summary = bleach.clean(
                        doc['question_content'], strip=True)[:500]

                result = {
                    'title': doc['question_title'],
                    'type': 'question',
                    'is_solved': doc['question_is_solved'],
                    'num_answers': doc['question_num_answers'],
                    'num_votes': doc['question_num_votes'],
                    'num_votes_past_week': doc['question_num_votes_past_week']}

            else:
                summary = _build_es_excerpt(doc)
                result = {
                    'title': doc['post_title'],
                    'type': 'thread'}

            result['url'] = doc['url']
            result['object'] = ObjectDict(doc)
            result['search_summary'] = summary
            result['rank'] = rank
            result['score'] = doc._score
            result['explanation'] = escape(format_explanation(
                    doc._explanation))
            results.append(result)

    except (ESTimeoutError, ESMaxRetryError, ESException), exc:
        # Handle timeout and all those other transient errors with a
        # "Search Unavailable" rather than a Django error page.
        if is_json:
            return HttpResponse(json.dumps({'error':
                                             _('Search Unavailable')}),
                                mimetype=mimetype, status=503)

        if isinstance(exc, ESTimeoutError):
            statsd.incr('search.esunified.timeouterror')
        elif isinstance(exc, ESMaxRetryError):
            statsd.incr('search.esunified.maxretryerror')
        elif isinstance(exc, ESException):
            statsd.incr('search.esunified.elasticsearchexception')

        import logging
        logging.exception(exc)

        t = 'search/mobile/down.html' if request.MOBILE else 'search/down.html'
        return jingo.render(request, t, {'q': cleaned['q']}, status=503)
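
_ternary_filter() is used by all of these search views but is not among the examples; a plausible sketch, assuming the YES/NO dropdown values map onto the booleans stored in ES:

def _ternary_filter(ternary_value):
    # Assumed mapping: TERNARY_YES -> True, TERNARY_NO -> False.
    # TERNARY_OFF never reaches here because the callers skip falsy
    # values when building the filter dict.
    return ternary_value == constants.TERNARY_YES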
Example #20
def search_with_es_unified(request, template=None):
    """ES-specific search view"""

    # Time ES and Sphinx separate. See bug 723930.
    # TODO: Remove this once Sphinx is gone.
    start = time.time()

    # JSON-specific variables
    is_json = (request.GET.get('format') == 'json')
    callback = request.GET.get('callback', '').strip()
    mimetype = 'application/x-javascript' if callback else 'application/json'

    # Search "Expires" header format
    expires_fmt = '%A, %d %B %Y %H:%M:%S GMT'

    # Check callback is valid
    if is_json and callback and not jsonp_is_valid(callback):
        return HttpResponse(
            json.dumps({'error': _('Invalid callback function.')}),
            mimetype=mimetype, status=400)

    language = locale_or_default(request.GET.get('language', request.locale))
    r = request.GET.copy()
    a = request.GET.get('a', '0')

    # Search default values
    try:
        category = (map(int, r.getlist('category')) or
                    settings.SEARCH_DEFAULT_CATEGORIES)
    except ValueError:
        category = settings.SEARCH_DEFAULT_CATEGORIES
    r.setlist('category', category)

    # Basic form
    if a == '0':
        r['w'] = r.get('w', constants.WHERE_BASIC)
    # Advanced form
    if a == '2':
        r['language'] = language
        r['a'] = '1'

    # TODO: Rewrite so SearchForm is unbound initially and we can use
    # `initial` on the form fields.
    if 'include_archived' not in r:
        r['include_archived'] = False

    search_form = SearchForm(r)

    if not search_form.is_valid() or a == '2':
        if is_json:
            return HttpResponse(
                json.dumps({'error': _('Invalid search data.')}),
                mimetype=mimetype,
                status=400)

        t = template if request.MOBILE else 'search/form.html'
        search_ = jingo.render(request, t,
                               {'advanced': a, 'request': request,
                                'search_form': search_form})
        search_['Cache-Control'] = 'max-age=%s' % \
                                   (settings.SEARCH_CACHE_PERIOD * 60)
        search_['Expires'] = (datetime.utcnow() +
                              timedelta(
                                minutes=settings.SEARCH_CACHE_PERIOD)) \
                              .strftime(expires_fmt)
        return search_

    cleaned = search_form.cleaned_data

    page = max(smart_int(request.GET.get('page')), 1)
    offset = (page - 1) * settings.SEARCH_RESULTS_PER_PAGE

    lang = language.lower()
    if settings.LANGUAGES.get(lang):
        lang_name = settings.LANGUAGES[lang]
    else:
        lang_name = ''

    # Woah! object?! Yeah, so what happens is that Sphilastic is
    # really an elasticutils.S and that requires a Django ORM model
    # argument. That argument only gets used if you want object
    # results--for every hit it gets back from ES, it creates an
    # object of the type of the Django ORM model you passed in. We use
    # object here to satisfy the need for a type in the constructor
    # and make sure we don't ever ask for object results.
    searcher = Sphilastic(object)

    wiki_f = F()
    question_f = F()
    discussion_f = F()

    # Start - wiki filters

    if cleaned['w'] & constants.WHERE_WIKI:
        # Category filter
        if cleaned['category']:
            wiki_f &= F(document_category__in=cleaned['category'])

        # Locale filter
        wiki_f &= F(document_locale=language)

        # Product filter
        products = cleaned['product']
        for p in products:
            wiki_f &= F(document_tag=p)

        # Tags filter
        tags = [t.strip() for t in cleaned['tags'].split()]
        for t in tags:
            wiki_f &= F(document_tag=t)

        # Archived bit
        if a == '0' and not cleaned['include_archived']:
            # Default to NO for basic search:
            cleaned['include_archived'] = False
        if not cleaned['include_archived']:
            wiki_f &= F(document_is_archived=False)

    # End - wiki filters

    # Start - support questions filters

    if cleaned['w'] & constants.WHERE_SUPPORT:

        # Solved is set by default if using basic search
        if a == '0' and not cleaned['has_helpful']:
            cleaned['has_helpful'] = constants.TERNARY_YES

        # These filters are ternary, they can be either YES, NO, or OFF
        ternary_filters = ('is_locked', 'is_solved', 'has_answers',
                           'has_helpful')
        d = dict(('question_%s' % filter_name,
                  _ternary_filter(cleaned[filter_name]))
                 for filter_name in ternary_filters if cleaned[filter_name])
        if d:
            question_f &= F(**d)

        if cleaned['asked_by']:
            question_f &= F(question_creator=cleaned['asked_by'])

        if cleaned['answered_by']:
            question_f &= F(question_answer_creator=cleaned['answered_by'])

        q_tags = [t.strip() for t in cleaned['q_tags'].split()]
        for t in q_tags:
            question_f &= F(question_tag=t)

    # End - support questions filters

    # Start - discussion forum filters

    if cleaned['w'] & constants.WHERE_DISCUSSION:
        if cleaned['author']:
            discussion_f &= F(post_author_ord=cleaned['author'])

        if cleaned['thread_type']:
            if constants.DISCUSSION_STICKY in cleaned['thread_type']:
                discussion_f &= F(post_is_sticky=1)

            if constants.DISCUSSION_LOCKED in cleaned['thread_type']:
                discussion_f &= F(post_is_locked=1)

        if cleaned['forum']:
            discussion_f &= F(post_form_id__in=cleaned['forum'])

    # End - discussion forum filters

    # Created filter
    unix_now = int(time.time())
    interval_filters = (
        ('created', cleaned['created'], cleaned['created_date']),
        ('updated', cleaned['updated'], cleaned['updated_date']))
    for filter_name, filter_option, filter_date in interval_filters:
        if filter_option == constants.INTERVAL_BEFORE:
            before = {filter_name + '__gte': 0,
                      filter_name + '__lte': max(filter_date, 0)}

            discussion_f &= F(**before)
            question_f &= F(**before)
        elif filter_option == constants.INTERVAL_AFTER:
            after = {filter_name + '__gte': min(filter_date, unix_now),
                     filter_name + '__lte': unix_now}

            discussion_f &= F(**after)
            question_f &= F(**after)

    # Note: num_voted (with a d) is a different field than num_votes
    # (with an s). The former is a dropdown and the latter is an
    # integer value.
    if cleaned['num_voted'] == constants.INTERVAL_BEFORE:
        question_f &= F(question_num_votes__lte=max(cleaned['num_votes'], 0))
    elif cleaned['num_voted'] == constants.INTERVAL_AFTER:
        question_f &= F(question_num_votes__gte=cleaned['num_votes'])

    # Done with all the filtery stuff--time to generate results

    documents = ComposedList()
    try:
        cleaned_q = cleaned['q']

        # Add all the filters
        searcher = searcher.filter(question_f | wiki_f | discussion_f)

        # Set up the highlights
        searcher = searcher.highlight(
            'question_title', 'question_content', 'question_answer_content',
            'discussion_content',
            before_match='<b>',
            after_match='</b>',
            limit=settings.SEARCH_SUMMARY_LENGTH)

        # Set up weights
        searcher = searcher.weight(
            question_title__text=4, question_content__text=3,
            question_answer_content__text=3,
            post_title__text=2, post_content__text=1,
            document_title__text=6, document_content__text=1,
            document_keywords__text=4, document_summary__text=2)

        # Apply sortby, but only for advanced search for questions
        if a == '1' and cleaned['w'] & constants.WHERE_SUPPORT:
            sortby = smart_int(request.GET.get('sortby'))
            try:
                searcher = searcher.order_by(
                    *constants.SORT_QUESTIONS_ES[sortby])
            except IndexError:
                # Skip index errors because they imply the user is
                # sending us sortby values that aren't valid.
                pass

        # Build the query
        if cleaned_q:
            query_fields = chain(*[cls.get_query_fields()
                                   for cls in get_search_models()])

            query = dict((field, cleaned_q) for field in query_fields)

            searcher = searcher.query(or_=query)

        # TODO - Can ditch the ComposedList here, but we need
        # something that paginate can use to figure out the paging.
        documents = ComposedList()
        documents.set_count(('results', searcher),
                            min(searcher.count(), settings.SEARCH_MAX_RESULTS))

        results_per_page = settings.SEARCH_RESULTS_PER_PAGE
        pages = paginate(request, documents, results_per_page)
        num_results = len(documents)

        # Get the documents we want to show and add them to
        # docs_for_page. Guard against a request for a page beyond the
        # pagination, where the slice comes back empty.
        documents = documents[offset:offset + results_per_page]

        if len(documents) == 0:
            searcher = []
        else:
            bounds = documents[0][1]
            searcher = searcher.values_dict()[bounds[0]:bounds[1]]

        results = []
        for i, doc in enumerate(searcher):
            rank = i + offset

            if doc['model'] == 'wiki_document':
                summary = doc['document_summary']
                result = {
                    'title': doc['document_title'],
                    'type': 'document'}

            elif doc['model'] == 'questions_question':
                summary = _build_es_excerpt(doc)
                result = {
                    'title': doc['question_title'],
                    'type': 'question',
                    'is_solved': doc['question_is_solved'],
                    'num_answers': doc['question_num_answers'],
                    'num_votes': doc['question_num_votes'],
                    'num_votes_past_week': doc['question_num_votes_past_week']}

            else:
                summary = _build_es_excerpt(doc)
                result = {
                    'title': doc['post_title'],
                    'type': 'thread'}

            result['url'] = doc['url']
            result['object'] = ObjectDict(doc)
            result['search_summary'] = summary
            result['rank'] = rank
            result['score'] = doc._score
            results.append(result)

    except (ESTimeoutError, ESMaxRetryError, ESException), exc:
        # Handle timeout and all those other transient errors with a
        # "Search Unavailable" rather than a Django error page.
        if is_json:
            return HttpResponse(json.dumps({'error':
                                             _('Search Unavailable')}),
                                mimetype=mimetype, status=503)

        if isinstance(exc, ESTimeoutError):
            statsd.incr('search.timeouterror.elastic.unified')
        elif isinstance(exc, ESMaxRetryError):
            statsd.incr('search.maxretryerror.elastic.unified')
        elif isinstance(exc, ESException):
            statsd.incr('search.elasticsearchexception.elastic.unified')

        import logging
        logging.exception(exc)

        t = 'search/mobile/down.html' if request.MOBILE else 'search/down.html'
        return jingo.render(request, t, {'q': cleaned['q']}, status=503)
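
The views also lean on _build_es_excerpt(), which is not shown; a plausible sketch, assuming elasticutils exposes the highlight fragments on each hit as a _highlight mapping of field name to fragment list:

import bleach

def _build_es_excerpt(result):
    # Assumed behavior: take the first highlighted fragment ES returned
    # for whichever field matched, and strip everything but the <b>
    # match markers. Returns '' when the hit had no highlights.
    highlight = getattr(result, '_highlight', None) or {}
    for fragments in highlight.values():
        if fragments:
            return bleach.clean(fragments[0], tags=['b'], strip=True)
    return ''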
Example #21
File: views.py Project: bituka/kitsune
def search(request, template=None):
    """ES-specific search view"""

    # JSON-specific variables
    is_json = (request.GET.get('format') == 'json')
    callback = request.GET.get('callback', '').strip()
    mimetype = 'application/x-javascript' if callback else 'application/json'

    # Search "Expires" header format
    expires_fmt = '%A, %d %B %Y %H:%M:%S GMT'

    # Check callback is valid
    if is_json and callback and not jsonp_is_valid(callback):
        return HttpResponse(
            json.dumps({'error': _('Invalid callback function.')}),
            mimetype=mimetype, status=400)

    language = locale_or_default(
        request.GET.get('language', request.LANGUAGE_CODE))
    r = request.GET.copy()
    a = request.GET.get('a', '0')

    # Search default values
    try:
        category = (map(int, r.getlist('category')) or
                    settings.SEARCH_DEFAULT_CATEGORIES)
    except ValueError:
        category = settings.SEARCH_DEFAULT_CATEGORIES
    r.setlist('category', category)

    # Basic form
    if a == '0':
        r['w'] = r.get('w', constants.WHERE_BASIC)
    # Advanced form
    if a == '2':
        r['language'] = language
        r['a'] = '1'

    # TODO: Rewrite so SearchForm is unbound initially and we can use
    # `initial` on the form fields.
    if 'include_archived' not in r:
        r['include_archived'] = False

    search_form = SearchForm(r)

    if not search_form.is_valid() or a == '2':
        if is_json:
            return HttpResponse(
                json.dumps({'error': _('Invalid search data.')}),
                mimetype=mimetype,
                status=400)

        t = template if request.MOBILE else 'search/form.html'
        search_ = render(request, t, {
            'advanced': a, 'request': request,
            'search_form': search_form})
        search_['Cache-Control'] = 'max-age=%s' % \
                                   (settings.SEARCH_CACHE_PERIOD * 60)
        search_['Expires'] = (datetime.utcnow() +
                              timedelta(
                                minutes=settings.SEARCH_CACHE_PERIOD)) \
                              .strftime(expires_fmt)
        return search_

    cleaned = search_form.cleaned_data

    if request.MOBILE and cleaned['w'] == constants.WHERE_BASIC:
        cleaned['w'] = constants.WHERE_WIKI

    page = max(smart_int(request.GET.get('page')), 1)
    offset = (page - 1) * settings.SEARCH_RESULTS_PER_PAGE

    lang = language.lower()
    if settings.LANGUAGES.get(lang):
        lang_name = settings.LANGUAGES[lang]
    else:
        lang_name = ''

    # Woah! object?! Yeah, so what happens is that Sphilastic is
    # really an elasticutils.S and that requires a Django ORM model
    # argument. That argument only gets used if you want object
    # results--for every hit it gets back from ES, it creates an
    # object of the type of the Django ORM model you passed in. We use
    # object here to satisfy the need for a type in the constructor
    # and make sure we don't ever ask for object results.
    searcher = Sphilastic(object)

    wiki_f = F(model='wiki_document')
    question_f = F(model='questions_question')
    discussion_f = F(model='forums_thread')

    # Start - wiki filters

    if cleaned['w'] & constants.WHERE_WIKI:
        # Category filter
        if cleaned['category']:
            wiki_f &= F(document_category__in=cleaned['category'])

        # Locale filter
        wiki_f &= F(document_locale=language)

        # Product filter
        products = cleaned['product']
        for p in products:
            wiki_f &= F(product=p)

        # Topics filter
        topics = cleaned['topics']
        for t in topics:
            wiki_f &= F(topic=t)

        # Archived bit
        if a == '0' and not cleaned['include_archived']:
            # Default to NO for basic search:
            cleaned['include_archived'] = False
        if not cleaned['include_archived']:
            wiki_f &= F(document_is_archived=False)

    # End - wiki filters

    # Start - support questions filters

    if cleaned['w'] & constants.WHERE_SUPPORT:

        # Solved is set by default if using basic search
        if a == '0' and not cleaned['has_helpful']:
            cleaned['has_helpful'] = constants.TERNARY_YES

        # These filters are ternary, they can be either YES, NO, or OFF
        ternary_filters = ('is_locked', 'is_solved', 'has_answers',
                           'has_helpful')
        d = dict(('question_%s' % filter_name,
                  _ternary_filter(cleaned[filter_name]))
                 for filter_name in ternary_filters if cleaned[filter_name])
        if d:
            question_f &= F(**d)

        if cleaned['asked_by']:
            question_f &= F(question_creator=cleaned['asked_by'])

        if cleaned['answered_by']:
            question_f &= F(question_answer_creator=cleaned['answered_by'])

        q_tags = [t.strip() for t in cleaned['q_tags'].split(',')]
        for t in q_tags:
            if t:
                question_f &= F(question_tag=t)

        # Product filter
        products = cleaned['product']
        for p in products:
            question_f &= F(product=p)

        # Topics filter
        topics = cleaned['topics']
        for t in topics:
            question_f &= F(topic=t)

    # End - support questions filters

    # Start - discussion forum filters

    if cleaned['w'] & constants.WHERE_DISCUSSION:
        if cleaned['author']:
            discussion_f &= F(post_author_ord=cleaned['author'])

        if cleaned['thread_type']:
            if constants.DISCUSSION_STICKY in cleaned['thread_type']:
                discussion_f &= F(post_is_sticky=1)

            if constants.DISCUSSION_LOCKED in cleaned['thread_type']:
                discussion_f &= F(post_is_locked=1)

        if cleaned['forum']:
            discussion_f &= F(post_forum_id__in=cleaned['forum'])

    # End - discussion forum filters

    # Created filter
    unix_now = int(time.time())
    interval_filters = (
        ('created', cleaned['created'], cleaned['created_date']),
        ('updated', cleaned['updated'], cleaned['updated_date']))
    for filter_name, filter_option, filter_date in interval_filters:
        if filter_option == constants.INTERVAL_BEFORE:
            before = {filter_name + '__gte': 0,
                      filter_name + '__lte': max(filter_date, 0)}

            discussion_f &= F(**before)
            question_f &= F(**before)
        elif filter_option == constants.INTERVAL_AFTER:
            after = {filter_name + '__gte': min(filter_date, unix_now),
                     filter_name + '__lte': unix_now}

            discussion_f &= F(**after)
            question_f &= F(**after)

    # In basic search, we limit questions from the last
    # SEARCH_DEFAULT_MAX_QUESTION_AGE seconds.
    if a == '0':
        start_date = unix_now - settings.SEARCH_DEFAULT_MAX_QUESTION_AGE
        question_f &= F(created__gte=start_date)

    # Note: num_voted (with a d) is a different field than num_votes
    # (with an s). The former is a dropdown and the latter is an
    # integer value.
    if cleaned['num_voted'] == constants.INTERVAL_BEFORE:
        question_f &= F(question_num_votes__lte=max(cleaned['num_votes'], 0))
    elif cleaned['num_voted'] == constants.INTERVAL_AFTER:
        question_f &= F(question_num_votes__gte=cleaned['num_votes'])

    # Done with all the filtery stuff--time to generate results

    # Combine all the filters and add to the searcher
    final_filter = F()
    if cleaned['w'] & constants.WHERE_WIKI:
        final_filter |= wiki_f

    if cleaned['w'] & constants.WHERE_SUPPORT:
        final_filter |= question_f

    if cleaned['w'] & constants.WHERE_DISCUSSION:
        final_filter |= discussion_f

    searcher = searcher.filter(final_filter)

    if 'explain' in request.GET and request.GET['explain'] == '1':
        searcher = searcher.explain()

    documents = ComposedList()

    try:
        cleaned_q = cleaned['q']

        # Set up the highlights
        # First 500 characters of content in one big fragment
        searcher = searcher.highlight(
            'question_content', 'discussion_content', 'document_summary',
            pre_tags=['<b>'],
            post_tags=['</b>'],
            number_of_fragments=0,
            fragment_size=500)

        # Set up boosts
        searcher = searcher.boost(
            question_title=4.0,
            question_content=3.0,
            question_answer_content=3.0,
            post_title=2.0,
            post_content=1.0,
            document_title=6.0,
            document_content=1.0,
            document_keywords=8.0,
            document_summary=2.0,

            # Text phrases in document titles and content get an extra
            # boost.
            document_title__text_phrase=10.0,
            document_content__text_phrase=8.0)

        # Apply sortby for advanced search of questions
        if cleaned['w'] == constants.WHERE_SUPPORT:
            sortby = cleaned['sortby']
            try:
                searcher = searcher.order_by(
                    *constants.SORT_QUESTIONS[sortby])
            except IndexError:
                # Skip index errors because they imply the user is
                # sending us sortby values that aren't valid.
                pass

        # Apply sortby for advanced search of kb documents
        if cleaned['w'] == constants.WHERE_WIKI:
            sortby = cleaned['sortby_documents']
            try:
                searcher = searcher.order_by(
                    *constants.SORT_DOCUMENTS[sortby])
            except IndexError:
                # Skip index errors because they imply the user is
                # sending us sortby values that aren't valid.
                pass

        # Build the query
        if cleaned_q:
            query_fields = chain(*[cls.get_query_fields()
                                   for cls in get_search_models()])

            query = {}
            # Create text and text_phrase queries for every field
            # we want to search.
            for field in query_fields:
                for query_type in ['text', 'text_phrase']:
                    query['%s__%s' % (field, query_type)] = cleaned_q

            searcher = searcher.query(or_=query)

        num_results = min(searcher.count(), settings.SEARCH_MAX_RESULTS)

        # TODO - Can ditch the ComposedList here, but we need
        # something that paginate can use to figure out the paging.
        documents = ComposedList()
        documents.set_count(('results', searcher), num_results)

        results_per_page = settings.SEARCH_RESULTS_PER_PAGE
        pages = paginate(request, documents, results_per_page)

        # Facets
        product_facets = {}

        # If we know there aren't any results, let's cheat and in
        # doing that, not hit ES again.
        if num_results == 0:
            searcher = []
        else:
            # Get the documents we want to show and add them to
            # docs_for_page
            documents = documents[offset:offset + results_per_page]

            if len(documents) == 0:
                # If the user requested a page that's beyond the
                # pagination, then documents is an empty list and
                # there are no results to show.
                searcher = []
            else:
                bounds = documents[0][1]
                searcher = searcher.values_dict()[bounds[0]:bounds[1]]

                # If we are doing basic search, we show product facets.
                if a == '0':
                    pfc = searcher.facet(
                        'product', filtered=True).facet_counts()
                    product_facets = dict(
                        [(p['term'], p['count']) for p in pfc['product']])

        results = []
        for i, doc in enumerate(searcher):
            rank = i + offset

            if doc['model'] == 'wiki_document':
                summary = _build_es_excerpt(doc)
                if not summary:
                    summary = doc['document_summary']
                result = {
                    'title': doc['document_title'],
                    'type': 'document'}

            elif doc['model'] == 'questions_question':
                summary = _build_es_excerpt(doc)
                if not summary:
                    # We're excerpting only question_content, so if
                    # the query matched question_title or
                    # question_answer_content, then there won't be any
                    # question_content excerpts. In that case, just
                    # show the question--but only the first 500
                    # characters.
                    summary = bleach.clean(
                        doc['question_content'], strip=True)[:500]

                result = {
                    'title': doc['question_title'],
                    'type': 'question',
                    'is_solved': doc['question_is_solved'],
                    'num_answers': doc['question_num_answers'],
                    'num_votes': doc['question_num_votes'],
                    'num_votes_past_week': doc['question_num_votes_past_week']}

            else:
                summary = _build_es_excerpt(doc)
                result = {
                    'title': doc['post_title'],
                    'type': 'thread'}

            result['url'] = doc['url']
            result['object'] = ObjectDict(doc)
            result['search_summary'] = summary
            result['rank'] = rank
            result['score'] = doc._score
            result['explanation'] = escape(format_explanation(
                    doc._explanation))
            results.append(result)

    except ES_EXCEPTIONS as exc:
        # Handle timeout and all those other transient errors with a
        # "Search Unavailable" rather than a Django error page.
        if is_json:
            return HttpResponse(json.dumps({'error':
                                             _('Search Unavailable')}),
                                mimetype=mimetype, status=503)

        # Cheating here: Convert from 'Timeout()' to 'timeout' so
        # we have less code, but still have good stats.
        exc_bucket = repr(exc).lower().strip('()')
        statsd.incr('search.esunified.{0}'.format(exc_bucket))

        import logging
        logging.exception(exc)

        t = 'search/mobile/down.html' if request.MOBILE else 'search/down.html'
        return render(request, t, {'q': cleaned['q']}, status=503)

    items = [(k, v) for k in search_form.fields for
             v in r.getlist(k) if v and k != 'a']
    items.append(('a', '2'))

    if is_json:
        # Models are not json serializable.
        for r in results:
            del r['object']
        data = {}
        data['results'] = results
        data['total'] = len(results)
        data['query'] = cleaned['q']
        if not results:
            data['message'] = _('No pages matched the search criteria')
        json_data = json.dumps(data)
        if callback:
            json_data = callback + '(' + json_data + ');'

        return HttpResponse(json_data, mimetype=mimetype)

    fallback_results = None
    if num_results == 0:
        fallback_results = _fallback_results(language, cleaned['product'])

    results_ = render(request, template, {
        'num_results': num_results,
        'results': results,
        'fallback_results': fallback_results,
        'q': cleaned['q'],
        'w': cleaned['w'],
        'product': cleaned['product'],
        'products': Product.objects.filter(visible=True),
        'product_facets': product_facets,
        'pages': pages,
        'search_form': search_form,
        'lang_name': lang_name, })
    results_['Cache-Control'] = 'max-age=%s' % \
                                (settings.SEARCH_CACHE_PERIOD * 60)
    results_['Expires'] = (datetime.utcnow() +
                           timedelta(minutes=settings.SEARCH_CACHE_PERIOD)) \
                           .strftime(expires_fmt)
    results_.set_cookie(settings.LAST_SEARCH_COOKIE, urlquote(cleaned['q']),
                        max_age=3600, secure=False, httponly=False)

    return results_
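
The search views in this listing lean on a _build_es_excerpt helper that the snippets never define. Since the searcher asks ES for highlighted fragments (wrapped in <b>/</b> tags and capped at SEARCH_SUMMARY_LENGTH), a minimal sketch would simply join whatever fragments came back for a hit. Everything below is an assumption: the _highlight attribute name, the EXCERPT_JOINER separator, and the absence of whatever HTML cleanup the real helper presumably does.

from itertools import chain

EXCERPT_JOINER = u'...'  # assumed separator between fragments

def _build_es_excerpt(result):
    # Hypothetical sketch -- the real helper is not shown in these
    # snippets. Join the highlight fragments ES attached to this hit;
    # callers treat an empty string as "no excerpt" and fall back to
    # the stored summary or raw content.
    fragments = chain(*result._highlight.values())
    return EXCERPT_JOINER.join(m.strip() for m in fragments if m)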
Example #23
File: views.py Project: ibai/kitsune
def search(request, template=None):
    """ES-specific search view"""

    # JSON-specific variables
    is_json = (request.GET.get('format') == 'json')
    callback = request.GET.get('callback', '').strip()
    mimetype = 'application/x-javascript' if callback else 'application/json'

    # Search "Expires" header format
    expires_fmt = '%A, %d %B %Y %H:%M:%S GMT'

    # Check callback is valid
    if is_json and callback and not jsonp_is_valid(callback):
        return HttpResponse(
            json.dumps({'error': _('Invalid callback function.')}),
            mimetype=mimetype, status=400)

    language = locale_or_default(request.GET.get('language', request.locale))
    r = request.GET.copy()
    a = request.GET.get('a', '0')

    # Search default values
    try:
        category = (map(int, r.getlist('category')) or
                    settings.SEARCH_DEFAULT_CATEGORIES)
    except ValueError:
        category = settings.SEARCH_DEFAULT_CATEGORIES
    r.setlist('category', category)

    # Basic form
    if a == '0':
        r['w'] = r.get('w', constants.WHERE_BASIC)
    # Advanced form
    if a == '2':
        r['language'] = language
        r['a'] = '1'

    # TODO: Rewrite so SearchForm is unbound initially and we can use
    # `initial` on the form fields.
    if 'include_archived' not in r:
        r['include_archived'] = False

    search_form = SearchForm(r)

    if not search_form.is_valid() or a == '2':
        if is_json:
            return HttpResponse(
                json.dumps({'error': _('Invalid search data.')}),
                mimetype=mimetype,
                status=400)

        t = template if request.MOBILE else 'search/form.html'
        search_ = jingo.render(request, t,
                               {'advanced': a, 'request': request,
                                'search_form': search_form})
        search_['Cache-Control'] = 'max-age=%s' % \
                                   (settings.SEARCH_CACHE_PERIOD * 60)
        search_['Expires'] = (datetime.utcnow() +
                              timedelta(
                                minutes=settings.SEARCH_CACHE_PERIOD)) \
                              .strftime(expires_fmt)
        return search_

    cleaned = search_form.cleaned_data

    page = max(smart_int(request.GET.get('page')), 1)
    offset = (page - 1) * settings.SEARCH_RESULTS_PER_PAGE

    lang = language.lower()
    if settings.LANGUAGES.get(lang):
        lang_name = settings.LANGUAGES[lang]
    else:
        lang_name = ''

    # Woah! object?! Yeah, so what happens is that Sphilastic is
    # really an elasticutils.S and that requires a Django ORM model
    # argument. That argument only gets used if you want object
    # results--for every hit it gets back from ES, it creates an
    # object of the type of the Django ORM model you passed in. We use
    # object here to satisfy the need for a type in the constructor
    # and make sure we don't ever ask for object results.
    searcher = Sphilastic(object)

    wiki_f = F(model='wiki_document')
    question_f = F(model='questions_question')
    discussion_f = F(model='forums_thread')

    # Start - wiki filters

    if cleaned['w'] & constants.WHERE_WIKI:
        # Category filter
        if cleaned['category']:
            wiki_f &= F(document_category__in=cleaned['category'])

        # Locale filter
        wiki_f &= F(document_locale=language)

        # Product filter
        products = cleaned['product']
        for p in products:
            wiki_f &= F(document_product=p)

        # Topics filter
        topics = cleaned['topics']
        for t in topics:
            wiki_f &= F(document_topic=t)

        # Archived bit
        if a == '0' and not cleaned['include_archived']:
            # Default to NO for basic search:
            cleaned['include_archived'] = False
        if not cleaned['include_archived']:
            wiki_f &= F(document_is_archived=False)

    # End - wiki filters

    # Start - support questions filters

    if cleaned['w'] & constants.WHERE_SUPPORT:

        # Solved is set by default if using basic search
        if a == '0' and not cleaned['has_helpful']:
            cleaned['has_helpful'] = constants.TERNARY_YES

        # These filters are ternary, they can be either YES, NO, or OFF
        ternary_filters = ('is_locked', 'is_solved', 'has_answers',
                           'has_helpful')
        d = dict(('question_%s' % filter_name,
                  _ternary_filter(cleaned[filter_name]))
                 for filter_name in ternary_filters if cleaned[filter_name])
        if d:
            question_f &= F(**d)

        if cleaned['asked_by']:
            question_f &= F(question_creator=cleaned['asked_by'])

        if cleaned['answered_by']:
            question_f &= F(question_answer_creator=cleaned['answered_by'])

        q_tags = [t.strip() for t in cleaned['q_tags'].split(',')]
        for t in q_tags:
            if t:
                question_f &= F(question_tag=t)

    # End - support questions filters

    # Start - discussion forum filters

    if cleaned['w'] & constants.WHERE_DISCUSSION:
        if cleaned['author']:
            discussion_f &= F(post_author_ord=cleaned['author'])

        if cleaned['thread_type']:
            if constants.DISCUSSION_STICKY in cleaned['thread_type']:
                discussion_f &= F(post_is_sticky=1)

            if constants.DISCUSSION_LOCKED in cleaned['thread_type']:
                discussion_f &= F(post_is_locked=1)

        if cleaned['forum']:
            discussion_f &= F(post_forum_id__in=cleaned['forum'])

    # End - discussion forum filters

    # Created filter
    unix_now = int(time.time())
    interval_filters = (
        ('created', cleaned['created'], cleaned['created_date']),
        ('updated', cleaned['updated'], cleaned['updated_date']))
    for filter_name, filter_option, filter_date in interval_filters:
        if filter_option == constants.INTERVAL_BEFORE:
            before = {filter_name + '__gte': 0,
                      filter_name + '__lte': max(filter_date, 0)}

            discussion_f &= F(**before)
            question_f &= F(**before)
        elif filter_option == constants.INTERVAL_AFTER:
            after = {filter_name + '__gte': min(filter_date, unix_now),
                     filter_name + '__lte': unix_now}

            discussion_f &= F(**after)
            question_f &= F(**after)

    # Note: num_voted (with a d) is a different field than num_votes
    # (with an s). The former is a dropdown and the latter is an
    # integer value.
    if cleaned['num_voted'] == constants.INTERVAL_BEFORE:
        question_f &= F(question_num_votes__lte=max(cleaned['num_votes'], 0))
    elif cleaned['num_voted'] == constants.INTERVAL_AFTER:
        question_f &= F(question_num_votes__gte=cleaned['num_votes'])

    # Done with all the filtery stuff--time to generate results

    # Combine all the filters and add to the searcher
    final_filter = F()
    if cleaned['w'] & constants.WHERE_WIKI:
        final_filter |= wiki_f

    if cleaned['w'] & constants.WHERE_SUPPORT:
        final_filter |= question_f

    if cleaned['w'] & constants.WHERE_DISCUSSION:
        final_filter |= discussion_f

    searcher = searcher.filter(final_filter)

    if request.GET.get('explain') == '1':
        searcher = searcher.explain()

    documents = ComposedList()
    try:
        cleaned_q = cleaned['q']

        # Set up the highlights
        searcher = searcher.highlight(
            'question_title', 'question_content', 'question_answer_content',
            'discussion_content',
            pre_tags=['<b>'],
            post_tags=['</b>'],
            fragment_size=settings.SEARCH_SUMMARY_LENGTH)

        # Set up boosts
        searcher = searcher.boost(
            question_title=4.0,
            question_content=3.0,
            question_answer_content=3.0,
            post_title=2.0,
            post_content=1.0,
            document_title=6.0,
            document_content=1.0,
            document_keywords=4.0,
            document_summary=2.0)

        # Apply sortby, but only for advanced search of questions
        if a == '1' and cleaned['w'] & constants.WHERE_SUPPORT:
            sortby = smart_int(request.GET.get('sortby'))
            try:
                searcher = searcher.order_by(
                    *constants.SORT_QUESTIONS[sortby])
            except IndexError:
                # Skip index errors because they imply the user is
                # sending us sortby values that aren't valid.
                pass

        # Build the query
        if cleaned_q:
            query_fields = chain(*[cls.get_query_fields()
                                   for cls in get_search_models()])

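            # Note: unlike the first view above, which queried each
            # field with both 'text' and 'text_phrase', this matches
            # every field with the backend's default query type.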
            query = dict((field, cleaned_q) for field in query_fields)

            searcher = searcher.query(or_=query)

        num_results = min(searcher.count(), settings.SEARCH_MAX_RESULTS)

        # TODO - Can ditch the ComposedList here, but we need
        # something that paginate can use to figure out the paging.
        documents = ComposedList()
        documents.set_count(('results', searcher), num_results)

        results_per_page = settings.SEARCH_RESULTS_PER_PAGE
        pages = paginate(request, documents, results_per_page)

        # If we already know there aren't any results, cheat and
        # don't hit ES again.
        if num_results == 0:
            searcher = []
        else:
            # Narrow the ComposedList down to the slice of
            # documents to show on this page.
            documents = documents[offset:offset + results_per_page]

            if len(documents) == 0:
                # If the user requested a page that's beyond the
                # pagination, then documents is an empty list and
                # there are no results to show.
                searcher = []
            else:
                bounds = documents[0][1]
                searcher = searcher.values_dict()[bounds[0]:bounds[1]]

        results = []
        for i, doc in enumerate(searcher):
            rank = i + offset

            if doc['model'] == 'wiki_document':
                summary = doc['document_summary']
                result = {
                    'title': doc['document_title'],
                    'type': 'document'}

            elif doc['model'] == 'questions_question':
                summary = _build_es_excerpt(doc)
                result = {
                    'title': doc['question_title'],
                    'type': 'question',
                    'is_solved': doc['question_is_solved'],
                    'num_answers': doc['question_num_answers'],
                    'num_votes': doc['question_num_votes'],
                    'num_votes_past_week': doc['question_num_votes_past_week']}

            else:
                summary = _build_es_excerpt(doc)
                result = {
                    'title': doc['post_title'],
                    'type': 'thread'}

            result['url'] = doc['url']
            result['object'] = ObjectDict(doc)
            result['search_summary'] = summary
            result['rank'] = rank
            result['score'] = doc._score
            result['explanation'] = escape(format_explanation(
                    doc._explanation))
            results.append(result)

    except (ESTimeoutError, ESMaxRetryError, ESException) as exc:
        # Handle timeout and all those other transient errors with a
        # "Search Unavailable" rather than a Django error page.
        if is_json:
            return HttpResponse(json.dumps({'error':
                                             _('Search Unavailable')}),
                                mimetype=mimetype, status=503)

        if isinstance(exc, ESTimeoutError):
            statsd.incr('search.esunified.timeouterror')
        elif isinstance(exc, ESMaxRetryError):
            statsd.incr('search.esunified.maxretryerror')
        elif isinstance(exc, ESException):
            statsd.incr('search.esunified.elasticsearchexception')

        import logging
        logging.exception(exc)

        t = 'search/mobile/down.html' if request.MOBILE else 'search/down.html'
        return jingo.render(request, t, {'q': cleaned['q']}, status=503)
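
Neither view defines the _ternary_filter helper behind the YES/NO/OFF question filters. It is only called for truthy cleaned values and its result is fed to boolean fields like question_is_solved, so a sketch consistent with that call site might look like the following; the body is an assumption, not the project's actual code, and constants is the same module the views import.

def _ternary_filter(ternary_value):
    # Hypothetical sketch -- the real helper is not included above.
    # Map the form's ternary dropdown onto the boolean stored in the
    # Elastic document: TERNARY_YES -> True, TERNARY_NO -> False.
    # OFF values are falsy and never reach this helper because the
    # callers skip them.
    return ternary_value == constants.TERNARY_YES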