Пример #1
0
 def __init__(self):
     """
     Build the recommender state from a fresh ``Config`` instance.

     Steps, in order: open the desktop-apps xapian index (and the popcon
     one when ``cfg.popcon`` is truthy), read the "desktopapps" and
     "debtags" filter files from ``cfg.filters_dir``, choose the xapian
     weighting scheme, and apply the configured strategy through
     ``set_strategy``.
     """
     cfg = Config()
     self.cfg = cfg

     # Only the desktop-apps indexes are opened here; the popcon index is
     # optional and its attribute exists only when cfg.popcon is set.
     self.axi_desktopapps = xapian.Database(cfg.axi_desktopapps)
     if cfg.popcon:
         self.popcon_desktopapps = xapian.Database(cfg.popcon_desktopapps)

     def read_filter(name):
         # Filter file format: one package or tag name per line; lines
         # starting with "#" are skipped, the rest are whitespace-stripped.
         with open(os.path.join(cfg.filters_dir, name)) as handle:
             return [entry.strip() for entry in handle
                     if not entry.startswith("#")]

     # Start from empty lists so the attributes exist even if a filter
     # file cannot be opened and the exception propagates.
     self.valid_desktopapps = []
     self.valid_tags = []
     logging.info("Loading recommender filters")
     self.valid_desktopapps = read_filter("desktopapps")
     self.valid_tags = read_filter("debtags")

     # Anything other than "bm25" falls back to the traditional weighting.
     if cfg.weight != "bm25":
         self.weight = xapian.TradWeight()
     else:
         self.weight = xapian.BM25Weight(cfg.bm25_k1, cfg.bm25_k2,
                                         cfg.bm25_k3, cfg.bm25_b,
                                         cfg.bm25_nl)
     self.set_strategy(cfg.strategy)
Пример #2
0
def select_weight(option):
    """
    Return a freshly constructed xapian weighting scheme for ``option``.

    Option codes and the scheme each one builds:
        0  BB2Weight(1.0)
        1  BM25PlusWeight(1.0, 0, 1.0, 0.5, 0.5, 1.0)
        2  BM25Weight(1.0, 0.0, 1.0, 0.5, 0.3)
        3  BoolWeight()
        4  CoordWeight()
        5  DLHWeight()
        6  DPHWeight()
        7  IfB2Weight(1)
        8  IneB2Weight(1)
        9  InL2Weight(1)
        10 LMWeight(0.0, 1, -1.0, -1.0)
        11 PL2PlusWeight(1, 0.8)
        12 PL2Weight(1)
        13 TfIdfWeight("ntn")
        14 TradWeight(1.0)

    Raises:
        ValueError: if ``option`` is not one of the codes above. (The
            previous implementation fell through to ``return bm`` with
            ``bm`` unbound and failed with an opaque UnboundLocalError.)
    """
    # Factories are lazy so that only the requested scheme is constructed.
    factories = {
        0: lambda: xapian.BB2Weight(1.0),
        1: lambda: xapian.BM25PlusWeight(1.0, 0, 1.0, 0.5, 0.5, 1.0),
        2: lambda: xapian.BM25Weight(1.0, 0.0, 1.0, 0.5, 0.3),
        3: lambda: xapian.BoolWeight(),
        4: lambda: xapian.CoordWeight(),
        # NOTE(review): the original author flagged DLHWeight with
        # "maybe some problem" -- the concern is not specified; confirm.
        5: lambda: xapian.DLHWeight(),
        6: lambda: xapian.DPHWeight(),
        7: lambda: xapian.IfB2Weight(1),
        8: lambda: xapian.IneB2Weight(1),
        9: lambda: xapian.InL2Weight(1),
        # The second argument (1) selects TWO_STAGE_SMOOTHING.
        10: lambda: xapian.LMWeight(0.0, 1, -1.0, -1.0),
        11: lambda: xapian.PL2PlusWeight(1, 0.8),
        12: lambda: xapian.PL2Weight(1),
        13: lambda: xapian.TfIdfWeight("ntn"),
        14: lambda: xapian.TradWeight(1.0),
    }

    try:
        factory = factories.get(option)
    except TypeError:
        # Unhashable option (e.g. a list) can never be a valid code.
        factory = None

    if factory is None:
        raise ValueError(
            "unknown weighting option %r (expected an integer 0-14)"
            % (option,))

    return factory()
Пример #3
0
    def search(self,
               query,
               sort_by=None,
               start_offset=0,
               end_offset=None,
               fields='',
               highlight=False,
               facets=None,
               date_facets=None,
               query_facets=None,
               narrow_queries=None,
               spelling_query=None,
               limit_to_registered_models=True,
               result_class=None,
               **kwargs):
        """
        Run the given ``xapian.Query`` against the database and collect the
        matches.

        Required arguments:
            `query` -- The xapian query to execute

        Optional arguments:
            `sort_by` -- Iterable of field names to sort on; a leading '-'
                marks the field as reversed (default = None)
            `start_offset` -- First match to fetch (default = 0)
            `end_offset` -- Passed on to `_get_enquire_mset`; when falsy it
                defaults to the database document count minus
                `start_offset` (default = None)
            `fields` -- Accepted but not used by this method (default = '')
            `highlight` -- Add a 'highlighted' entry for the content field
                to every result (default = False)
            `facets` -- Fields to facet on (default = None)
            `date_facets` -- Date ranges to facet on (default = None)
            `query_facets` -- Queries to facet on (default = None)
            `narrow_queries` -- Extra query strings; each is parsed and
                AND-ed onto `query` (default = None)
            `spelling_query` -- Optional query handed to the spelling
                suggestion helper (default = None)
            `limit_to_registered_models` -- Restrict matches to the models
                returned by `build_models_list` (default = True)
            `result_class` -- Class used to wrap each match; falls back to
                `SearchResult` (default = None)
            `**kwargs` -- Accepted but not used by this method

        Returns:
            When `query` is empty, ``{'results': [], 'hits': 0}``.

            Otherwise a dictionary with the keys:
                `results` -- A list of `result_class` instances
                `hits` -- The value reported by `_get_hit_count`
                `facets` -- Always present; a dictionary with the keys
                    `fields`, `dates` and `queries`, each an empty dict
                    unless the matching facet argument was supplied
                `spelling_suggestion` -- The suggestion string, or '' when
                    `self.include_spelling` is not True
        """
        if xapian.Query.empty(query):
            return {'results': [], 'hits': 0}

        if result_class is None:
            result_class = SearchResult

        database = self._database()

        # The suggestion is computed from the query exactly as passed in,
        # i.e. before narrowing and model restrictions are AND-ed on.
        spelling_suggestion = (
            self._do_spelling_suggestion(database, query, spelling_query)
            if self.include_spelling is True else '')

        if narrow_queries is not None:
            parsed_narrowers = [
                self.parse_query(narrow_query)
                for narrow_query in narrow_queries
            ]
            narrowing = xapian.Query(xapian.Query.OP_AND, parsed_narrowers)
            query = xapian.Query(xapian.Query.OP_AND, query, narrowing)

        if limit_to_registered_models:
            registered_models = self.build_models_list()
            if len(registered_models) > 0:
                model_terms = [
                    xapian.Query('%s%s' % (DOCUMENT_CT_TERM_PREFIX, model))
                    for model in registered_models
                ]
                any_registered_model = xapian.Query(xapian.Query.OP_OR,
                                                    model_terms)
                query = xapian.Query(xapian.Query.OP_AND, query,
                                     any_registered_model)

        enquire = xapian.Enquire(database)
        if hasattr(settings, 'HAYSTACK_XAPIAN_WEIGHTING_SCHEME'):
            # The setting is unpacked as BM25Weight's positional arguments.
            bm25 = xapian.BM25Weight(
                *settings.HAYSTACK_XAPIAN_WEIGHTING_SCHEME)
            enquire.set_weighting_scheme(bm25)
        enquire.set_query(query)

        if sort_by:
            sorter = xapian.MultiValueSorter()
            for sort_field in sort_by:
                # Reverse is inverted in Xapian --
                # http://trac.xapian.org/ticket/311
                reverse = sort_field.startswith('-')
                if reverse:
                    sort_field = sort_field[1:]  # drop the '-' marker
                sorter.add(self._value_column(sort_field), reverse)
            enquire.set_sort_by_key_then_relevance(sorter, True)

        if not end_offset:
            end_offset = database.get_doccount() - start_offset

        matches = self._get_enquire_mset(database, enquire, start_offset,
                                         end_offset)

        results = []
        for match in matches:
            # NOTE: the stored document data is unpickled as-is, so the
            # index contents are trusted here.
            raw_data = self._get_document_data(database, match.document)
            app_label, module_name, pk, model_data = pickle.loads(raw_data)
            if highlight:
                content_field = self.content_field_name
                model_data['highlighted'] = {
                    content_field: self._do_highlight(
                        model_data.get(content_field), query),
                }
            results.append(
                result_class(app_label, module_name, pk, match.percent,
                             **model_data))

        # Facet helpers run only for the facet kinds that were requested.
        facets_dict = {
            'fields': (self._do_field_facets(results, facets)
                       if facets else {}),
            'dates': (self._do_date_facets(results, date_facets)
                      if date_facets else {}),
            'queries': (self._do_query_facets(results, query_facets)
                        if query_facets else {}),
        }

        return {
            'results': results,
            'hits': self._get_hit_count(database, enquire),
            'facets': facets_dict,
            'spelling_suggestion': spelling_suggestion,
        }
Пример #4
0
    def get_context_data(self):
        """
        Build the template context for a search request.

        Reads ``q`` (query text) and ``offset`` (first result rank, clamped
        to >= 0, defaulting to 0 when missing or non-numeric) from the GET
        parameters.

        If redis holds a ``special_search:<mission>:<q>`` value, switches
        ``self.template_name`` to "search/special.html" and returns only
        ``{"q": ..., "text": ...}``.

        Otherwise runs the query against the xappy database, restricted to
        the current mission and its main transcript, and returns a dict
        with the keys ``log_lines``, ``result``, ``q``, ``previous_page``,
        ``next_page``, ``pages``, ``debug`` and ``error``.

        ``previous_page`` / ``next_page`` are URLs, or False when there is
        no such page. ``pages`` is a list of page objects (attributes
        ``number``, ``url``, ``selected``) with the string '...' inserted
        wherever the page numbers are not contiguous.
        """
        request = self.request

        # Get the query text
        q = request.GET.get('q', '')
        # Get the offset value; negative or malformed values become 0
        try:
            offset = max(int(request.GET.get('offset', '0')), 0)
        except ValueError:
            offset = 0

        # Is it a special search?
        special_value = request.redis_conn.get("special_search:%s:%s" % (
            request.mission.name,
            q,
        ))
        if special_value:
            self.template_name = "search/special.html"
            return {
                "q": q,
                "text": special_value,
            }

        # Get the results from Xapian.
        # NOTE(review): the connection is not closed here; `results` is
        # handed to the template and may still need it -- confirm before
        # adding an explicit close.
        db = xappy.SearchConnection(
            os.path.join(
                settings.SITE_ROOT,
                '..',
                "xappydb",
            ),
        )
        db.set_weighting_scheme(
            xapian.BM25Weight(
                1, # k1
                0, # k2
                1, # k3
                0.5, # b
                2, # min_normlen
            )
        )
        query = db.query_parse(
            q,
            default_op=db.OP_OR,
            deny = [ "mission" ],
        )
        query = db.query_filter(
            query,
            db.query_composite(db.OP_AND, [
                db.query_field("mission", request.mission.name),
                db.query_field("transcript", request.mission.main_transcript),
            ])
        )
        results = db.search(
            query=query,
            startrank=offset,
            endrank=offset+PAGESIZE,
            checkatleast=-1, # everything (entire xapian db fits in memory, so this should be fine)
            sortby="-weight",
        )

        # Go through the results, building a list of LogLine objects.
        # Result ids have the form "<transcript_name>:<timestamp>".
        redis_conn = request.redis_conn
        log_lines = []
        for result in results:
            transcript_name, timestamp = result.id.split(":", 1)
            log_line = LogLine(redis_conn, transcript_name, int(timestamp))
            log_line.speaker = Character(redis_conn, transcript_name.split('/')[0], result.data['speaker_identifier'][0])
            # Title: short summary without highlighting; summary: longer,
            # with matches wrapped in <mark> tags.
            log_line.title = mark_safe(result.summarise("text", maxlen=50, ellipsis='&hellip;', strict_length=True, hl=None))
            log_line.summary = mark_safe(result.summarise("text", maxlen=600, ellipsis='&hellip;', hl=('<mark>', '</mark>')))
            log_lines.append(log_line)

        def page_url(offset):
            return reverse("search") + '?' + urllib.urlencode({
                'q': q.encode('utf-8'),
                'offset': offset,
            })

        total = results.matches_estimated

        if offset == 0:
            previous_page = False
        else:
            # Clamp so an offset smaller than one page never yields a
            # negative offset in the link.
            previous_page = page_url(max(offset - PAGESIZE, 0))

        # This page covers ranks [offset, offset + PAGESIZE); when that
        # reaches `total` exactly there is nothing left for a next page.
        if offset + PAGESIZE >= total:
            next_page = False
        else:
            next_page = page_url(offset + PAGESIZE)

        # Zero-based page indexes. The last page is the one holding result
        # number `total - 1`; with no results it is page 0.
        thispage = offset // PAGESIZE
        maxpage = max(total - 1, 0) // PAGESIZE

        # Show the first page, the last page and the current page with its
        # immediate neighbours.
        pages_to_show = {0, thispage - 1, thispage, thispage + 1, maxpage}
        if thispage == 0:
            pages_to_show.discard(thispage - 1)
        if thispage >= maxpage:
            pages_to_show.discard(thispage + 1)

        class Page(object):
            def __init__(self, number, url, selected=False):
                self.number = number
                self.url = url
                self.selected = selected

        pages = []
        previous_index = None
        for page in sorted(pages_to_show):
            # Insert an ellipsis wherever the shown pages skip a number.
            if previous_index is not None and page != previous_index + 1:
                pages.append('...')
            # Displayed page numbers are one-based.
            pages.append(Page(page+1, page_url(page*PAGESIZE), page==thispage))
            previous_index = page

        error_info = request.redis_conn.hgetall(
            "error_page:%s:%s" % (
                request.mission.name,
                'no_search_results',
            ),
        ) or {}
        if 'classic_moment_quote' in error_info:
            error_quote = LogLine(
                request.redis_conn,
                request.mission.main_transcript,
                timestamp_to_seconds(error_info['classic_moment_quote'])
            )
        else:
            error_quote = None

        return {
            'log_lines': log_lines,
            'result': results,
            'q': q,
            'previous_page': previous_page,
            'next_page': next_page,
            'pages': pages,
            'debug': {
                'query': query,
            },
            'error': {
                'info': error_info,
                'quote': error_quote,
            }
        }