def __init__(self):
    """
    Set initial parameters.
    """
    self.cfg = Config()

    # Load xapian indexes
    # self.axi_programs = xapian.Database(cfg.axi_programs)
    self.axi_desktopapps = xapian.Database(self.cfg.axi_desktopapps)

    if self.cfg.popcon:
        # self.popcon_programs = xapian.Database(cfg.popcon_programs)
        self.popcon_desktopapps = xapian.Database(
            self.cfg.popcon_desktopapps)

    # Load valid programs, desktopapps and tags
    # format: one package or tag name per line
    # self.valid_programs = []
    self.valid_desktopapps = []
    self.valid_tags = []
    logging.info("Loading recommender filters")

    # with open(os.path.join(cfg.filters_dir, "programs")) as pkgs:
    #     self.valid_programs = [line.strip() for line in pkgs
    #                            if not line.startswith("#")]
    with open(os.path.join(self.cfg.filters_dir, "desktopapps")) as pkgs:
        self.valid_desktopapps = [line.strip() for line in pkgs
                                  if not line.startswith("#")]
    with open(os.path.join(self.cfg.filters_dir, "debtags")) as tags:
        self.valid_tags = [line.strip() for line in tags
                           if not line.startswith("#")]

    # Set xapian index weighting scheme
    if self.cfg.weight == "bm25":
        self.weight = xapian.BM25Weight(self.cfg.bm25_k1,
                                        self.cfg.bm25_k2,
                                        self.cfg.bm25_k3,
                                        self.cfg.bm25_b,
                                        self.cfg.bm25_nl)
    else:
        self.weight = xapian.TradWeight()

    self.set_strategy(self.cfg.strategy)
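# Usage sketch (not part of the original class): it assumes the weighting
# scheme selected in __init__ is later handed to xapian.Enquire when the
# recommender queries the desktop-apps index. The method name `_example_query`
# and the mset size of 10 are illustrative only.
def _example_query(self, querystring):
    parser = xapian.QueryParser()
    parser.set_database(self.axi_desktopapps)
    enquire = xapian.Enquire(self.axi_desktopapps)
    enquire.set_weighting_scheme(self.weight)  # BM25Weight or TradWeight from __init__
    enquire.set_query(parser.parse_query(querystring))
    return enquire.get_mset(0, 10)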
def select_weight(option):
    if option == 0:
        bm = xapian.BB2Weight(1.0)
    elif option == 1:
        bm = xapian.BM25PlusWeight(1.0, 0, 1.0, 0.5, 0.5, 1.0)
    elif option == 2:
        bm = xapian.BM25Weight(1.0, 0.0, 1.0, 0.5, 0.3)
    elif option == 3:
        bm = xapian.BoolWeight()
    elif option == 4:
        bm = xapian.CoordWeight()
    elif option == 5:
        bm = xapian.DLHWeight()  # maybe some problem
    elif option == 6:
        bm = xapian.DPHWeight()
    elif option == 7:
        bm = xapian.IfB2Weight(1)
    elif option == 8:
        bm = xapian.IneB2Weight(1)
    elif option == 9:
        bm = xapian.InL2Weight(1)
    elif option == 10:
        # the second parameter selects TWO_STAGE_SMOOTHING
        bm = xapian.LMWeight(0.0, 1, -1.0, -1.0)
    elif option == 11:
        bm = xapian.PL2PlusWeight(1, 0.8)
    elif option == 12:
        bm = xapian.PL2Weight(1)
    elif option == 13:
        bm = xapian.TfIdfWeight("ntn")
    elif option == 14:
        bm = xapian.TradWeight(1.0)
    else:
        # Guard against unknown options: without this branch, `bm` would be
        # unbound and the return below would raise UnboundLocalError.
        raise ValueError("unknown weighting scheme option: %r" % option)
    return bm
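# Usage sketch for select_weight (assumptions: `db` is an open xapian.Database
# and `querystring` is the user's search text; neither appears in the snippet
# above). Whichever scheme the option maps to is plugged into the Enquire
# session exactly as a BM25Weight instance would be.
def rank_with_scheme(db, querystring, option):
    parser = xapian.QueryParser()
    parser.set_database(db)
    enquire = xapian.Enquire(db)
    enquire.set_weighting_scheme(select_weight(option))
    enquire.set_query(parser.parse_query(querystring))
    return enquire.get_mset(0, 20)  # top 20 matches under the chosen scheme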
def search(self, query, sort_by=None, start_offset=0, end_offset=None,
           fields='', highlight=False, facets=None, date_facets=None,
           query_facets=None, narrow_queries=None, spelling_query=None,
           limit_to_registered_models=True, result_class=None, **kwargs):
    """
    Executes the Xapian::query as defined in `query`.

    Required arguments:
        `query` -- Search query to execute

    Optional arguments:
        `sort_by` -- Sort results by specified field (default = None)
        `start_offset` -- Slice results from `start_offset` (default = 0)
        `end_offset` -- Slice results at `end_offset` (default = None),
                        if None, then all documents
        `fields` -- Filter results on `fields` (default = '')
        `highlight` -- Highlight terms in results (default = False)
        `facets` -- Facet results on fields (default = None)
        `date_facets` -- Facet results on date ranges (default = None)
        `query_facets` -- Facet results on queries (default = None)
        `narrow_queries` -- Narrow queries (default = None)
        `spelling_query` -- An optional query to execute spelling suggestion on
        `limit_to_registered_models` -- Limit returned results to models
                                        registered in the current `SearchSite`
                                        (default = True)

    Returns:
        A dictionary with the following keys:
            `results` -- A list of `SearchResult`
            `hits` -- The total available results
            `facets` -- A dictionary of facets with the following keys:
                `fields` -- A list of field facets
                `dates` -- A list of date facets
                `queries` -- A list of query facets
        If faceting was not used, the `facets` key will not be present

    If `query` is None, returns no results.

    If `INCLUDE_SPELLING` was enabled in the connection options, the
    extra flag `FLAG_SPELLING_CORRECTION` will be passed to the query parser
    and any suggestions for spell correction will be returned as well as
    the results.
    """
    if xapian.Query.empty(query):
        return {
            'results': [],
            'hits': 0,
        }

    database = self._database()

    if result_class is None:
        result_class = SearchResult

    if self.include_spelling is True:
        spelling_suggestion = self._do_spelling_suggestion(database,
                                                           query,
                                                           spelling_query)
    else:
        spelling_suggestion = ''

    if narrow_queries is not None:
        query = xapian.Query(
            xapian.Query.OP_AND, query,
            xapian.Query(xapian.Query.OP_AND, [
                self.parse_query(narrow_query)
                for narrow_query in narrow_queries
            ]))

    if limit_to_registered_models:
        registered_models = self.build_models_list()

        if len(registered_models) > 0:
            query = xapian.Query(
                xapian.Query.OP_AND, query,
                xapian.Query(xapian.Query.OP_OR, [
                    xapian.Query('%s%s' % (DOCUMENT_CT_TERM_PREFIX, model))
                    for model in registered_models
                ]))

    enquire = xapian.Enquire(database)
    if hasattr(settings, 'HAYSTACK_XAPIAN_WEIGHTING_SCHEME'):
        enquire.set_weighting_scheme(
            xapian.BM25Weight(*settings.HAYSTACK_XAPIAN_WEIGHTING_SCHEME))
    enquire.set_query(query)

    if sort_by:
        sorter = xapian.MultiValueSorter()

        for sort_field in sort_by:
            if sort_field.startswith('-'):
                reverse = True
                sort_field = sort_field[1:]  # Strip the '-'
            else:
                reverse = False
            # Reverse is inverted in Xapian -- http://trac.xapian.org/ticket/311
            sorter.add(self._value_column(sort_field), reverse)

        enquire.set_sort_by_key_then_relevance(sorter, True)

    results = []
    facets_dict = {
        'fields': {},
        'dates': {},
        'queries': {},
    }

    if not end_offset:
        end_offset = database.get_doccount() - start_offset

    matches = self._get_enquire_mset(database, enquire,
                                     start_offset, end_offset)

    for match in matches:
        app_label, module_name, pk, model_data = pickle.loads(
            self._get_document_data(database, match.document))
        if highlight:
            model_data['highlighted'] = {
                self.content_field_name: self._do_highlight(
                    model_data.get(self.content_field_name), query)
            }
        results.append(
            result_class(app_label, module_name, pk,
                         match.percent, **model_data))

    if facets:
        facets_dict['fields'] = self._do_field_facets(results, facets)
    if date_facets:
        facets_dict['dates'] = self._do_date_facets(results, date_facets)
    if query_facets:
        facets_dict['queries'] = self._do_query_facets(results, query_facets)

    return {
        'results': results,
        'hits': self._get_hit_count(database, enquire),
        'facets': facets_dict,
        'spelling_suggestion': spelling_suggestion,
    }
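# Configuration sketch: the backend above unpacks the Django setting
# HAYSTACK_XAPIAN_WEIGHTING_SCHEME straight into xapian.BM25Weight, so the
# value should be a 5-tuple of (k1, k2, k3, b, min_normlen). The numbers
# below are illustrative, not recommended defaults.
HAYSTACK_XAPIAN_WEIGHTING_SCHEME = (1.2, 0, 1, 0.75, 0.5)  # in settings.py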
def get_context_data(self):
    # Get the query text
    q = self.request.GET.get('q', '')
    # Get the offset value
    try:
        offset = int(self.request.GET.get('offset', '0'))
        if offset < 0:
            offset = 0
    except ValueError:
        offset = 0
    # Is it a special search?
    special_value = self.request.redis_conn.get("special_search:%s:%s" % (
        self.request.mission.name,
        q,
    ))
    if special_value:
        self.template_name = "search/special.html"
        return {
            "q": q,
            "text": special_value,
        }
    # Get the results from Xapian
    db = xappy.SearchConnection(
        os.path.join(
            settings.SITE_ROOT,
            '..',
            "xappydb",
        ),
    )
    db.set_weighting_scheme(
        xapian.BM25Weight(
            1,    # k1
            0,    # k2
            1,    # k3
            0.5,  # b
            2,    # min_normlen
        )
    )
    query = db.query_parse(
        q,
        default_op=db.OP_OR,
        deny=["mission"],
    )
    query = db.query_filter(
        query,
        db.query_composite(db.OP_AND, [
            db.query_field("mission", self.request.mission.name),
            db.query_field("transcript", self.request.mission.main_transcript),
        ])
    )
    results = db.search(
        query=query,
        startrank=offset,
        endrank=offset + PAGESIZE,
        checkatleast=-1,  # everything (entire xapian db fits in memory, so this should be fine)
        sortby="-weight",
    )
    # Go through the results, building a list of LogLine objects
    redis_conn = self.request.redis_conn
    log_lines = []
    for result in results:
        transcript_name, timestamp = result.id.split(":", 1)
        log_line = LogLine(redis_conn, transcript_name, int(timestamp))
        log_line.speaker = Character(redis_conn, transcript_name.split('/')[0],
                                     result.data['speaker_identifier'][0])
        log_line.title = mark_safe(result.summarise("text", maxlen=50,
                                                    ellipsis='…',
                                                    strict_length=True,
                                                    hl=None))
        log_line.summary = mark_safe(result.summarise("text", maxlen=600,
                                                      ellipsis='…',
                                                      hl=('<mark>', '</mark>')))
        log_lines.append(log_line)

    def page_url(offset):
        return reverse("search") + '?' + urllib.urlencode({
            'q': q.encode('utf-8'),
            'offset': offset,
        })

    if offset == 0:
        previous_page = False
    else:
        previous_page = page_url(offset - PAGESIZE)
    if offset + PAGESIZE > results.matches_estimated:
        next_page = False
    else:
        next_page = page_url(offset + PAGESIZE)
    thispage = offset / PAGESIZE
    maxpage = results.matches_estimated / PAGESIZE
    pages_to_show = set([0]) | set([thispage - 1, thispage, thispage + 1]) | set([maxpage])
    if 0 == thispage:
        pages_to_show.remove(thispage - 1)
    if maxpage == thispage:
        pages_to_show.remove(thispage + 1)
    pages = []

    class Page(object):
        def __init__(self, number, url, selected=False):
            self.number = number
            self.url = url
            self.selected = selected

    pages_in_order = list(pages_to_show)
    pages_in_order.sort()
    for page in pages_in_order:
        if len(pages) > 0 and page != pages[-1].number:
            pages.append('...')
        pages.append(Page(page + 1, page_url(page * PAGESIZE), page == thispage))
    error_info = self.request.redis_conn.hgetall(
        "error_page:%s:%s" % (
            self.request.mission.name,
            'no_search_results',
        ),
    )
    if not error_info:
        error_info = {}
    if 'classic_moment_quote' in error_info:
        error_quote = LogLine(
            self.request.redis_conn,
            self.request.mission.main_transcript,
            timestamp_to_seconds(error_info['classic_moment_quote'])
        )
    else:
        error_quote = None
    return {
        'log_lines': log_lines,
        'result': results,
        'q': q,
        'previous_page': previous_page,
        'next_page': next_page,
        'pages': pages,
        'debug': {
            'query': query,
        },
        'error': {
            'info': error_info,
            'quote': error_quote,
        },
    }
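# Side-by-side sketch (assumption: the xappy index directory is an ordinary
# Xapian database, so it can also be opened with xapian.Database directly).
# This only shows that the five numbers passed above are the usual
# xapian.BM25Weight arguments; the Spacelog view itself always goes through
# xappy's SearchConnection.
def raw_xapian_search(index_path, querystring):
    db = xapian.Database(index_path)
    parser = xapian.QueryParser()
    parser.set_database(db)
    enquire = xapian.Enquire(db)
    enquire.set_weighting_scheme(xapian.BM25Weight(1, 0, 1, 0.5, 2))  # same k1, k2, k3, b, min_normlen
    enquire.set_query(parser.parse_query(querystring))
    return enquire.get_mset(0, PAGESIZE)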