Example #1
	def get_ranklist(self, topic_id, run_id, session_id):
		res = self.get(topic_id=topic_id, run_id=run_id)
		frag = WholeFragmenter()
		analyzer = FancyAnalyzer()
		format = HtmlFormatter(tagname="b")

		ranklist = js.loads(res.ranklist)
		# To keep the ranking
		id_ranklist = dict([(ranklist[i], i) for i in range(len(ranklist))])
		docs = Document.objects.filter(doc_id__in=ranklist)
		bookmarks = Bookmark.objects.filter(topic_id=topic_id,
										session_id=session_id)
		# get the query for highlighting
		query = [q.text for q in analyzer(Topic.objects.get(topic_id=topic_id).topic_text)]
		# get the docids
		bookmarks = [b.doc_id.replace("_bookmark", "") for b in bookmarks]  # remove the "_bookmark" suffix (str.strip() would strip characters, not a suffix)
		bookmarks = set(bookmarks) # faster lookup
		ids = [(d.summary, d.doc_id) for d in docs]
		for (summary, doc_id) in ids[:10]:
			print summary, doc_id  # debug output for the top 10 results
		docs = [[id_ranklist[d.doc_id], 
			{
				'id':d.doc_id, 
				'title': '.' if d.title=='' else d.title, 
				'url': d.url if len(d.url)<=80 else d.url[0:80]+'...', 
				'summary':self.get_highlighted_summary(d.summary,query,analyzer,frag,format),
				'site': d.site.site_name,
				'category': d.site.category.split(","),
				'bookmarked': 1 if d.doc_id in bookmarks else 0
			}] for d in docs]
		docs.sort(key=operator.itemgetter(0))
		
		return docs 
Example #2
    def search(self):
        c.terms = request.GET.get('terms', '')
        c.results = []
        if len(c.terms) < 4:
            h.flash(
                _('Search queries must be at least 4 characters in length.'),
                'error'
            )
            redirect(url(controller='blog', action='index'))

        query = MultifieldParser(
            ['title', 'content', 'summary'],
            schema=index.schema
        ).parse(c.terms)
        results = index.searcher().search(query, limit=10)
        for result in results:
            terms = [v for k, v in query.all_terms() if k == 'content']
            url_kwargs = json.loads(result['url'])
            result['url'] = url(**url_kwargs)
            result['highlights'] = highlight(
                result['content'],
                terms,
                search.schema['content'].format.analyzer,
                ContextFragmenter(terms),
                HtmlFormatter(tagname='span', classname='highlight')
            )
            c.results.append(result)
        return render('search.tpl', slacks=True)
Example #3
def get_suggestion(q):
    ix = open_dir('index')

    with ix.searcher() as searcher:  # a searcher opens several files, so it is best used in a `with` block so they also get closed
        #   og = OrGroup.factory(0.9) # From the official documentation: higher priority when more terms match
        #    parser = MultifieldParser(['title', 'content'], ix.schema, {'title': 1.0, 'content': 0.2}, group=og) # Give priority to titles
        # #   parser.add_plugin(PlusMinusPlugin()) # Lets you mark terms that must or must not appear with + and -
        #   parser.add_plugin(FuzzyTermPlugin()) # Lets you search for inexact terms with the tilde (~)
        #parser.add_plugin(ForceFuzzyPlugin()) # A hack of mine, see below
        parser = QueryParser("nTitle", schema=ix.schema)
        query = parser.parse(q)
        #print(query) # For debugging

        results = searcher.search(query)

        results.formatter = HtmlFormatter(
            between=" &hellip; ")  # a matter of taste

        print(results, "schema: ", ix.schema)
        for r in results:
            titleHigh = r.highlights("title")
            print(titleHigh)
            title = titleHigh if titleHigh else r['title']  # if the term is not in the title, nothing would be printed without this fallback
Example #4
    def __init__(self,
                 index=None,
                 search_fields=["title", "content"],
                 html_formatter=None,
                 parser=None,
                 termclass=Term):
        """Clase para buscar por distintos fields

        :param: index
        :type: whoosh.index.Index - Instancia del objeto Index

        :param: search_fields - Lista de los campos donde se busca
        :type: list

        :param: html_formatter - Instancia que formatea los hits
        :type: whoosh.highlight.HtmlFormatter
        """

        self.index = index or open_dir(INDEX_DIR)
        self.html_formatter = html_formatter or HtmlFormatter(
            between="...",
            tagname="strong",
            classname="search-match",
            termclass="search-term")

        self.search_fields = search_fields
        self.termclass = termclass

        self.parser = parser or qparser.MultifieldParser(
            self.search_fields, self.index.schema, termclass=termclass)
Example #5
def searchPage(keyword, curPage=1, pagelen=10):
    with ix.searcher() as searcher:
        # res=dict()
        # parser = QueryParser('content', schema=ix.schema)
        hf = HtmlFormatter(tagname="code", classname="match", termclass="term")
        fragmenter = WholeFragmenter(charlimit=None)
        parser = MultifieldParser(["title", "content", 'createAt'],
                                  schema=ix.schema)
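        # the DateParserPlugin lets the query contain date expressions (here for the 'createAt' field)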
        parser.add_plugin(DateParserPlugin())
        q = parser.parse(keyword)
        page = searcher.search_page(q, curPage, pagelen)  #,terms=True
        page.results.fragmenter = fragmenter
        #page.results.fragmenter.charlimit=None
        page.results.formatter = hf
        # terms = page.results.matched_terms()
        # key=[ e for e in terms ][0][1].decode('UTF-8')
        resPage = dict(pagenum=page.pagenum,
                       pagecount=page.pagecount,
                       total=page.total,
                       posts=[])
        for hint in page:
            tmp = dict()
            tmp['title'] = hint.highlights("title", minscore=0)
            tmp['author'] = hint["author"]
            tmp['location'] = hint["location"].replace(os.sep,
                                                       '/').replace('//', '/')
            if tmp['location'].startswith('/'):
                tmp['location'] = tmp['location'][1:]
            tmp['summary'] = hint.highlights(
                "summary", minscore=0
            )  #hint["content"].replace(key,"<code>%s</code>" % key)

            resPage['posts'].append(tmp)
        return resPage
Example #6
def search():
    q = qp.parse(request.args.get('query'))
    max_docs = request.args.get('max_docs')
    limit = int(max_docs) if max_docs else 10  # int(None) would raise a TypeError, so check first
    if limit < 1:
        limit = 10

    results_list = []

    with ix.searcher() as searcher:
        results = searcher.search(q, limit=limit)
        results.formatter = HtmlFormatter(tagname="span")
        n_results = len(results)
        for hit in results:
            result = dict()
            result["filename"] = hit["title"]

            with open(os.path.join(path, "data", hit["filename"]),
                      encoding="utf-8") as file:
                filecontents = file.read()
            result["text"] = hit.highlights(
                "content",
                text=filecontents,
            )
            result["path"] = "static/" + hit["filename"]
            results_list.append(result)

    response = make_response(
        json.dumps({
            'results': results_list,
            'n_results': n_results
        }))
    response.headers["Access-Control-Allow-Origin"] = "*"
    response.headers['Content-Type'] = 'text/json'
    return response
Example #7
    def __getitem__(self, item):
        if item.start in self.saved_results:
            return self.saved_results[item.start]

        q, mask = self._query
        sortedby, reverse = self._query_sortedby

        page: ResultsPage = self.searcher.search_page(
            q,
            mask=mask,
            filter=self._query_filter,
            pagenum=math.floor(item.start / self.page_size) + 1,
            pagelen=self.page_size,
            sortedby=sortedby,
            reverse=reverse)
        page.results.fragmenter = highlight.ContextFragmenter(surround=50)
        page.results.formatter = HtmlFormatter(tagname="span", between=" ... ")

        if not self.first_score and len(page.results) > 0:
            self.first_score = page.results[0].score

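        # Normalize the cached scores relative to the best hit so that results
        # from later pages stay on a comparable scale.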
        if self.first_score:
            page.results.top_n = list(
                map(lambda hit: (hit[0] / self.first_score, hit[1]),
                    page.results.top_n))

        self.saved_results[item.start] = page

        return page
Example #8
    def get_object(self):
        indexer = Indexer('file')
        query = self.request.resolver_match.kwargs['query']
        docs = indexer.get_doc(url=self.request.resolver_match.kwargs['url'])
        if not len(docs):
            return {}
        query_list = query.split(' ')
        excerpts = highlight(docs[0]['body'], set(query_list),
                             StandardAnalyzer(), WholeFragmenter(),
                             HtmlFormatter())
        return {'body': excerpts, 'title': docs[0]['title']}
Example #9
def query_index(_query, no_results=20, htmlFormat=False, dir=''):
    ix = open_dir(dir)

    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse('"|| '+_query+'"')
        results = searcher.search(query, limit=no_results)

        if htmlFormat:
            results.formatter = HtmlFormatter()
            return map(lambda y: {'index': y['index'], 'text': y.highlights('content')}
                       , sorted(results, key=lambda k: k['index']))
        else:
            return map(lambda y: {'index': y['index'], 'text': y['content']}
                       , sorted(results, key=lambda k: k['index']))
Example #10
    def resultExcerpt(self, results, i, ki=None):
        # FIXME: this should not be implementation specific
        if not ki:
            r = results[i]
            name = r['kitab']
            v = r['vrr'].split('-')[0]
            m = self.th.getMeta().getLatestKitabV(name, v)
            ki = self.th.getCachedKitab(m['uri'])
        num = int(results[i]['nodeIdNum'])
        node = ki.getNodeByIdNum(num)
        n = ki.toc.next(node)

        if n:
            ub = n.globalOrder
        else:
            ub = -1
        txt = node.toText(ub)

        s = set()
        #results.query.all_terms(s) # return (field,term) pairs
        # return (field,term) pairs    # self.self.__ix_searcher.reader()
        s = results.q.existing_terms(self.indexer.reader(), phrases=True)
        #s = set([i.decode('utf_8') for i in s])
        terms = dict(
            map(lambda i: (i[1], i[0]),
                filter(lambda j: j[0] == 'content' or j[0] == 'title',
                       s))).keys()
        #print "txt = [%s]" % len(txt)
        terms = [i.decode('utf_8') for i in terms]
        snippet_dummy = txt[:min(len(txt), 512)]  # dummy summary
        snippet = highlight(txt,
                            terms,
                            analyzer,
                            SentenceFragmenter(sentencechars=".!?؟\n"),
                            HtmlFormatter(between=u"\u2026\n"),
                            top=3,
                            scorer=BasicFragmentScorer,
                            minscore=1,
                            order=FIRST)
        #snippet = highlight(txt, terms, analyzer,
        #     SentenceFragmenter(sentencechars = ".!?"), ExcerptFormatter(between = u"\u2026\n"), top = 3,
        #     scorer = BasicFragmentScorer, minscore = 1,
        #     order = FIRST)
        print snippet
        if len(snippet) > 1: return snippet
        else: return snippet_dummy
Example #11
    def sear_paper(self):
        if not self.key_words or len(self.key_words)==0:
            return '0'
        q = [Term("textdata", k) for k in self.key_words]
        index = open_dir(self.path_index)  # open_dir('paper-index')
        searcher = index.searcher()
        results = searcher.search(Or(q))
        results.fragmenter.maxchars = 30000
        results.fragmenter.surround = 150
        print('Number of hits:', len(results))
        hf = HtmlFormatter(tagname="span",classname="match", termclass="term")
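        # other examples set results.formatter = hf directly; _set_formatter is a private helper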
        results._set_formatter(hf)

        hl_results = [hit.highlights("textdata") for hit in results]
        #for hit in results:
        #    print(hit.highlights("textdata"))
        return hl_results
Example #12
def search_query_page(ix, query_string, index_name, page=0, limits=None):
    result = {"result": [], "totalcount": 0}
    try:
        query_string = query_string
        LOG.debug("Query_string: %s", query_string)
        hf = HtmlFormatter(tagname="em", classname="match", termclass="term")
        results = yield search_index_page(ix, query_string, index_name, page,
                                          limits)
        results.results.formatter = hf
        results.results.fragmenter.charlimit = 100 * 1024
        results.results.fragmenter.maxchars = 20
        # results.results.fragmenter.surround = 5
        results_len = 0
        if results.results.has_exact_length():
            results_len = len(results)
        LOG.debug("Have %s results:", results_len)
        results_len = len(results)
        result["totalcount"] = results_len
        LOG.debug("Have %s results:", results_len)
        results_num = 0
        for hit in results:
            item = ThumbnailItem()
            results_num += 1
            LOG.debug("Result: %s", results_num)
            fields = hit.fields()
            LOG.debug("Doc_id: %s", fields["doc_id"])
            html = sqlite.get_html_by_id(fields["doc_id"], conn=DB.conn_html)
            title = hit.highlights("file_name", text=html.file_name[0:-5])
            item.title = title if title.strip() != "" else html.file_name[0:-5]
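            # note: the next line overwrites the highlighted title with the raw file name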
            item.title = html.file_name
            item.excerpts = hit.highlights("file_content",
                                           top=5,
                                           text=html.file_content)
            item.url = "/view/html/%s" % html.sha1
            item.date_time = html.updated_at
            item.description = html.updated_at[0:19]
            result["result"].append(item)
            yield gen.moment
    except Exception, e:
        LOG.exception(e)
        result = False
Example #13
    def __init__(self, whoosh_index_dir='', stopwords_file='', cache_host='localhost', cache_port=6379, **kwargs):
        Engine.__init__(self, **kwargs)

        self.whoosh_index_dir = whoosh_index_dir
        if not self.whoosh_index_dir:
            raise EngineConnectionException(self.name, "'whoosh_index_dir=' keyword argument not specified")

        self.stopwords_file = stopwords_file
        if self.stopwords_file:
            self.stopwords = ListReader(self.stopwords_file)  # Open the stopwords file, read into a ListReader
        else:
            raise EngineConnectionException(self.name, "'stopwords_file=' keyword argument not specified")

        self.scoring_model_identifier = 1
        self.scoring_model = scoring.PL2(c=10.0)
        
        self.__verbose = False

        try:
            self.doc_index = open_dir(self.whoosh_index_dir)
            self.reader = self.doc_index.reader()
            self.parser = QueryParser('content', self.doc_index.schema)  # By default, we use AND grouping.
                                                                         # Use the grouping parameter and specify whoosh.qparser.OrGroup, etc...

            #  Objects required for document snippet generation
            self.analyzer = self.doc_index.schema[self.parser.fieldname].analyzer
            self.fragmenter = ContextFragmenter(maxchars=200, surround=40)
            self.formatter = HtmlFormatter()
        except EmptyIndexError:
            message = "Could not open Whoosh index at '{0}'".format(self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)
        except OSError:
            message = "Could not open Whoosh index at '{0}' - directory does not exist".format(self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)

        # Attempt to connect to the specified Redis cache.
        self.cache = RedisConn(host=cache_host, port=cache_port)
        self.cache.connect()
Example #14
		if n > max_limit:
			n = max_limit
		
		ranked_results = sorted(doc_scores.iteritems(), key=operator.itemgetter(1), reverse=True)
		
		print "{0}  {1}  {2}".format(query_num, len(ranked_results), n)
		
		if n > 0:
			for rank in range(n):
				trec_docid = reader.stored_fields(ranked_results[rank][0])['docid']
				score_formatted = "{0:.6f}".format(ranked_results[rank][1])
				output_file.write("{0} Q0 {1} {2} {3} Exp{4}".format(query_num, trec_docid, (rank + 1), score_formatted, os.linesep))
				
				content = reader.stored_fields(ranked_results[rank][0])['content']
				
				if isinstance(whoosh_query, str):
					termz = [unicode(whoosh_query)]
				else:
					termz = [text for fieldname, text in whoosh_query.all_terms() if fieldname == 'content']
				
				from whoosh.highlight import highlight
				
				analyzer = ix.schema['content'].analyzer
				fragmenter = ContextFragmenter()
				formatter = HtmlFormatter()
				
				#print highlight(content, termz, analyzer, fragmenter, formatter)

ix.close()
input_file.close()
output_file.close()
Example #15
    def search(self,
               collector,
               query_str1=None,
               query_str2=None,
               itemtypes=(),
               highlight=False):

        # rejects '*' and '?'
        if query_str1:
            for kw in (s.strip() for s in query_str1.split()):
                if not kw.replace("*", "").replace("?", "").strip():
                    return []

        wildcard = (query_str1 and any(c in query_str1 for c in "*?"))

        parser = self._parser_wild if wildcard else self._parser
        asf_parser = self._asf_parser

        with self._index.searcher() as searcher:
            andlist = []
            try:
                if query_str1:
                    andlist.append(parser.parse(query_str1))
                if query_str2:
                    andlist.append(asf_parser.parse(query_str2))
            except:
                return []

            if itemtypes:
                if len(itemtypes) > 1:
                    andlist.append(Or([Term('itemtype', t)
                                       for t in itemtypes]))
                else:
                    andlist.append(Term('itemtype', itemtypes[0]))

            query = And(andlist)

            searcher.search_with_collector(query, collector)
            hits = collector.results()

            if highlight:
                hits.fragmenter = WholeFragmenter()
                hits.formatter = HtmlFormatter(tagname='span',
                                               classname='s_match',
                                               termclass='s_term')

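            # build a regex from the wildcard query so hits can be post-filtered by their sort key below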
            if wildcard and query_str1:
                pat = query_str1.replace("-", "").replace(" ", "")
                wildmatch = re.compile(fnmatch.translate(pat))

            # Construct a result list
            results = []
            for hit in hits:
                if collector.aborted:
                    return []
                (label, path, prio, sortkey) = hit['data']

                if wildcard and query_str1:
                    if not wildmatch.match(sortkey):
                        continue

                if highlight:
                    if query_str1:
                        text = hit.highlights('content')
                    else:
                        text = hit['content']
                else:
                    text = None

                results.append((label, path, sortkey, prio, text))

            sortkey_prio_getter = itemgetter(2, 3)
            results.sort(key=sortkey_prio_getter)

            # Return
            return results
Example #16
def run_query(query, index):
    """
      Queries the index for data with the given text query

        @param  query   The text query to perform on the indexed data
        @return			A list of HTMl string snippets to return
    """

    # Create a searcher object for this index
    searcher = index.searcher()

    # Create a query parser that will parse multiple fields of the documents
    field_boosts = {'content': 1.0, 'title': 3.0}
    query_parser = MultifieldParser(['content', 'title'],
                                    schema=index_schema,
                                    fieldboosts=field_boosts,
                                    group=OrGroup)

    # Build a query object from the query string
    query_object = query_parser.parse(query)

    # Build a spell checker in this index and add the "content" field to the spell checker
    spell_checker = SpellChecker(index.storage)
    spell_checker.add_field(index, 'content')
    spell_checker.add_field(index, 'title')

    # Extract the 'terms' that were found in the query string. This data can be used for highlighting the results
    search_terms = [text for fieldname, text in query_object.all_terms()]

    # Remove terms that are too short (build a new list; removing items from a
    # list while iterating over it skips elements)
    search_terms = [term for term in search_terms if len(term) > 3]

    # Perform the query itself
    search_results = searcher.search(query_object)

    # Get an analyzer for analyzing the content of each page for highlighting
    analyzer = index_schema['content'].format.analyzer

    # Build the fragmenter object, which will automatically split up excerpts. This fragmenter will split up excerpts
    #   by 'context' in the content
    fragmenter = ContextFragmenter(frozenset(search_terms))

    # Build the formatter, which will dictate how to highlight the excerpts. In this case, we want to use HTML to
    #   highlight the results
    formatter = HtmlFormatter()

    # Iterate through the search results, highlighting and counting the results
    result_count = 0
    results = []
    for search_result in search_results:
        # Collect this search result
        results.append({
            'content':
            highlight(search_result['content'], search_terms, analyzer,
                      fragmenter, formatter),
            'url':
            search_result['url'],
            'title':
            search_result['title']
        })
        result_count += 1

    # Build a list of 'suggest' words using the spell checker
    suggestions = []
    for term in search_terms:
        suggestions.append(spell_checker.suggest(term))

    # Return the list of web pages along with the terms used in the search
    return results, search_terms, suggestions, result_count
Example #17
ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()

#INDEX SCHEMA DEFINITION
SCHEMA = Schema(fileid=ID(unique=True),
                owner=TEXT(),
                repository=TEXT(stored=True),
                path=TEXT(stored=True),
                content=FieldType(format=Characters(),
                                  analyzer=ANALYZER,
                                  scorable=True,
                                  stored=True),
                modtime=STORED(),
                extension=TEXT(stored=True))

IDX_NAME = 'HG_INDEX'
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
FRAGMENTER = ContextFragmenter(200)

CHGSETS_SCHEMA = Schema(
    raw_id=ID(unique=True, stored=True),
    date=NUMERIC(stored=True),
    last=BOOLEAN(),
    owner=TEXT(),
    repository=ID(unique=True, stored=True),
    author=TEXT(stored=True),
    message=FieldType(format=Characters(),
                      analyzer=ANALYZER,
                      scorable=True,
                      stored=True),
    parents=TEXT(),
    added=TEXT(),
Example #18
    def highlight(self, text, words):
        fragmenter = ContextFragmenter()
        formatter = HtmlFormatter()
        analyzer = self.project_schema['text'].analyzer
        return highlight(text, words, analyzer, fragmenter, formatter, top=1)
Example #19
def make_html_formatter():
    # the html formatter cannot be shared between searches easily, thus
    # we create it in a factory.
    return HtmlFormatter(tagname='strong',
                         between=u' <span class="elipsis">…</span> ')
Example #20
    def __init__(self,
                 whoosh_index_dir='',
                 use_cache=True,
                 cache_host='localhost',
                 cache_port=6379,
                 **kwargs):
        """
        Constructor for the engine.
        """
        Engine.__init__(self, **kwargs)

        self.whoosh_index_dir = whoosh_index_dir
        if not self.whoosh_index_dir:
            raise EngineConnectionException(
                self.name,
                "'whoosh_index_dir=' keyword argument not specified")

        #  Only put PL2 in for now (for more, add the model parameter to the constructor to specify!)
        self.scoring_model_identifier = 1
        self.scoring_model = scoring.PL2(c=10.0)

        try:
            self.doc_index = open_dir(self.whoosh_index_dir)
            self.reader = self.doc_index.reader()
            self.parser = QueryParser(
                'content',
                self.doc_index.schema)  # By default, we use AND grouping.
            # Use the grouping parameter and specify whoosh.qparser.OrGroup, etc...

            #  Objects required for document snippet generation
            self.analyzer = self.doc_index.schema[
                self.parser.fieldname].analyzer
            self.fragmenter = ContextFragmenter(maxchars=200, surround=40)
            self.formatter = HtmlFormatter()
        except EmptyIndexError:
            message = "Could not open Whoosh index at '{0}'".format(
                self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)
        except OSError:
            message = "Could not open Whoosh index at '{0}' - directory does not exist".format(
                self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)

        self.use_cache = use_cache
        if self.use_cache:
            self.cache = RedisConn(host=cache_host, port=cache_port)
            self.cache.connect()

            self.page_cache_forward_look = 40  # How many additional pages to cache when required.
            self.page_cache_when = 4  # When the user is x pages away from the end of the page cache, cache more pages.

            self.page_cache_controller = PageCacheController(
                cache_host=self.cache.host,
                cache_port=self.cache.port,
                whoosh_index=self.doc_index,
                scoring_model_identifier=self.scoring_model_identifier,
                parser=self.parser,
                analyzer=self.analyzer,
                fragmenter=self.fragmenter,
                formatter=self.formatter,
                cache_forward_look=self.page_cache_forward_look)