Example No. 1
def searchByRank(entry):
    subtop1 = Tkinter.Tk()
    scrollbarY = Tkinter.Scrollbar(subtop1)
    scrollbarX = Tkinter.Scrollbar(subtop1, orient=Tkinter.HORIZONTAL)
    scrollbarY.pack(side=Tkinter.RIGHT, fill=Tkinter.Y)
    scrollbarX.pack(side=Tkinter.BOTTOM, fill=Tkinter.X)
    LB1 = Tkinter.Listbox(subtop1, width=80, height=12, yscrollcommand=scrollbarY.set, xscrollcommand=scrollbarX.set)
    scrollbarY.config(command=LB1.yview)
    scrollbarX.config(command=LB1.xview)
    fechas = entry.split("-")
    flinferior = fechas[0]
    flisuperior = fechas[1]
    qp = QueryParser('date', schema=ix.schema)
    # Whoosh's default parser only recognizes an uppercase TO in range queries.
    q = qp.parse(u"date:[" + flinferior + " TO " + flisuperior + "]")
    with ix.searcher() as s:
        results = s.search(q)
        i = 1
        pos = 1
        for result in results:
            LB1.insert(pos, "Correo " + str(i))
            pos += 1
            LB1.insert(pos, "Remitentes: " + result['mailFrom'])
            pos += 1
            LB1.insert(pos, "Destinatarios: " + result['mailTo'])
            pos += 1
            LB1.insert(pos, "Subject: " + result['subject'])
            pos += 1
            LB1.insert(pos, "\n")
            pos += 1
            i += 1
        LB1.insert(pos, "En este rango de fechas se han enviado " + str(i - 1) + " correos.")
    LB1.pack()
    subtop1.mainloop()
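Note: the range fix above matters because Whoosh's query language only treats an uppercase TO as the range keyword. A minimal, self-contained sketch (the schema and values here are illustrative, not taken from the example's index):

# Minimal sketch: Whoosh range queries need an uppercase TO keyword.
from whoosh import fields
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import QueryParser

schema = fields.Schema(date=fields.ID(stored=True))
ix = RamStorage().create_index(schema)
w = ix.writer()
w.add_document(date=u"20101015")
w.commit()

qp = QueryParser("date", schema=ix.schema)
q = qp.parse(u"date:[20101001 TO 20101031]")  # a lowercase "to" would be parsed as a term
with ix.searcher() as s:
    print(len(s.search(q)))  # 1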
Example No. 2
	def get(self):
	
		wikiResults = None
		jobResults = None
		projectResults = None	
	
		if 'searchScope' in request.args and 'searchTerm' in request.args:	
			
			searchTerm = request.args.get('searchTerm')	
			searchScope = request.args.get('searchScope')	
			index = open_dir('app/search/index')
			parser = QueryParser("content", schema=index.schema)
				
			with index.searcher() as searcher:
			
				if searchScope in ['everything', 'wiki']:
					wikiResults = [{'title':result['title'], 'url':'http://jhcwiki.jhc.co.uk/wiki/index.php/' + result['title'].replace(' ', '_')} for result in searcher.search(parser.parse(searchTerm), limit=200) if result['type'] == 'WIKI']
				
				if searchScope in ['everything', 'jobs']:
					jobResults = [{'title':result['title'], 'url':''} for result in searcher.search(parser.parse(searchTerm), limit=200) if result['type'] == 'JOB']

				if searchScope in ['everything', 'projects']:	
					projectResults = [{'title':result['title'], 'url':url_for('projects.projectDetail', projectCode = result['title'].split('-')[0].strip())} for result in searcher.search(parser.parse(searchTerm), limit=200) if result['type'] == 'PROJECT']
		else:
			searchTerm = ''	
			searchScope = 'everything'
			
		return render_template('search/search.html', wikiResults=wikiResults, jobResults=jobResults, projectResults=projectResults, searchTerm=searchTerm, searchScope=searchScope, title="Search")
Example No. 3
    def __call__(self, query):
        """search"""
        query = unicode(query)
        query_parser = QueryParser("description", schema=self.ix.schema)
        myquery = query_parser.parse(query)

        # Old code: too strict
        #        extendedquery = Or([myquery] +
        #                           [Term(field, query) for field in self.keywords])

        # New code: too permissive
        #        extendedquery = [myquery]
        excluded = set(["AND", "OR", "NOT"])
        terms = [i for i in query.split() if i not in excluded]
        #        for field in self.keywords:
        #            extendedquery.extend([Term(field, term) for term in terms])
        #        extendedquery = Or(extendedquery)

        # Current approach: AND the terms together, requiring every term to
        # match in at least one of the searched fields.
        extendedquery = And(
            [
                Or(
                    [myquery]
                    + [Term("description", term), Term("name", term)]
                    + [Term(field, term) for field in self.keywords]
                )
                for term in terms
            ]
        )

        # perform the search; the context manager closes the searcher for us
        with self.ix.searcher() as searcher:
            return [i["name"] for i in searcher.search(extendedquery, limit=None)]
Example No. 4
    def numeroDeCorreos2(var):
        with ix.searcher() as searcher:
            # Resolve the person's name to an email address via the second index.
            parser = QueryParser("name", ix2.schema)
            qp = parser.parse(unicode(var))

            with ix2.searcher() as s:
                results = s.search(qp)
                mail = results[0]["mail"]
            query = QueryParser("mailTo", ix.schema).parse(mail)
            results = searcher.search(query)
            panel = Tkinter.Toplevel()
            scrollbar = Tkinter.Scrollbar(panel)
            scrollbar.pack(side=Tkinter.RIGHT, fill=Tkinter.Y)
            listado = Tkinter.Text(panel, width=150, height=30, yscrollcommand=scrollbar.set)
            i = 1
            for result in results:
                listado.insert(Tkinter.INSERT, "Mail from: " + result["mailFrom"] + "\n")
                listado.insert(Tkinter.INSERT, "Mail to: " + result["mailTo"] + "\n")
                listado.insert(Tkinter.INSERT, "Subject: " + result["subject"] + "\n")
                date = result["date"]
                listado.insert(Tkinter.INSERT, "Date: " + date[:4] + "-" + date[4:6] + "-" + date[6:] + "\n")
                listado.insert(Tkinter.INSERT, "Content: \n")
                content = re.findall("'([^']*)'", result["content"])
                i += 1
                # Trim the two trailing characters from every line except the last;
                # comparing indices avoids the identity trap of "line is not last".
                for idx, line in enumerate(content):
                    if idx != len(content) - 1:
                        line = line[:-2]
                    listado.insert(Tkinter.INSERT, line + "\n")
                listado.insert(Tkinter.INSERT, "\n")
            listado.insert(Tkinter.INSERT, "Este remitente ha enviado " + str(i - 1) + " correos.")
            scrollbar.config(command=listado.yview)
            listado.pack()
Example No. 5
def test_wildcard_existing_terms():
    s = fields.Schema(key=fields.ID, value=fields.TEXT)
    ix = RamStorage().create_index(s)

    w = ix.writer()
    w.add_document(key=u("a"), value=u("alfa bravo bear charlie delta"))
    w.add_document(key=u("a"), value=u("boggle echo render rendering renders"))
    w.commit()
    r = ix.reader()
    qp = QueryParser("value", ix.schema)

    def words(terms):
        z = []
        for t in terms:
            assert t[0] == "value"
            z.append(t[1])
        return " ".join(sorted(z))

    q = qp.parse(u("b*"))
    ts = q.existing_terms(r)
    assert_equal(ts, set())
    ts = q.existing_terms(r, expand=True)
    assert_equal(words(ts), "bear boggle bravo")

    q = qp.parse(u("[a TO f]"))
    ts = q.existing_terms(r)
    assert_equal(ts, set())
    ts = q.existing_terms(r, expand=True)
    assert_equal(words(ts), "alfa bear boggle bravo charlie delta echo")

    q = query.Variations("value", "render")
    ts = q.existing_terms(r, expand=False)
    assert_equal(ts, set())
    ts = q.existing_terms(r, expand=True)
    assert_equal(words(ts), "render rendering renders")
Example No. 6
def finddocs(query, daterange=None, page=1, ndocs=PER_PAGE, MAX_SEARCH_RESULTS=MAX_SEARCH_RESULTS, distribution=True):
    ix = open_dir(indexdir)
    res = []
    daycount = {}
    with ix.searcher() as searcher:
        parser = QueryParser("content", ix.schema)
        myquery = parser.parse(query)
        if distribution:
            myfacet = Facets().add_field("date", maptype=sorting.UnorderedList)
            if daterange is not None:
                datequery = DateRange("date", daterange[0], daterange[1])
                results = searcher.search(datequery & myquery, groupedby=myfacet, limit=MAX_SEARCH_RESULTS)
            else:
                results = searcher.search(myquery, groupedby=myfacet, limit=MAX_SEARCH_RESULTS)
            doc_cnt = 0
            for day, docs in results.groups().iteritems():
                daycount[day] = len(docs)
                for result in docs:
                    if (page - 1) * ndocs <= doc_cnt < page * ndocs:
                        stored = searcher.stored_fields(result)
                        res.append({'title': stored['title'], 'identifier': stored['identifier'], 'date': stored['date']})
                    doc_cnt += 1
            total_docs = results.estimated_length()
            return res, total_docs, daycount
        else:
            if daterange is not None:
                datequery = DateRange("date", daterange[0], daterange[1])
                results = searcher.search(datequery & myquery, limit=MAX_SEARCH_RESULTS)
            else:
                results = searcher.search(myquery, limit=MAX_SEARCH_RESULTS)
            for result in results[(page - 1) * ndocs:page * ndocs]:
                res.append({'title': result['title'], 'identifier': result['identifier'], 'date': result['date']})
        total_docs = results.estimated_length()
        return res, total_docs
Example No. 7
    def searchNote(self):
        """ Sorting criteria: "title > path > content"
            Search matches are organized into html source.
        """

        pattern = self.searchEdit.text()
        if not pattern:
            return
        results = []
        print("Searching using", pattern)
        with self.ix.searcher() as searcher:
            matches = []
            for f in ["title", "path", "content"]:
                queryp = QueryParser(f, self.ix.schema)
                queryp.add_plugin(RegexPlugin())
                # r"pattern" is the desired regex term format
                query = queryp.parse('r"' + pattern + '"')
                ms = searcher.search(query, limit=None) # default limit is 10!
                for m in ms:
                    if m not in matches:
                        matches.append(m)

            for r in matches:
                title = r['title']
                path = r['path']
                term = r.highlights("content")
                results.append([title, path, term])

            html = ""
            for title, path, hi in results:
                html += ("<p><a href='" + path + "'>" + title +
                         "</a><br/><span class='path'>" +
                         path + "</span><br/>" + hi + "</p>")
            self.searchView.setHtml(html)
            print("Finished searching", pattern)
Example No. 8
def get_answer(message):

    # Ignore messages whose first character is '/'.
    if message.startswith('/'):
        return None

    rx = r'jova,?\s(.+)$'
    m = re.match(rx, message)
    if not m:
        return None

    global ix

    search_terms = m.group(1)
    parser = QueryParser("content", ix.schema)
    qry = parser.parse(search_terms)

    with ix.searcher() as searcher:
        results = searcher.search(qry)
        result = None
        if len(results) == 0:
            return None
        if len(results) == 1:
            result = results[0]
        else:
            result = random.choice(results)

        if result is None or 'path' not in result:
            return None

        return result['path'], 'plain-text'

Example No. 9
def findsnippets(query, daterange=None, page=1, ndocs=PER_PAGE, MAX_SEARCH_RESULTS=MAX_SEARCH_RESULTS, distribution=True):
    ix = open_dir(indexdir)
    res = []
    daycount = {}
    with ix.searcher() as searcher:
        parser = QueryParser("content", ix.schema)
        myquery = parser.parse(query)
        if daterange is not None:
            datequery = DateRange("date", daterange[0], daterange[1])
            results = searcher.search(datequery & myquery, limit=MAX_SEARCH_RESULTS)
        else:
            results = searcher.search(myquery, limit=MAX_SEARCH_RESULTS)
        if distribution:
            myfacet = Facets().add_field("date", maptype=sorting.Count)
            if daterange is not None:
                daycount_orig = searcher.search(datequery & myquery, groupedby=myfacet, limit=MAX_SEARCH_RESULTS)
            else:
                daycount_orig = searcher.search(myquery, groupedby=myfacet, limit=MAX_SEARCH_RESULTS)
            groups = daycount_orig.groups()
            for day in groups:
                daycount[day] = groups[day]
            for result in results[(page - 1) * ndocs:page * ndocs]:
                doc = PoorDoc(docidentifier=result['identifier'], date=int(result['date'].strftime("%Y%m%d")))
                snippet = result.highlights("content", text=doc.getcontent())
                res.append({'title': result['title'], 'identifier': result['identifier'], 'date': result['date'], 'snippet': snippet})
            total_docs = results.estimated_length()
            return res, total_docs, daycount
        else:
            for result in results[(page - 1) * ndocs:page * ndocs]:
                doc = PoorDoc(docidentifier=result['identifier'], date=int(result['date'].strftime("%Y%m%d")))
                snippet = result.highlights("content", text=doc.getcontent())
                res.append({'title': result['title'], 'identifier': result['identifier'], 'date': result['date'], 'snippet': snippet})
        total_docs = results.estimated_length()
        return res, total_docs
Example No. 10
def test_correct_query():
    schema = fields.Schema(a=fields.TEXT(spelling=True), b=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(a=u("alfa bravo charlie delta"))
    w.add_document(a=u("delta echo foxtrot golf"))
    w.add_document(a=u("golf hotel india juliet"))
    w.add_document(a=u("juliet kilo lima mike"))
    w.commit()

    s = ix.searcher()
    qp = QueryParser("a", ix.schema)
    qtext = u('alpha ("brovo november" OR b:dolta) detail')
    q = qp.parse(qtext)

    c = s.correct_query(q, qtext)
    assert c.query.__unicode__() == '(a:alfa AND (a:"bravo november" OR b:dolta) AND a:detail)'
    assert c.string == 'alfa ("bravo november" OR b:dolta) detail'

    qtext = u('alpha b:("brovo november" a:delta) detail')
    q = qp.parse(qtext)
    c = s.correct_query(q, qtext)
    assert c.query.__unicode__() == '(a:alfa AND b:"brovo november" AND a:delta AND a:detail)'
    assert c.string == 'alfa b:("brovo november" a:delta) detail'

    hf = highlight.HtmlFormatter(classname="c")
    assert c.format_string(hf) == '<strong class="c term0">alfa</strong> b:("brovo november" a:delta) detail'
Example No. 11
    def populateTable(self, searchterm=None):
        self.infoTable.clear()
        self.infoTable.setHorizontalHeaderLabels( [ "Title", "Authors", "Tags", "Year", "Read" ] )
        self.infoTable.setRowCount(0)
        self.infoTable.horizontalHeader().setResizeMode(0, QHeaderView.Stretch)
        self.infoTable.verticalHeader().hide()

        if searchterm is None or searchterm == "":
            papers = KKDocument.objects.all()
            for p in papers:
                a = ', '.join([x.name for x in p.authors.all()])
                t = ', '.join([x.tag  for x in p.tags.all()])
                self.newEntry(p.title, a, t, p.year, p)
            return  # we're done here - all papers printed

        # only if there is a searchterm:
        # search full text with whoosh
        print "FINDING %s" % searchterm
        searcher = self.whoosh_ix.searcher()
        parser = QueryParser("content", schema = self.whoosh_schema)
        query = parser.parse(unicode(searchterm))
        whoosh_results = searcher.search(query)

        print "FOUND", len(whoosh_results), "Objects"

        for r in whoosh_results:
            p = KKDocument.objects.get(localFile=r['path'])
            a = ', '.join([x.name for x in p.authors.all()])
            t = ', '.join([x.tag  for x in p.tags.all()])
            self.newEntry(p.title, a, t, p.year, p)
Example No. 12
def grid_search(rookie_avg, surround, fragment_char_limit, whoosh_results, corpus, query_string):
    '''
    find best top parameter for whoosh snips. top parameter controls how many ... delimited fragments
    best = minimize distance w/ average size of rookie snip
    '''
    best = {""}
    best_distance_so_far = 1000000000 
    index = open_dir("indexes/{}/".format(corpus))
    corpusid = getcorpusid(corpus) 
    for top in range(1, 5): # basically fixed for now
        with index.searcher() as srch:
            query_parser = QueryParser("content", schema=index.schema)
            qry = query_parser.parse(query_string)
            results = srch.search(qry, limit=None)
            results.fragmenter.surround = surround
            results.fragmenter.maxchars = fragment_char_limit
            sum_ = 0
            for s_ix, a in enumerate(results):
                path = a.get("path").replace("/", "")
                sents = get_preproc_sentences(path, corpusid)
                sss = unicode(" ".join(sents).encode("ascii", "ignore"))
                sss = str(a.highlights("content", text=sss, top=top))
                sum_ += len(sss)
            diff = abs(rookie_avg - float(sum_) / len(results))
            print "top of {} gives diff of {}".format(top, diff)
        if diff < best_distance_so_far:
            best = top
            best_distance_so_far = diff

    print "best top = {}".format(best)
    return best
Example No. 13
def crearEsquemaCorreo():
    correo1 = "1.txt"

    correoEsquema = Schema(remitente=ID(stored=True), destinatarios=KEYWORD(stored=True), fecha=DATETIME(stored=True), asunto=KEYWORD(stored=True), cuerpo=TEXT(stored=True))

    if not os.path.exists("indexCorreo"):
        os.mkdir("indexCorreo")

    # create_in already returns a usable Index; no need to reopen it.
    iC = index.create_in("indexCorreo", correoEsquema)
    
    writer = iC.writer()
    fecha = "20101015"
    date_email = datetime.strptime(fecha, "%Y%m%d")
    writer.add_document(remitente=u"unoarrobagmail.com", destinatarios=u"dosarrobagmail.com tresarrobagmail.com", fecha=date_email, asunto=u"Contrato de compraventa con la constructora", cuerpo=u"Estimados socios: ya hemos firmado el contrato de compraventa con el cliente preferencial. Espero noticias vuestras. Un saludo,")
    #writer.add_document(email=u"dosarrobagmail.com", name=u"Pedro Guerra")
    #writer.add_document(email=u"tresarrobagmail.com", name=u"Ana Montero")
    #writer.add_document(email=u"cuatroarrobagmail.com", name=u"Luis Pontes")
    writer.commit()
    
    qp = QueryParser("remitente", schema=iC.schema)
    q = qp.parse(u"unoarrobagmail.com")

    with iC.searcher() as s:
        results = s.search(q)
        print results[0]
Example No. 14
    def search_datasets(self, search_phrase, limit=None):
        """Search for just the datasets."""
        from collections import defaultdict

        from whoosh.qparser import QueryParser

        parser = QueryParser("doc", schema=self.dataset_index.schema)

        query = parser.parse(search_phrase)

        datasets = defaultdict(SearchResult)

        with self.dataset_index.searcher() as searcher:

            results = searcher.search(query, limit=limit)

            for hit in results:
                vid = hit.get('vid')
                bvid = hit.get('bvid')
                hit_type = hit.get('type')  # renamed to avoid shadowing the builtin

                datasets[bvid].vid = bvid
                if hit_type == 'b':
                    datasets[bvid].bundle_found = True
                    datasets[bvid].b_score += hit.score
                else:
                    datasets[bvid].p_score += hit.score
                    datasets[bvid].partitions.add(vid)

        return datasets
Example No. 15
def OnlyOneSearch(queryStr="", index=".index"):
	ix = get_index(index)
	searcher = ix.searcher()
	parser = QueryParser("name", schema=ix.schema)
	query = parser.parse(queryStr)
	results = searcher.search(query)
	return results
Example No. 16
def stage3():
    ix = open_dir(index_directory)
    if not ix:  # note: open_dir raises if the index is missing; it never returns None
        print "No index"
        return

    parser = QueryParser("content", ix.schema)
    with ix.searcher() as searcher:

        try:
            while True:
                search_phrase = raw_input('Search phrase: ')
                if not search_phrase: continue

                search_phrase = search_phrase.decode(sys.stdin.encoding)

                myquery = parser.parse(search_phrase)
                results = searcher.search(myquery)

                if results:
                    for result in results:
                        print "%s - %s (%s)" % (result['url'],result['title'], result['company'])

                else:
                    print "No matching results"

                print "\r\n"

        except KeyboardInterrupt:
            print "\nBae..."
            return
Example No. 17
def index_query(environ, **kwargs):
    """
    Return a generator of tiddlers that match
    the provided arguments.
    """
    logging.debug('entering with %s', environ)
    print 'getting called on index_query'
    config = environ['tiddlyweb.config']
    #store = environ['tiddlyweb.store']
    query_parts = []
    for field, value in kwargs.items():
        if field == 'tag':
            field = 'tags'
        query_parts.append('%s:%s' % (field, value))
    query_string = ' '.join(query_parts)

    print 'getting inside on index_query'
    schema = config.get('wsearch.schema', SEARCH_DEFAULTS['wsearch.schema'])
    searcher = get_searcher(config)
    parser = QueryParser('text', schema=Schema(**schema))
    query = parser.parse(query_string)
    logging.debug('query parsed to %s' % query)
    results = searcher.search(query)

    def tiddler_from_result(result):
        print 'r', result
        bag, title = result['id'].split(':', 1)
        tiddler = Tiddler(title, bag)
        return tiddler
        #return store.get(tiddler)

    for result in results:
        yield tiddler_from_result(result)
    return
Example No. 18
def search(q, default_field="content"):
    ix = index.open_dir(SEARCH_INDEX)
    searcher = ix.searcher()
    parser = QueryParser(default_field, schema=ix.schema)
    query = parser.parse(q)
    results = searcher.search(query)
    return results
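Note that the searcher opened above is never closed, and the returned Results stay tied to it. A variant that copies the stored fields out before the searcher closes (a sketch under the same assumed SEARCH_INDEX setup):

def search_and_close(q, default_field="content"):
    ix = index.open_dir(SEARCH_INDEX)
    with ix.searcher() as searcher:
        parser = QueryParser(default_field, schema=ix.schema)
        results = searcher.search(parser.parse(q))
        # materialize the stored fields while the searcher is still open
        return [hit.fields() for hit in results]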
Example No. 19
def search(request):
    hits = []
    results = []
    query = request.GET.get('q', None)
    newspaper = request.GET.get('newspaper', None)
    if newspaper is not None:
        index_dir = "C:/Django Projects/searcher/modules/index" + newspaper
        ix = index.open_dir(index_dir)
        searcher = ix.searcher()
        if query is not None and query != u"":
            query = query.replace('+', ' AND ').replace(' -', ' NOT ')
            parser = QueryParser("content", schema=ix.schema)
            try:
                qry = parser.parse(query)
            except Exception:  # fall back to no query if parsing fails
                qry = None
            if qry is not None:
                hits = searcher.search(qry)

        for hit in hits:
            title = hit['title']
            url = hit['url']
            date = hit['date']
            highlights = hit.highlights("content")
            keywords_list = [keyword for keyword, score in searcher.key_terms_from_text("content", hit['content'])]
            keywords = ", ".join(keywords_list)
            results.append(Result(title, url, date, highlights, keywords))

    variables = RequestContext(request, {
        'query': query,
        'hits': results
    })
    return render_to_response('search.html', variables)
Example No. 20
def test_correct_spell_field():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True))
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(text=u"rendering shading modeling reactions")

        with ix.searcher() as s:
            text = s.schema["text"]
            spell_text = s.schema["spell_text"]

            r = s.reader()
            words = [text.from_bytes(t) for t in r.lexicon("text")]
            assert words == ["model", "reaction", "render", "shade"]

            words = [spell_text.from_bytes(t) for t in r.lexicon("spell_text")]
            assert words == ["modeling", "reactions", "rendering", "shading"]

            qp = QueryParser("text", s.schema)
            qtext = u"renderink"
            q = qp.parse(qtext)

            r = s.search(q)
            assert len(r) == 0

            c = s.correct_query(q, qtext)
            assert c.string == "rendering"
            assert c.query == query.Term("text", "rendering")

            hf = highlight.HtmlFormatter(classname="c")
            assert c.format_string(hf) == '<strong class="c term0">rendering</strong>'
Example No. 21
def search_files(index_dir, content):
    """Search file content in the index.

    Returns False if the index does not exist, otherwise the search results.
    """
    if not index.exists_in(index_dir):
        print "index does not exist"
        return False
    ix = index.open_dir(index_dir)
    content = unicode(content)
    with ix.searcher() as searcher:
        parser = QueryParser("content", ix.schema)
        query = parser.parse(content)
        # whoosh.searching.Results
        results = searcher.search(query)
        print type(results)
        print len(results)
        for h in results:
            # whoosh.searching.Hit
            print type(h)
            print h
        # note: the searcher closes when this block exits, so the returned
        # Results should be consumed before then
        return results
Example No. 22
def test_all_terms():
    q = QueryParser("a", None).parse(u('hello b:there c:"my friend"'))
    ts = q.all_terms(phrases=False)
    assert sorted(ts) == [("a", "hello"), ("b", "there")]
    ts = q.all_terms(phrases=True)
    assert sorted(ts) == [("a", "hello"), ("b", "there"), ("c", "friend"),
                          ("c", "my")]
Example No. 23
def whooshSearch(searchTerm, brandTerm):
	brandTerm = brandTerm.lower()
	
	#Open index
	ix = index.open_dir("index")
	
	#Identify which field
	parserFood = QueryParser("tag", ix.schema)
	queryFood = parserFood.parse(searchTerm)
	
	
	searcher = ix.searcher()
	#Create filter
	if brandTerm == "":
		brandTerm = "general"
		allow_q = Term("source", brandTerm)
		resultFoodGen = searcher.search(queryFood,filter = allow_q, limit = 7)
		resultFoodNoGen = searcher.search(queryFood,mask = allow_q, limit = 1000)
		resultFood = [each for each in resultFoodGen] + [each for each in resultFoodNoGen]
	
	else:
		allow_q = Term("source", brandTerm)
		resultFood = searcher.search(queryFood,filter = allow_q, limit = 1000)

	resultList = []	
	for i in resultFood:
		resultList.append(i["id"])
	searcher.close()
	return resultList
Example No. 24
def whoosh(q_query, raw_query, page):
	results = []
	try:
		import internal_search
		with internal_search.ix.searcher() as searcher:
			from whoosh.qparser import QueryParser
			parser = QueryParser("keywords", internal_search.ix.schema)
			myquery = parser.parse('"%s"' %raw_query)
			w_results = searcher.search(myquery)
			for r in w_results:
				s = ''
				if 'official' in r:
					s = ' official'
				if 'twitter' in r:
					t = r['twitter']
				else:
					t = ''
				results.append({
					"style" : "internal_search%s" % s,
					"title" : r['title'],
					"url" : r['link'],
					"snippet" : r['content'],
					"display_url" :  r['link'],
					"twitter" : t
				})
	except Exception as ex:
		import traceback
		results.append({
			"style" : "error",
			"title" : "An error occured: %s" % repr(ex) + "<br/>" + traceback.format_exc().replace("\n", "<br/>"),
			"snippet" : "<pre>%s</pre>" % debug_info
		})
	return results
Example No. 25
File: store.py Project: leifj/pyFF
    def lookup(self, key, raw=True, field="entity_id"):
        if key == 'entities' or key is None:
            if raw:
                return b2u(list(self.objects.values()))
            else:
                return b2u(list(self.infos.values()))

        from whoosh.qparser import QueryParser
        key = key.strip('+')
        key = key.replace('+', ' AND ')
        for uri, a in list(ATTRS_INV.items()):
            key = key.replace(uri, a)
        key = " {!s} ".format(key)
        key = re.sub("([^=]+)=(\S+)", "\\1:\\2", key)
        key = re.sub("{([^}]+)}(\S+)", "\\1:\\2", key)
        key = key.strip()

        qp = QueryParser("object_id", schema=self.schema)
        q = qp.parse(key)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            for result in results:
                if raw:
                    lst.add(self.objects[result['object_id']])
                else:
                    lst.add(self.infos[result['object_id']])

        return b2u(list(lst))
Example No. 26
def fulltext_search(request, query, limit):
    # query index
    index = get_index()
    parser = QueryParser('value', index.schema)
    with index.searcher() as searcher:
        query = parser.parse(query)
        results = searcher.search(query, limit=limit)
        results = results_to_instances(request, results)

    # helper for limit
    def limited(res):
        if limit is not None:
            return res[:limit]
        return res

    # if authenticated, return all draft and published results
    authenticated = bool(authenticated_userid(request))
    if authenticated:
        return limited(
            [res for res in results if res.state in ['draft', 'published']])
    # check for submitter
    submitter = get_submitter(request)
    if submitter:
        return limited(
            [res for res in results if res.state == 'published' or
                (res.state == 'draft' and submitter == res.submitter)])
    # return only public results
    return limited([res for res in results if res.state == 'published'])
Example No. 27
    def search(self, q, page=1, size=30):
        searcher = self.ix.searcher()
        parser = QueryParser("content", schema=self.ix.schema, group=OrGroup)
        parser_rlt = parser.parse(q)
        results = searcher.search_page(parser_rlt, page, size)
        return self.parse_results(results)
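The group=OrGroup argument above changes how bare terms are joined. A minimal sketch of the difference (the schema is illustrative):

# Sketch: OrGroup joins bare query terms with OR instead of the default AND.
from whoosh import fields
from whoosh.qparser import QueryParser, OrGroup

schema = fields.Schema(content=fields.TEXT)
qp_and = QueryParser("content", schema)
qp_or = QueryParser("content", schema, group=OrGroup)
print(qp_and.parse(u"red apple"))  # terms ANDed: both must match
print(qp_or.parse(u"red apple"))   # terms ORed: either may match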
Example No. 28
    def searchNote(self):
        pattern = self.searchEdit.text()
        qres = []
        with self.ix.searcher() as searcher:
            queryp = QueryParser("content", self.ix.schema)
            queryp.add_plugin(RegexPlugin())
            # r"pattern" is the desired regex term format
            query = queryp.parse('r"' + pattern + '"')
            pathFacet = sorting.FieldFacet("path")
            scores = sorting.ScoreFacet()
            # default limit is 10!
            results = searcher.search(
                query, limit=None, sortedby=[pathFacet, scores])
            for r in results:
                title = r['title']
                text = r['path']
                term = r.highlights("content")
                qres.append([title, text, term])
            html = """
                    <style>
                        body { font-size: 14px; }
                        .path { font-size: 12px; color: #009933; }
                    </style>
                   """
            for ti, te, hi in qres:
                html += ("<p><a href='" + te + "'>" + ti +
                         "</a><br/><span class='path'>" +
                         te + "</span><br/>" + hi + "</p>")
            self.searchView.setHtml(html)
Example No. 29
    def __init__(self, search_term):

        search_term = re.sub(and_regex, ' AND ', search_term)
        search_term = re.sub(or_regex, ' OR ', search_term)

        parser = QueryParser("content", schema=None)
        q = parser.parse(search_term)
        invalid = self.validate_search_term(q)
        if invalid:
            raise ValueError(invalid + search_term)

        myapp.db_connector.connect()
        session = myapp.db_connector.get_session()

        subq = session.query( TourneyList.id.label("tourney_list_id"),
                              TourneyVenue.country.label("country_name"),
                              TourneyVenue.state.label("state_name"),
                              TourneyVenue.city.label("city_name"),
                              TourneyVenue.venue.label("venue_name"),
                              Tourney.tourney_type.label("tourney_type"),
                              func.group_concat( ShipPilot.ship_type.distinct()).label("ship_name" ),
                              func.group_concat( func.concat( Pilot.name, " ", Pilot.canon_name )).label("pilot_name"),
                              func.group_concat( func.concat( Upgrade.name, " ", Upgrade.canon_name ) ).label("upgrade_name") ). \
            join(Tourney).\
            join(TourneyVenue).\
            join(Ship). \
            join(ShipPilot). \
            join(Pilot). \
            outerjoin(ShipUpgrade). \
            outerjoin(Upgrade).\
            group_by( TourneyList.id).subquery()


        fn  = tree_to_expr(q, subq)
        self.query = session.query(subq.c.tourney_list_id).filter( fn )
Example No. 30
def test_correct_query():
    schema = fields.Schema(a=fields.TEXT(), b=fields.TEXT)
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(a=u"alfa bravo charlie delta")
            w.add_document(a=u"delta echo foxtrot golf")
            w.add_document(a=u"golf hotel india juliet")
            w.add_document(a=u"juliet kilo lima mike")

        with ix.searcher() as s:
            qp = QueryParser("a", ix.schema)
            qtext = u'alpha ("brovo november" OR b:dolta) detail'
            q = qp.parse(qtext)

            c = s.correct_query(q, qtext)
            cq = c.query
            assert isinstance(cq, query.And)
            assert cq[0].text == "alfa"
            assert isinstance(cq[1], query.Or)
            assert isinstance(cq[1][0], query.Phrase)
            assert cq[1][0].words == ["bravo", "november"]

            qtext = u'alpha b:("brovo november" a:delta) detail'
            q = qp.parse(qtext)
            c = s.correct_query(q, qtext)
            assert c.query.__unicode__() == '(a:alfa AND b:"brovo november" AND a:delta AND a:detail)'
            assert c.string == 'alfa b:("brovo november" a:delta) detail'

            hf = highlight.HtmlFormatter(classname="c")
            assert c.format_string(hf) == '<strong class="c term0">alfa</strong> b:("brovo november" a:delta) detail'
Example No. 31
Quick Start
    Whoosh is a library for indexing text and then searching that index. For example, if you were writing blogging software, you could use Whoosh to add a search function that lets users search the blog entries.
Here is a short example:
from whoosh.index import create_in
from whoosh.fields import *
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
ix = create_in("/home/gswewf/百科/indexer", schema)  # "indexer" is a directory, so create it first or this step fails (translator's note)
writer = ix.writer()
writer.add_document(title=u"First document", path=u"/a",
                    content=u"this is the first document we've add!")
writer.add_document(title=u"Second document", path=u"/b",
                    content=u"The second one is even more interesting!")
writer.commit()
from whoosh.qparser import QueryParser
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse("first")
    results = searcher.search(query)
    results[0]
{"title": u"First document", "path": u"/a"}

Index and Schema objects

Before you can use Whoosh you need an Index object, and the first time you create an Index you must define its Schema. The Schema lists the fields of the Index.
A field is one piece of information about each document in the index, such as its title or its text content. A field can be indexed (that is, searchable) and/or
stored (meaning the indexed value is returned with the results, which is useful for fields such as the title).
This schema has two fields, "title" and "content":

from whoosh.fields import Schema, TEXT
schema = Schema(title=TEXT, content=TEXT)

You only need to create the schema once, when the index is created; the schema is serialized and stored with the index.
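Because the schema is stored with the index, reopening it later takes only the directory; a minimal sketch:

# The schema was serialized at creation time, so open_dir needs no schema argument.
from whoosh.index import open_dir
ix = open_dir("/home/gswewf/百科/indexer")  # the directory used with create_in above
print(ix.schema)  # the Schema defined at creation time is restored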
Example No. 32
class WhooshSearchBackend(BaseSearchBackend):
    # Words reserved by Whoosh for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\',
        '+',
        '-',
        '&&',
        '||',
        '!',
        '(',
        ')',
        '{',
        '}',
        '[',
        ']',
        '^',
        '"',
        '~',
        '*',
        '?',
        ':',
        '.',
    )
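    # Illustration (not part of the original backend): queries are made literal
    # by backslash-escaping each reserved character in turn, e.g. 'C++?'
    # becomes 'C\+\+\?'. Escaping '\\' first avoids re-escaping the
    # backslashes that later replacements introduce.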

    def __init__(self, connection_alias, **connection_options):
        super(WhooshSearchBackend, self).__init__(connection_alias,
                                                  **connection_options)
        self.setup_complete = False
        self.use_file_storage = True
        # connection_options is a dict, so use .get() (getattr on a dict would
        # always return the default here).
        self.post_limit = connection_options.get('POST_LIMIT',
                                                 128 * 1024 * 1024)
        self.path = connection_options.get('PATH')

        if connection_options.get('STORAGE', 'file') != 'file':
            self.use_file_storage = False

        if self.use_file_storage and not self.path:
            raise ImproperlyConfigured(
                "You must specify a 'PATH' in your settings for connection '%s'."
                % connection_alias)

        self.log = logging.getLogger('haystack')

    def setup(self):
        """
        Defers loading until needed.
        """
        from haystack import connections
        new_index = False

        # Make sure the index is there.
        if self.use_file_storage and not os.path.exists(self.path):
            os.makedirs(self.path)
            new_index = True

        if self.use_file_storage and not os.access(self.path, os.W_OK):
            raise IOError(
                "The path to your Whoosh index '%s' is not writable for the current user/group."
                % self.path)

        if self.use_file_storage:
            self.storage = FileStorage(self.path)
        else:
            global LOCALS

            if getattr(LOCALS, 'RAM_STORE', None) is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        self.content_field_name, self.schema = self.build_schema(connections[
            self.connection_alias].get_unified_index().all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if new_index is True:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(
                        stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(
                        stored=True,
                        commas=True,
                        scorable=True,
                        field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(
                    stored=field_class.stored, sortable=True)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=int,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=float,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(
                    stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(
                    minsize=3,
                    maxsize=15,
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                    minsize=2,
                    maxsize=15,
                    at='start',
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = TEXT(
                    stored=True,
                    analyzer=ChineseAnalyzer(),
                    field_boost=field_class.boost,
                    sortable=True)

            if field_class.document is True:
                content_field_name = field_class.index_fieldname
                schema_fields[field_class.index_fieldname].spelling = True

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            try:
                doc = index.full_prepare(obj)
            except SkipDocument:
                self.log.debug(u"Indexing for object `%s` skipped", obj)
            else:
                # Really make sure it's unicode, because Whoosh won't have it any
                # other way.
                for key in doc:
                    doc[key] = self._from_python(doc[key])

                # Document boosts aren't supported in Whoosh 2.5.0+.
                if 'boost' in doc:
                    del doc['boost']

                try:
                    writer.update_document(**doc)
                except Exception as e:
                    if not self.silently_fail:
                        raise

                    # We'll log the object identifier but won't include the actual object
                    # to avoid the possibility of that generating encoding errors while
                    # processing the log message:
                    self.log.error(u"%s while preparing object for update" %
                                   e.__class__.__name__,
                                   exc_info=True,
                                   extra={
                                       "data": {
                                           "index": index,
                                           "object": get_identifier(obj)
                                       }
                                   })

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()

    def remove(self, obj_or_string, commit=True):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        whoosh_id = get_identifier(obj_or_string)

        try:
            self.index.delete_by_query(q=self.parser.parse(u'%s:"%s"' %
                                                           (ID, whoosh_id)))
        except Exception as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to remove document '%s' from Whoosh: %s",
                           whoosh_id,
                           e,
                           exc_info=True)

    def clear(self, models=None, commit=True):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()

        if models is not None:
            assert isinstance(models, (list, tuple))

        try:
            if models is None:
                self.delete_index()
            else:
                models_to_delete = []

                for model in models:
                    models_to_delete.append(u"%s:%s" %
                                            (DJANGO_CT, get_model_ct(model)))

                self.index.delete_by_query(
                    q=self.parser.parse(u" OR ".join(models_to_delete)))
        except Exception as e:
            if not self.silently_fail:
                raise

            if models is not None:
                self.log.error(
                    "Failed to clear Whoosh index of models '%s': %s",
                    ','.join(models_to_delete),
                    e,
                    exc_info=True)
            else:
                self.log.error("Failed to clear Whoosh index: %s",
                               e,
                               exc_info=True)

    def delete_index(self):
        # Per the Whoosh mailing list, if wiping out everything from the index,
        # it's much more efficient to simply delete the index files.
        if self.use_file_storage and os.path.exists(self.path):
            shutil.rmtree(self.path)
        elif not self.use_file_storage:
            self.storage.clean()

        # Recreate everything.
        self.setup()

    def optimize(self):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        self.index.optimize()

    def calculate_page(self, start_offset=0, end_offset=None):
        # Prevent against Whoosh throwing an error. Requires an end_offset
        # greater than 0.
        if end_offset is not None and end_offset <= 0:
            end_offset = 1

        # Determine the page.
        page_num = 0

        if end_offset is None:
            end_offset = 1000000

        if start_offset is None:
            start_offset = 0

        page_length = end_offset - start_offset

        if page_length and page_length > 0:
            page_num = int(start_offset / page_length)

        # Increment because Whoosh uses 1-based page numbers.
        page_num += 1
        return page_num, page_length

    @log_query
    def search(self,
               query_string,
               sort_by=None,
               start_offset=0,
               end_offset=None,
               fields='',
               highlight=False,
               facets=None,
               date_facets=None,
               query_facets=None,
               narrow_queries=None,
               spelling_query=None,
               within=None,
               dwithin=None,
               distance_point=None,
               models=None,
               limit_to_registered_models=None,
               result_class=None,
               **kwargs):
        if not self.setup_complete:
            self.setup()

        # A zero length query should return no results.
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }

        query_string = force_text(query_string)

        # A one-character query (non-wildcard) gets nabbed by a stopwords
        # filter and should yield zero results.
        if len(query_string) <= 1 and query_string != u'*':
            return {
                'results': [],
                'hits': 0,
            }

        reverse = False

        if sort_by is not None:
            # Determine if we need to reverse the results and if Whoosh can
            # handle what it's being asked to sort by. Reversing is an
            # all-or-nothing action, unfortunately.
            sort_by_list = []
            reverse_counter = 0

            for order_by in sort_by:
                if order_by.startswith('-'):
                    reverse_counter += 1

            if reverse_counter and reverse_counter != len(sort_by):
                raise SearchBackendError("Whoosh requires all order_by fields"
                                         " to use the same sort direction")

            for order_by in sort_by:
                if order_by.startswith('-'):
                    sort_by_list.append(order_by[1:])

                    if len(sort_by_list) == 1:
                        reverse = True
                else:
                    sort_by_list.append(order_by)

                    if len(sort_by_list) == 1:
                        reverse = False

            sort_by = sort_by_list

        if facets is not None:
            warnings.warn("Whoosh does not handle faceting.",
                          Warning,
                          stacklevel=2)

        if date_facets is not None:
            warnings.warn("Whoosh does not handle date faceting.",
                          Warning,
                          stacklevel=2)

        if query_facets is not None:
            warnings.warn("Whoosh does not handle query faceting.",
                          Warning,
                          stacklevel=2)

        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add(' OR '.join(
                ['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

        narrow_searcher = None

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(
                    self.parser.parse(force_text(nq)), limit=None)

                if len(recent_narrowed_results) <= 0:
                    return {
                        'results': [],
                        'hits': 0,
                    }

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        self.index = self.index.refresh()

        if self.index.doc_count():
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query_string)

            # In the event of an invalid/stopworded query, recover gracefully.
            if parsed_query is None:
                return {
                    'results': [],
                    'hits': 0,
                }

            page_num, page_length = self.calculate_page(
                start_offset, end_offset)

            search_kwargs = {
                'pagelen': page_length,
                'sortedby': sort_by,
                'reverse': reverse,
            }

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None:
                search_kwargs['filter'] = narrowed_results

            try:
                raw_page = searcher.search_page(parsed_query, page_num,
                                                **search_kwargs)
            except ValueError:
                if not self.silently_fail:
                    raise

                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            # Because as of Whoosh 2.5.1, it will return the wrong page of
            # results if you request something too high. :(
            if raw_page.pagenum < page_num:
                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            results = self._process_results(raw_page,
                                            highlight=highlight,
                                            query_string=query_string,
                                            spelling_query=spelling_query,
                                            result_class=result_class)
            searcher.close()

            if hasattr(narrow_searcher, 'close'):
                narrow_searcher.close()

            return results
        else:
            if self.include_spelling:
                if spelling_query:
                    spelling_suggestion = self.create_spelling_suggestion(
                        spelling_query)
                else:
                    spelling_suggestion = self.create_spelling_suggestion(
                        query_string)
            else:
                spelling_suggestion = None

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': spelling_suggestion,
            }

    def more_like_this(self,
                       model_instance,
                       additional_query_string=None,
                       start_offset=0,
                       end_offset=None,
                       models=None,
                       limit_to_registered_models=None,
                       result_class=None,
                       **kwargs):
        if not self.setup_complete:
            self.setup()

        field_name = self.content_field_name
        narrow_queries = set()
        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add(' OR '.join(
                ['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

        if additional_query_string and additional_query_string != '*':
            narrow_queries.add(additional_query_string)

        narrow_searcher = None

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(
                    self.parser.parse(force_text(nq)), limit=None)

                if len(recent_narrowed_results) <= 0:
                    return {
                        'results': [],
                        'hits': 0,
                    }

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        page_num, page_length = self.calculate_page(start_offset, end_offset)

        self.index = self.index.refresh()
        raw_results = EmptyResults()

        searcher = None
        if self.index.doc_count():
            query = "%s:%s" % (ID, get_identifier(model_instance))
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query)
            results = searcher.search(parsed_query)

            if len(results):
                raw_results = results[0].more_like_this(field_name,
                                                        top=end_offset)

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None and hasattr(raw_results, 'filter'):
                raw_results.filter(narrowed_results)

        try:
            raw_page = ResultsPage(raw_results, page_num, page_length)
        except ValueError:
            if not self.silently_fail:
                raise

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        # Because as of Whoosh 2.5.1, it will return the wrong page of
        # results if you request something too high. :(
        if raw_page.pagenum < page_num:
            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        results = self._process_results(raw_page, result_class=result_class)

        if searcher:
            searcher.close()

        if hasattr(narrow_searcher, 'close'):
            narrow_searcher.close()

        return results

    def _process_results(self,
                         raw_page,
                         highlight=False,
                         query_string='',
                         spelling_query=None,
                         result_class=None):
        from haystack import connections
        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)

        if result_class is None:
            result_class = SearchResult

        facets = {}
        spelling_suggestion = None
        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            additional_fields = {}
            model = haystack_get_model(app_label, model_name)

            if model and model in indexed_models:
                for key, value in raw_result.items():
                    index = unified_index.get_index(model)
                    string_key = str(key)

                    if string_key in index.fields and hasattr(
                            index.fields[string_key], 'convert'):
                        # Special-cased due to the nature of KEYWORD fields.
                        if index.fields[string_key].is_multivalued:
                            if value is None or len(value) == 0:
                                additional_fields[string_key] = []
                            else:
                                additional_fields[string_key] = value.split(
                                    ',')
                        else:
                            additional_fields[string_key] = index.fields[
                                string_key].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)

                del additional_fields[DJANGO_CT]
                del additional_fields[DJANGO_ID]

                if highlight:
                    sa = StemmingAnalyzer()
                    formatter = WhooshHtmlFormatter('em')
                    terms = [token.text for token in sa(query_string)]

                    whoosh_result = whoosh_highlight(
                        additional_fields.get(self.content_field_name), terms,
                        sa, ContextFragmenter(), formatter)
                    additional_fields['highlighted'] = {
                        self.content_field_name: [whoosh_result],
                    }

                result = result_class(app_label, model_name,
                                      raw_result[DJANGO_ID], score,
                                      **additional_fields)
                results.append(result)
            else:
                hits -= 1

        if self.include_spelling:
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(
                    spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(
                    query_string)

        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }

    def create_spelling_suggestion(self, query_string):
        spelling_suggestion = None
        reader = self.index.reader()
        corrector = reader.corrector(self.content_field_name)
        cleaned_query = force_text(query_string)

        if not query_string:
            return spelling_suggestion

        # Clean the string.
        for rev_word in self.RESERVED_WORDS:
            cleaned_query = cleaned_query.replace(rev_word, '')

        for rev_char in self.RESERVED_CHARACTERS:
            cleaned_query = cleaned_query.replace(rev_char, '')

        # Break it down.
        query_words = cleaned_query.split()
        suggested_words = []

        for word in query_words:
            suggestions = corrector.suggest(word, limit=1)

            if len(suggestions) > 0:
                suggested_words.append(suggestions[0])

        spelling_suggestion = ' '.join(suggested_words)
        return spelling_suggestion
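
    # Illustrative behaviour (assumed data): with "python programming" indexed
    # in the content field, create_spelling_suggestion("pythn progamming")
    # strips reserved words and characters, asks the field's corrector for
    # each remaining word, and joins the top suggestions: "python programming".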

    def _from_python(self, value):
        """
        Converts Python values to a string for Whoosh.

        Code courtesy of pysolr.
        """
        if hasattr(value, 'strftime'):
            if not hasattr(value, 'hour'):
                value = datetime(value.year, value.month, value.day, 0, 0, 0)
        elif isinstance(value, bool):
            if value:
                value = 'true'
            else:
                value = 'false'
        elif isinstance(value, (list, tuple)):
            value = u','.join([force_text(v) for v in value])
        elif isinstance(value, (six.integer_types, float)):
            # Leave it alone.
            pass
        else:
            value = force_text(value)
        return value

    def _to_python(self, value):
        """
        Converts values from Whoosh to native Python values.

        A port of the same method in pysolr, as they deal with data the same way.
        """
        if value == 'true':
            return True
        elif value == 'false':
            return False

        if value and isinstance(value, six.string_types):
            possible_datetime = DATETIME_REGEX.search(value)

            if possible_datetime:
                date_values = possible_datetime.groupdict()

                for dk, dv in date_values.items():
                    date_values[dk] = int(dv)

                return datetime(date_values['year'], date_values['month'],
                                date_values['day'], date_values['hour'],
                                date_values['minute'], date_values['second'])

        try:
            # Attempt to use json to load the values.
            converted_value = json.loads(value)

            # Try to handle most built-in types.
            if isinstance(
                    converted_value,
                (list, tuple, set, dict, six.integer_types, float, complex)):
                return converted_value
        except:
            # If it fails (SyntaxError or its ilk) or we don't trust it,
            # continue on.
            pass

        return value
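
The two converters above round-trip Python values through the strings Whoosh stores. A minimal standalone sketch of the datetime leg (the regex here is an assumption modelled on the ISO-8601 strings exercised in the tests below, not necessarily Haystack's exact DATETIME_REGEX):

import re
from datetime import datetime

# Assumed ISO-8601-style pattern, e.g. '2009-05-09T16:14:00'.
DATETIME_REGEX = re.compile(
    r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})T'
    r'(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2})')

def to_python(value):
    """Turn a stored string back into a datetime when it matches."""
    match = DATETIME_REGEX.search(value)
    if match:
        return datetime(**{k: int(v) for k, v in match.groupdict().items()})
    return value

print(to_python('2009-05-09T16:14:00'))  # -> datetime(2009, 5, 9, 16, 14)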
Exemplo n.º 33
0
# (The head of this example was truncated; the lines below are a plausible
# reconstruction, assuming the jieba ChineseAnalyzer that the Chinese test
# content further down implies.)
import os

from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in
from whoosh.qparser import QueryParser
from jieba.analyse import ChineseAnalyzer

analyzer = ChineseAnalyzer()
schema = Schema(title=TEXT(stored=True),
                path=ID(stored=True),
                content=TEXT(stored=True,
                             analyzer=analyzer))

if not os.path.exists("tmp"):
    os.mkdir("tmp")

ix = create_in("tmp", schema)
writer = ix.writer()
# add different docs
writer.add_document(title="document1",
                    path="/a",
                    content="This is the first document we've added!")

writer.add_document(title="document2",
                    path="/b",
                    content="The second one 用来测试中文吧 is even more interesting!")

writer.commit()
searcher = ix.searcher()
parser = QueryParser("content", schema=ix.schema)

for keywords in ("你", "first", "中文"):
    print(keywords + " results are as follows: ")
    q = parser.parse(keywords)
    results = searcher.search(q)
    for hit in results:
        print(hit.highlights("content"))
    print("\n------------cut line-------------\n")

for t in analyzer(""):
    print(t.text)
class WhooshSearchBackendTestCase(WhooshTestCase):
    fixtures = ['bulk_data.json']

    def setUp(self):
        super(WhooshSearchBackendTestCase, self).setUp()

        self.old_ui = connections['whoosh'].get_unified_index()
        self.ui = UnifiedIndex()
        self.wmmi = WhooshMockSearchIndex()
        self.wmmidni = WhooshMockSearchIndexWithSkipDocument()
        self.wmtmmi = WhooshMaintainTypeMockSearchIndex()
        self.ui.build(indexes=[self.wmmi])
        self.sb = connections['whoosh'].get_backend()
        connections['whoosh']._index = self.ui

        self.sb.setup()
        self.raw_whoosh = self.sb.index
        self.parser = QueryParser(self.sb.content_field_name,
                                  schema=self.sb.schema)
        self.sb.delete_index()

        self.sample_objs = MockModel.objects.all()

    def tearDown(self):
        connections['whoosh']._index = self.old_ui
        super(WhooshSearchBackendTestCase, self).tearDown()

    def whoosh_search(self, query):
        self.raw_whoosh = self.raw_whoosh.refresh()
        searcher = self.raw_whoosh.searcher()
        return searcher.search(self.parser.parse(query), limit=1000)

    def test_non_silent(self):
        bad_sb = connections['whoosh'].backend('bad',
                                               PATH='/tmp/bad_whoosh',
                                               SILENTLY_FAIL=False)
        bad_sb.use_file_storage = False
        bad_sb.storage = 'omg.wtf.bbq'

        try:
            bad_sb.update(self.wmmi, self.sample_objs)
            self.fail()
        except:
            pass

        try:
            bad_sb.remove('core.mockmodel.1')
            self.fail()
        except:
            pass

        try:
            bad_sb.clear()
            self.fail()
        except:
            pass

        try:
            bad_sb.search('foo')
            self.fail()
        except:
            pass

    def test_update(self):
        self.sb.update(self.wmmi, self.sample_objs)

        # Check what Whoosh thinks is there.
        self.assertEqual(len(self.whoosh_search(u'*')), 23)
        self.assertEqual(
            [doc.fields()['id'] for doc in self.whoosh_search(u'*')],
            [u'core.mockmodel.%s' % i for i in range(1, 24)])

    def test_update_with_SkipDocument_raised(self):
        self.sb.update(self.wmmidni, self.sample_objs)

        # Check what Whoosh thinks is there.
        res = self.whoosh_search(u'*')
        self.assertEqual(len(res), 14)
        ids = [1, 2, 5, 6, 7, 8, 9, 11, 12, 14, 15, 18, 20, 21]
        self.assertListEqual([doc.fields()['id'] for doc in res],
                             [u'core.mockmodel.%s' % i for i in ids])

    def test_remove(self):
        self.sb.update(self.wmmi, self.sample_objs)
        self.assertEqual(self.sb.index.doc_count(), 23)

        self.sb.remove(self.sample_objs[0])
        self.assertEqual(self.sb.index.doc_count(), 22)

    def test_clear(self):
        self.sb.update(self.wmmi, self.sample_objs)
        self.assertEqual(self.sb.index.doc_count(), 23)

        self.sb.clear()
        self.assertEqual(self.sb.index.doc_count(), 0)

        self.sb.update(self.wmmi, self.sample_objs)
        self.assertEqual(self.sb.index.doc_count(), 23)

        self.sb.clear([AnotherMockModel])
        self.assertEqual(self.sb.index.doc_count(), 23)

        self.sb.clear([MockModel])
        self.assertEqual(self.sb.index.doc_count(), 0)

        self.sb.index.refresh()
        self.sb.update(self.wmmi, self.sample_objs)
        self.assertEqual(self.sb.index.doc_count(), 23)

        self.sb.clear([AnotherMockModel, MockModel])
        self.assertEqual(self.raw_whoosh.doc_count(), 0)

    def test_search(self):
        self.sb.update(self.wmmi, self.sample_objs)
        self.assertEqual(len(self.whoosh_search(u'*')), 23)

        # No query string should always yield zero results.
        self.assertEqual(self.sb.search(u''), {'hits': 0, 'results': []})

        # A one letter query string gets nabbed by a stopwords filter. Should
        # always yield zero results.
        self.assertEqual(self.sb.search(u'a'), {'hits': 0, 'results': []})

        # Possible AttributeError?
        # self.assertEqual(self.sb.search(u'a b'), {'hits': 0, 'results': [], 'spelling_suggestion': '', 'facets': {}})

        self.assertEqual(self.sb.search(u'*')['hits'], 23)
        self.assertEqual(
            [result.pk for result in self.sb.search(u'*')['results']],
            [u'%s' % i for i in range(1, 24)])

        self.assertEqual(self.sb.search(u'Indexe')['hits'], 23)
        self.assertEqual(
            self.sb.search(u'Indexe')['spelling_suggestion'], u'indexed')

        self.assertEqual(self.sb.search(u'', facets=['name']), {
            'hits': 0,
            'results': []
        })
        results = self.sb.search(u'Index*', facets=['name'])
        results = self.sb.search(u'index*', facets=['name'])
        self.assertEqual(results['hits'], 23)
        self.assertEqual(results['facets'], {})

        self.assertEqual(
            self.sb.search(u'',
                           date_facets={
                               'pub_date': {
                                   'start_date': date(2008, 2, 26),
                                   'end_date': date(2008, 2, 26),
                                   'gap': '/MONTH'
                               }
                           }), {
                               'hits': 0,
                               'results': []
                           })
        results = self.sb.search(u'Index*',
                                 date_facets={
                                     'pub_date': {
                                         'start_date': date(2008, 2, 26),
                                         'end_date': date(2008, 2, 26),
                                         'gap': '/MONTH'
                                     }
                                 })
        results = self.sb.search(u'index*',
                                 date_facets={
                                     'pub_date': {
                                         'start_date': date(2008, 2, 26),
                                         'end_date': date(2008, 2, 26),
                                         'gap': '/MONTH'
                                     }
                                 })
        self.assertEqual(results['hits'], 23)
        self.assertEqual(results['facets'], {})

        self.assertEqual(
            self.sb.search(u'', query_facets={'name': '[* TO e]'}), {
                'hits': 0,
                'results': []
            })
        results = self.sb.search(u'Index*', query_facets={'name': '[* TO e]'})
        results = self.sb.search(u'index*', query_facets={'name': '[* TO e]'})
        self.assertEqual(results['hits'], 23)
        self.assertEqual(results['facets'], {})

        # self.assertEqual(self.sb.search('', narrow_queries=set(['name:daniel1'])), {'hits': 0, 'results': []})
        # results = self.sb.search('Index*', narrow_queries=set(['name:daniel1']))
        # self.assertEqual(results['hits'], 1)

        # Ensure that swapping the ``result_class`` works.
        self.assertTrue(
            isinstance(
                self.sb.search(u'Index*',
                               result_class=MockSearchResult)['results'][0],
                MockSearchResult))

        # Check the use of ``limit_to_registered_models``.
        self.assertEqual(self.sb.search(u'', limit_to_registered_models=False),
                         {
                             'hits': 0,
                             'results': []
                         })
        self.assertEqual(
            self.sb.search(u'*', limit_to_registered_models=False)['hits'], 23)
        self.assertEqual([
            result.pk for result in self.sb.search(
                u'*', limit_to_registered_models=False)['results']
        ], [u'%s' % i for i in range(1, 24)])

        # Stow.
        old_limit_to_registered_models = getattr(
            settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)
        settings.HAYSTACK_LIMIT_TO_REGISTERED_MODELS = False

        self.assertEqual(self.sb.search(u''), {'hits': 0, 'results': []})
        self.assertEqual(self.sb.search(u'*')['hits'], 23)
        self.assertEqual(
            [result.pk for result in self.sb.search(u'*')['results']],
            [u'%s' % i for i in range(1, 24)])

        # Restore.
        settings.HAYSTACK_LIMIT_TO_REGISTERED_MODELS = old_limit_to_registered_models

    def test_highlight(self):
        self.sb.update(self.wmmi, self.sample_objs)
        self.assertEqual(len(self.whoosh_search(u'*')), 23)

        self.assertEqual(self.sb.search(u'', highlight=True), {
            'hits': 0,
            'results': []
        })
        self.assertEqual(self.sb.search(u'index*', highlight=True)['hits'], 23)

        query = self.sb.search('Index*', highlight=True)['results']
        result = [result.highlighted['text'][0] for result in query]

        self.assertEqual(result,
                         ['<em>Indexed</em>!\n%d' % i for i in range(1, 24)])

    def test_search_all_models(self):
        wamsi = WhooshAnotherMockSearchIndex()
        self.ui.build(indexes=[self.wmmi, wamsi])

        self.sb.update(self.wmmi, self.sample_objs)
        self.sb.update(wamsi, AnotherMockModel.objects.all())

        self.assertEqual(len(self.whoosh_search(u'*')), 25)

        self.ui.build(indexes=[self.wmmi])

    def test_more_like_this(self):
        self.sb.update(self.wmmi, self.sample_objs)
        self.assertEqual(len(self.whoosh_search(u'*')), 23)

        # Now supported by Whoosh (as of 1.8.4). See the ``LiveWhooshMoreLikeThisTestCase``.
        self.assertEqual(
            self.sb.more_like_this(self.sample_objs[0])['hits'], 22)

        # Make sure that swapping the ``result_class`` doesn't blow up.
        try:
            self.sb.more_like_this(self.sample_objs[0],
                                   result_class=MockSearchResult)
        except:
            self.fail()

    def test_delete_index(self):
        self.sb.update(self.wmmi, self.sample_objs)
        self.assertTrue(self.sb.index.doc_count() > 0)

        self.sb.delete_index()
        self.assertEqual(self.sb.index.doc_count(), 0)

    def test_order_by(self):
        self.sb.update(self.wmmi, self.sample_objs)

        results = self.sb.search(u'*', sort_by=['pub_date'])
        self.assertEqual([result.pk for result in results['results']], [
            u'1', u'3', u'2', u'4', u'5', u'6', u'7', u'8', u'9', u'10', u'11',
            u'12', u'13', u'14', u'15', u'16', u'17', u'18', u'19', u'20',
            u'21', u'22', u'23'
        ])

        results = self.sb.search(u'*', sort_by=['-pub_date'])
        self.assertEqual([result.pk for result in results['results']], [
            u'23', u'22', u'21', u'20', u'19', u'18', u'17', u'16', u'15',
            u'14', u'13', u'12', u'11', u'10', u'9', u'8', u'7', u'6', u'5',
            u'4', u'2', u'3', u'1'
        ])

        results = self.sb.search(u'*', sort_by=['id'])
        self.assertEqual([result.pk for result in results['results']], [
            u'1', u'10', u'11', u'12', u'13', u'14', u'15', u'16', u'17',
            u'18', u'19', u'2', u'20', u'21', u'22', u'23', u'3', u'4', u'5',
            u'6', u'7', u'8', u'9'
        ])

        results = self.sb.search(u'*', sort_by=['-id'])
        self.assertEqual([result.pk for result in results['results']], [
            u'9', u'8', u'7', u'6', u'5', u'4', u'3', u'23', u'22', u'21',
            u'20', u'2', u'19', u'18', u'17', u'16', u'15', u'14', u'13',
            u'12', u'11', u'10', u'1'
        ])

        results = self.sb.search(u'*', sort_by=['-pub_date', '-id'])
        self.assertEqual([result.pk for result in results['results']], [
            u'23', u'22', u'21', u'20', u'19', u'18', u'17', u'16', u'15',
            u'14', u'13', u'12', u'11', u'10', u'9', u'8', u'7', u'6', u'5',
            u'4', u'2', u'3', u'1'
        ])

        self.assertRaises(SearchBackendError,
                          self.sb.search,
                          u'*',
                          sort_by=['-pub_date', 'id'])

    def test__from_python(self):
        self.assertEqual(self.sb._from_python('abc'), u'abc')
        self.assertEqual(self.sb._from_python(1), 1)
        self.assertEqual(self.sb._from_python(2653), 2653)
        self.assertEqual(self.sb._from_python(25.5), 25.5)
        self.assertEqual(self.sb._from_python([1, 2, 3]), u'1,2,3')
        self.assertTrue("a': 1" in self.sb._from_python({
            'a': 1,
            'c': 3,
            'b': 2
        }))
        self.assertEqual(self.sb._from_python(datetime(2009, 5, 9, 16, 14)),
                         datetime(2009, 5, 9, 16, 14))
        self.assertEqual(self.sb._from_python(datetime(2009, 5, 9, 0, 0)),
                         datetime(2009, 5, 9, 0, 0))
        self.assertEqual(self.sb._from_python(datetime(1899, 5, 18, 0, 0)),
                         datetime(1899, 5, 18, 0, 0))
        self.assertEqual(
            self.sb._from_python(datetime(2009, 5, 18, 1, 16, 30, 250)),
            datetime(2009, 5, 18, 1, 16, 30, 250))

    def test__to_python(self):
        self.assertEqual(self.sb._to_python('abc'), 'abc')
        self.assertEqual(self.sb._to_python('1'), 1)
        self.assertEqual(self.sb._to_python('2653'), 2653)
        self.assertEqual(self.sb._to_python('25.5'), 25.5)
        self.assertEqual(self.sb._to_python('[1, 2, 3]'), [1, 2, 3])
        self.assertEqual(self.sb._to_python('{"a": 1, "b": 2, "c": 3}'), {
            'a': 1,
            'c': 3,
            'b': 2
        })
        self.assertEqual(self.sb._to_python('2009-05-09T16:14:00'),
                         datetime(2009, 5, 9, 16, 14))
        self.assertEqual(self.sb._to_python('2009-05-09T00:00:00'),
                         datetime(2009, 5, 9, 0, 0))
        self.assertEqual(self.sb._to_python(None), None)

    def test_range_queries(self):
        self.sb.update(self.wmmi, self.sample_objs)

        self.assertEqual(len(self.whoosh_search(u'[d TO]')), 23)
        self.assertEqual(len(self.whoosh_search(u'name:[d TO]')), 23)
        self.assertEqual(len(self.whoosh_search(u'Ind* AND name:[d to]')), 23)
        self.assertEqual(len(self.whoosh_search(u'Ind* AND name:[to c]')), 0)

    def test_date_queries(self):
        self.sb.update(self.wmmi, self.sample_objs)

        self.assertEqual(len(self.whoosh_search(u"pub_date:20090717003000")),
                         1)
        self.assertEqual(len(self.whoosh_search(u"pub_date:20090717000000")),
                         0)
        self.assertEqual(
            len(self.whoosh_search(u'Ind* AND pub_date:[to 20090717003000]')),
            3)

    def test_escaped_characters_queries(self):
        self.sb.update(self.wmmi, self.sample_objs)

        self.assertEqual(len(self.whoosh_search(u"Indexed\!")), 23)
        self.assertEqual(
            len(self.whoosh_search(u"http\:\/\/www\.example\.com")), 0)

    def test_build_schema(self):
        ui = UnifiedIndex()
        ui.build(indexes=[AllTypesWhooshMockSearchIndex()])

        (content_field_name,
         schema) = self.sb.build_schema(ui.all_searchfields())
        self.assertEqual(content_field_name, 'text')
        self.assertEqual(len(schema.names()), 9)
        self.assertEqual(schema.names(), [
            'django_ct', 'django_id', 'id', 'is_active', 'name', 'pub_date',
            'seen_count', 'sites', 'text'
        ])
        self.assertTrue(isinstance(schema._fields['text'], TEXT))
        self.assertTrue(isinstance(schema._fields['pub_date'], DATETIME))
        self.assertTrue(isinstance(schema._fields['seen_count'], NUMERIC))
        self.assertTrue(isinstance(schema._fields['sites'], KEYWORD))
        self.assertTrue(isinstance(schema._fields['is_active'], BOOLEAN))

    def test_verify_type(self):
        old_ui = connections['whoosh'].get_unified_index()
        ui = UnifiedIndex()
        wmtmmi = WhooshMaintainTypeMockSearchIndex()
        ui.build(indexes=[wmtmmi])
        connections['whoosh']._index = ui
        sb = connections['whoosh'].get_backend()
        sb.setup()
        sb.update(wmtmmi, self.sample_objs)

        self.assertEqual(sb.search(u'*')['hits'], 23)
        self.assertEqual(
            [result.month for result in sb.search(u'*')['results']], [
                u'06', u'07', u'06', u'07', u'07', u'07', u'07', u'07', u'07',
                u'07', u'07', u'07', u'07', u'07', u'07', u'07', u'07', u'07',
                u'07', u'07', u'07', u'07', u'07'
            ])
        connections['whoosh']._index = old_ui

    @unittest.skipIf(
        settings.HAYSTACK_CONNECTIONS['whoosh'].get('STORAGE') != 'file',
        'testing writability requires Whoosh to use STORAGE=file')
    def test_writable(self):
        if not os.path.exists(settings.HAYSTACK_CONNECTIONS['whoosh']['PATH']):
            os.makedirs(settings.HAYSTACK_CONNECTIONS['whoosh']['PATH'])

        os.chmod(settings.HAYSTACK_CONNECTIONS['whoosh']['PATH'], 0o400)

        try:
            self.sb.setup()
            self.fail()
        except IOError:
            # Yay. We failed
            pass

        os.chmod(settings.HAYSTACK_CONNECTIONS['whoosh']['PATH'], 0o755)

    def test_slicing(self):
        self.sb.update(self.wmmi, self.sample_objs)

        page_1 = self.sb.search(u'*', start_offset=0, end_offset=20)
        page_2 = self.sb.search(u'*', start_offset=20, end_offset=30)
        self.assertEqual(len(page_1['results']), 20)
        self.assertEqual([result.pk for result in page_1['results']],
                         [u'%s' % i for i in range(1, 21)])
        self.assertEqual(len(page_2['results']), 3)
        self.assertEqual([result.pk for result in page_2['results']],
                         [u'21', u'22', u'23'])

        # This used to throw an error.
        page_0 = self.sb.search(u'*', start_offset=0, end_offset=0)
        self.assertEqual(len(page_0['results']), 1)

    @unittest.expectedFailure
    def test_scoring(self):
        self.sb.update(self.wmmi, self.sample_objs)

        page_1 = self.sb.search(u'index', start_offset=0, end_offset=20)
        page_2 = self.sb.search(u'index', start_offset=20, end_offset=30)
        self.assertEqual(len(page_1['results']), 20)
        self.assertEqual(
            ["%0.2f" % result.score for result in page_1['results']], [
                '0.51', '0.51', '0.51', '0.51', '0.51', '0.51', '0.51', '0.51',
                '0.51', '0.40', '0.40', '0.40', '0.40', '0.40', '0.40', '0.40',
                '0.40', '0.40', '0.40', '0.40'
            ])
        self.assertEqual(len(page_2['results']), 3)
        self.assertEqual(
            ["%0.2f" % result.score for result in page_2['results']],
            ['0.40', '0.40', '0.40'])
Exemplo n.º 35
0
#!/usr/bin/env python

from whoosh.index import open_dir
from whoosh.qparser import QueryParser
from whoosh.query import *

ix = open_dir("index_dir")
with ix.searcher() as searcher:
    # Change this to "while True" to read queries from the terminal.
    while False:
        text = input("Dime:")
        if len(text) == 0:
            break
        query = QueryParser("content", ix.schema).parse(text)
        results = searcher.search(query)
        #        print(dir(results))
        #        print(results.docs)
        for r in results:
            # Print the global ID and the news item's position in the document.
            print("Global ID:", r["doc"] + ".", "News item number:",
                  r["num_noticia"])
            # If there are fewer than 4 results, print the whole news item.
            if len(results) < 4:
                f = open("./enero/" + r["doc"], mode='r')
                f = str(f.read()).split("<DOC>")
                print(f[int(r["num_noticia"])])

# Documents containing the text "valencia"
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse("valencia")
    results = searcher.search(query)
    for r in results:
        print("Global ID:", r["doc"] + ".", "News item number:",
              r["num_noticia"])

# (Truncated example: the walkFolder definition below is a plausible
# reconstruction around the original loadFile call; it assumes `import os`
# and a loadFile helper from the lost head of the snippet.)
def walkFolder(writer, folder):
    for root, dirs, files in os.walk(folder):
        for filename in files:
            loadFile(writer, filename)


############# END for walkFolder

walkFolder(writer, r"C:\DSA")

# Commit changes
writer.commit()  # save changes

# Get input, convert to unicode
qstr = input("Input a query: ")

print("searching for ", qstr)

####################################
# Build query parser and parse query
####################################
qp = QueryParser("content", schema=ix.schema)
q = qp.parse(qstr)

print(q)

####################################
# Search the content field
####################################
with ix.searcher(weighting=scoring.TF_IDF()) as s:
    results = s.search(q)
    for hit in results:
        print("Cell {} of Notebook '{}'".format(hit['cell_no'],
                                                hit['filename']))
Exemplo n.º 37
0
import logging

from django.http import JsonResponse
from documents.models import CodeOKVED, CodeOKS
from documents.models import Documents
from documents.serializers import DocumentsSerializer
from search.models import Search, AutoCompletion, SearchHistory
from search.serializers import SearchOptionsSerializer
from settings.common import SEARCH_ENGINE
from whoosh import index
from whoosh.fields import *
from whoosh.qparser import QueryParser

LOGGER = logging.getLogger('django')

if SEARCH_ENGINE.get('name') == 'whoosh':
    ix = index.open_dir(SEARCH_ENGINE.get('indexdir'))
    CLIENT = ix.searcher()
    QUERY_DOC_KIND = QueryParser("doc_kind", ix.schema)
    QUERY_DOC_MARK = QueryParser("doc_mark", ix.schema)
    QUERY_DOC_NAME_RU = QueryParser("doc_name_ru", ix.schema)
    QUERY_DOC_NAME_EN = QueryParser("doc_name_en", ix.schema)
    QUERY_DOC_NAME_ANNOTATION = QueryParser("doc_annotation", ix.schema)
    QUERY_DOC_COMMENT = QueryParser("doc_comment", ix.schema)
    QUERY_DOC_FULL_MARK = QueryParser("doc_full_mark", ix.schema)
    QUERY_DOC_STATUS = QueryParser("doc_status", ix.schema)
    QUERY_TK_RUS = QueryParser("tk_rus", ix.schema)
    QUERY_MTK_DEV = QueryParser("mtk_dev", ix.schema)
    QUERY_KEYWORDS = QueryParser("keywords", ix.schema)

    # parse() returns the query object; search() needs that, not the parser.
    query = QUERY_DOC_NAME_RU.parse("акустика")
    res = CLIENT.search(query)
    print(f'Test request results={res[0].get("doc_id")}')
Exemplo n.º 38
0
def search_database(keyword):
    with index.searcher() as searcher:
        query = QueryParser('name', index.schema).parse(keyword)
        # Copy the stored fields out while the searcher is still open: a
        # Whoosh Results object becomes unusable once the `with` block exits.
        return [hit.fields() for hit in searcher.search(query)]
Exemplo n.º 39
0
def search():
    query = request.form['query']
    q = []
    q.append(query)
    r = []  #complete path
    c = []  #preview of the paste content
    paste_date = []
    paste_size = []
    index_name = request.form['index_name']
    num_elem_to_get = 50

    # select correct index
    if index_name is None or index_name == "0":
        selected_index = get_current_index()
    else:
        selected_index = os.path.join(baseindexpath, index_name)

    # Search filename
    for path in r_serv_pasteName.smembers(q[0]):
        r.append(path)
        paste = Paste.Paste(path)
        content = paste.get_p_content().decode('utf8', 'ignore')
        content_range = max_preview_char if len(
            content) > max_preview_char else len(content) - 1
        c.append(content[0:content_range])
        curr_date = str(paste._get_p_date())
        curr_date = curr_date[0:4] + '/' + curr_date[4:6] + '/' + curr_date[6:]
        paste_date.append(curr_date)
        paste_size.append(paste._get_p_size())

    # Search full line
    schema = Schema(title=TEXT(stored=True),
                    path=ID(stored=True),
                    content=TEXT)

    ix = index.open_dir(selected_index)
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(" ".join(q))
        results = searcher.search_page(query, 1, pagelen=num_elem_to_get)
        for x in results:
            # Read the stored 'path' field directly; the original py2-era
            # x.items()[0][1] is not indexable in Python 3.
            r.append(x['path'])
            paste = Paste.Paste(x['path'])
            content = paste.get_p_content().decode('utf8', 'ignore')
            content_range = max_preview_char if len(
                content) > max_preview_char else len(content) - 1
            c.append(content[0:content_range])
            curr_date = str(paste._get_p_date())
            curr_date = curr_date[0:4] + '/' + curr_date[
                4:6] + '/' + curr_date[6:]
            paste_date.append(curr_date)
            paste_size.append(paste._get_p_size())
        results = searcher.search(query)
        num_res = len(results)

    index_min = 1
    index_max = len(get_index_list())
    return render_template("search.html",
                           r=r,
                           c=c,
                           query=request.form['query'],
                           paste_date=paste_date,
                           paste_size=paste_size,
                           char_to_display=max_preview_modal,
                           num_res=num_res,
                           index_min=index_min,
                           index_max=index_max,
                           index_list=get_index_list(selected_index))
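
The view above fetches one fixed page with `searcher.search_page`; a minimal sketch of walking every page of a result set (the query term is illustrative, and the same `ix` is assumed):

with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse("password")
    pagenum = 1
    while True:
        page = searcher.search_page(query, pagenum, pagelen=50)
        for hit in page:
            print(hit['path'])
        if page.is_last_page():
            break
        pagenum += 1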
# -*- coding: utf-8 -*-
"""
Created on Mon Mar  4 20:01:53 2019

@author: Fachry Firdaus
"""
from whoosh.qparser import QueryParser
from whoosh import scoring
from whoosh.index import open_dir
import sys

ix = open_dir("indexdir")

# query_str is query string
lanjut = "Y"

while (lanjut == "Y"):
    query_str = input('What would you like to search for? \n')
    # Top 'n' documents as result
    topN = 50
    with ix.searcher(weighting=scoring.Frequency) as searcher:
        query = QueryParser("content", ix.schema).parse(query_str)
        results = searcher.search(query, limit=topN)
        # Iterate over the hits themselves: indexing results[i] up to topN
        # raises an IndexError whenever fewer than topN documents match.
        for hit in results:
            print("Document name : ", hit['title'], "\tscore : ",
                  str(hit.score), "\n", hit['textdata'], "\n")

    lanjut = input("Search again? Y/N \n")
Exemplo n.º 41
0
import os

from whoosh.fields import SchemaClass, ID, TEXT, KEYWORD
from whoosh.index import create_in
from whoosh.qparser import QueryParser


class MySchema(SchemaClass):
    path = ID(stored=True)
    title = TEXT(stored=True)
    content = TEXT
    tags = KEYWORD
    icon = TEXT


# stored=True marks a field whose value is returned with search results


if not os.path.exists('index'):
    os.mkdir('index')
indeeex = create_in('index', MySchema)
# create_in builds an index storage directory with the schema above; all
# index files are saved in that directory ("index").
writer = indeeex.writer()
writer.add_document(title=u"my document", content=u"this is my document", path=u"/a", tags=u"firlst short",
                    icon=u"/icons/star.png")
writer.add_document(title=u"my second document", content=u"this is my second document", path=u"/b",
                    tags=u"second short", icon=u"/icons/sheep.png")
writer.commit()

# searcher = indeeex.searcher()
# As with open(), use a with block so the searcher's lifetime is managed.
with indeeex.searcher() as searcher:
    # do something
    query = QueryParser("content", indeeex.schema).parse("second")
    result = searcher.search(query)
    print(result)
    print(list(result))
    if count != 0:
        score /= count
    return score
def symmetric_sentence_similarity(sentence1, sentence2):
    """ compute the symmetric sentence similarity using Wordnet """
    return (sentence_similarity(sentence1, sentence2) + sentence_similarity(sentence2, sentence1)) / 2 

 
model = {}
max_info = {}
ix = open_dir("WikiSplit")

# use BM25F to calculate the similarity between title and query
searcher = ix.searcher()
schema = ix.schema
parser = QueryParser("title",schema)
# parser.add_plugin(MultifieldPlugin(["title","sent"]))

# claimsWithId contains (id, claim) pairs
claimTogether,claimsWithId = readClaims("train.json")

# Get the entities from all claims (to save time)
st = StanfordNERTagger('english.conll.4class.distsim.crf.ser.gz',
                           'stanford-ner.jar')
NerSen = st.tag(nltk.word_tokenize(claimTogether))

# NumOfClaim records the number of claims that have been processed
NumOfClaim = 0
TaggedClaim = []
claims = {}
start = time.time()
def searchPapers_whoosh(year=None, author=None, topic=None, userQuery=None):

    # Open the existing index
    import whoosh.index as index
    import nltk
    nltk.download('wordnet')
    from nltk.stem.wordnet import WordNetLemmatizer
    lemma = WordNetLemmatizer()
    userQuery = " ".join(
        lemma.lemmatize(word, 'n') for word in userQuery.split())
    userQuery = " ".join(
        lemma.lemmatize(word, 'v') for word in userQuery.split())
    index_dir = "../index"

    ix = index.open_dir(index_dir)

    if topic == 'All the topics':
        topic = None
    if year == 'All the years':
        year = None
    # Parse with filter on fields
    from whoosh import query
    from whoosh import qparser
    from whoosh.qparser import QueryParser
    from whoosh.qparser import MultifieldParser

    with ix.searcher() as s:
        if (not userQuery):
            qp = QueryParser("id", schema=ix.schema)
            user_q = qp.parse("*")

        else:
            # 0 = importance to documents with one of the terms
            # 1 = importance to documents with all of the terms
            og = qparser.OrGroup.factory(0.8)

            # search both in title and text
            mparser = MultifieldParser(["title", "paper_text"],
                                       schema=ix.schema,
                                       group=og)
            user_q = mparser.parse(userQuery)

        # Filter results for fields
        allow_q = query.NullQuery

        if (year):
            allow_q = allow_q & query.Term("year", year)

        if (author):
            formattedAuthors = author.lower().split()
            for fa in formattedAuthors:
                fa = "*" + fa + "*"
                allow_q = allow_q & query.Wildcard("authors", fa)

        if (topic):
            topicParser = qparser.QueryParser("topic", ix.schema)
            allow_q = allow_q & topicParser.parse('"' + topic + '"')

        if (not year and not author and not topic):
            results = s.search(user_q, limit=50)
        else:
            results = s.search(user_q, filter=allow_q, limit=50)

        papers = []
        for result in results:
            papers.extend([int(result['id'])])
        return papers
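
`qparser.OrGroup.factory(0.8)` above builds an Or grouping whose factor rewards documents matching more of the query terms: 0 behaves like a plain OR, and values near 1 approach an all-terms requirement. A compact sketch of the same parser setup, assuming the `ix` opened above:

from whoosh import qparser
from whoosh.qparser import MultifieldParser

og = qparser.OrGroup.factory(0.9)  # closer to 1.0 -> stronger all-terms bias
mparser = MultifieldParser(["title", "paper_text"], schema=ix.schema, group=og)
print(mparser.parse("neural networks"))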
Exemplo n.º 44
0
 def filter_queryset(self, request, queryset, view):
     if ('parent' in request.query_params
             and request.query_params['parent'] == ''):
         # Empty string means query for null parent
         queryset = queryset.filter(parent=None)
     try:
         q = request.query_params['q']
     except KeyError:
         return queryset
     # Short-circuit some commonly used queries
     COMMON_QUERY_TO_ORM_FILTER = {
         'asset_type:block': {
             'asset_type': 'block'
         },
         'asset_type:question': {
             'asset_type': 'question'
         },
         'asset_type:survey': {
             'asset_type': 'survey'
         },
         'asset_type:question OR asset_type:block': {
             'asset_type__in': ('question', 'block')
         }
     }
     try:
         return queryset.filter(**COMMON_QUERY_TO_ORM_FILTER[q])
     except KeyError:
         # We don't know how to short-circuit this query; pass it along to
         # the search engine
         pass
     except FieldError:
         # The user passed a query we recognized as commonly-used, but the
         # field was invalid for the requested model
         return queryset.none()
     queryset_pks = list(queryset.values_list('pk', flat=True))
     if not len(queryset_pks):
         return queryset
     # 'q' means do a full-text search of the document fields, where the
     # critera are given in the Whoosh query language:
     # https://pythonhosted.org/Whoosh/querylang.html
     search_queryset = SearchQuerySet().models(queryset.model)
     search_backend = search_queryset.query.backend
     if not isinstance(search_backend, WhooshSearchBackend):
         raise NotImplementedError(
             'Only the Whoosh search engine is supported at this time')
     if not search_backend.setup_complete:
         search_backend.setup()
     # Parse the user's query
     user_query = QueryParser('text', search_backend.index.schema).parse(q)
     # Construct a query to restrict the search to the appropriate model
     filter_query = Term(DJANGO_CT, get_model_ct(queryset.model))
     # Does the search index for this model have a field that allows
     # filtering by permissions?
     haystack_index = haystack.connections['default'].get_unified_index(
     ).get_index(queryset.model)
     if hasattr(haystack_index, 'users_granted_permission'):
         # Also restrict the search to records that the user can access
         filter_query &= Term('users_granted_permission',
                              request.user.username)
     with search_backend.index.searcher() as searcher:
         results = searcher.search(user_query,
                                   filter=filter_query,
                                   scored=False,
                                   sortedby=None,
                                   limit=None)
         if not results:
             # We got nothing; is the search index even valid?
             if not searcher.search(filter_query, limit=1):
             # There's not a single entry in the search index for this
                 # model; assume the index is invalid and return the
                 # queryset untouched
                 return queryset
         pk_type = type(queryset_pks[0])
         results_pks = {
             # Coerce each `django_id` from unicode to the appropriate type,
             # usually `int`
             pk_type(x['django_id'])
             for x in results
         }
     filter_pks = results_pks.intersection(queryset_pks)
     return queryset.filter(pk__in=filter_pks)
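
Whoosh's `filter=` argument takes any query object, so restrictions compose with `&`. A reduced sketch of the pattern above, reusing the snippet's `search_backend` and `user_query` (the model label and username are illustrative):

from whoosh.query import Term

filter_query = (Term('django_ct', 'myapp.asset')
                & Term('users_granted_permission', 'alice'))
with search_backend.index.searcher() as searcher:
    results = searcher.search(user_query, filter=filter_query,
                              scored=False, sortedby=None, limit=None)
    matching_pks = {hit['django_id'] for hit in results}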
Exemplo n.º 45
0
# # Index construction: a basic path-based index
# writer = ix.writer()
# count = 0
# # Walk the root directory and index the content of each text file
# for root, dirs, files in os.walk(file_path, topdown=True):
#     for file in files:
#         path_t = os.path.join(root, file)
#         if path_t.split('.')[-1] != 'txt' or path_t.split('\\')[-1] =='index.txt':
#             continue
#         print("=======>"+path_t,file)
#         f = open( path_t, 'r', encoding='UTF-8')
#         content = ''
#         for line in f:
#             content = content + line
#         writer.add_document(title=file, content=content, path= path_t)
#         count =count+1
# writer.commit()
# print("==========> %d files indexed in total" % count)




# Query construction: check that the index we built works

from whoosh.qparser import QueryParser
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse("信息检索")  # "information retrieval"
    result = searcher.search(query)
    for item in result:
        print(item)
    print(len(result))
Exemplo n.º 46
0
 def _query_projects(self, searcher, querystring, page=1):
     parser = QueryParser("text",
                          self.project_ix.schema,
                          plugins=self._query_parser_plugins())
     query = parser.parse(querystring)
     return self._search_projects(searcher, query, page=page)
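
`_query_projects` threads a custom plugin list into QueryParser. A sketch with a few of Whoosh's stock plugins, assuming an open index `ix` with a `text` field (this plugin selection is illustrative, not the project's actual `_query_parser_plugins`):

from whoosh import qparser
from whoosh.qparser import QueryParser

parser = QueryParser("text", ix.schema, plugins=[
    qparser.WhitespacePlugin(),  # split the query on whitespace
    qparser.PhrasePlugin(),      # support "quoted phrases"
    qparser.FieldsPlugin(),      # support field:value syntax
])
print(parser.parse('title:whoosh "full text"'))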
path = "../data/FranchiseList.txt"
file = open(path, "r")
fileContent = str(file.read())
file.close()

lines = fileContent.split("\n")

# Index the documents
writer = ix.writer()
for l in lines:
    writer.add_document(Name=l)
    print(l)
writer.commit()

with ix.searcher() as searcher:
    query = QueryParser("Name", ix.schema).parse(u'Zero')
    results = searcher.search(query)

    # Results
    found = results.scored_length()
    if results.has_exact_length():
        print("Scored", found, "of exactly", len(results), "documents")
    else:
        low = results.estimated_min_length()
        high = results.estimated_length()

        print("Scored", found, "of between", low, "and", high, "documents")

    for r in results:
        print(r)
Exemplo n.º 48
0
from functools import reduce

import whoosh.index as index
import json

import helpers
from index_posts import schema
from whoosh.qparser import QueryParser

ix = index.open_dir("indexdir")
searcher = ix.searcher()

qp = QueryParser('tokens', schema=schema)

if __name__ == "__main__":
    successes = dict(zip(range(0, 20), [0] * 20))
    with open('../dataset.json') as dataset_file:
        for (post_id, post) in json.load(dataset_file).items():
            q = qp.parse(' '.join(
                helpers.preprocess_text(post['Children'][0]['Title'])))
            print(q)
            print(post['Children'][0]['Title'])
            print(post['Title'])
            results = searcher.search(q, limit=20)
            if len(results) > 0:
                # Use "rank" to avoid shadowing the whoosh.index import above.
                for rank, result in enumerate(results):
                    if result['post_id'] == post_id:
                        successes[rank] += 1
                        print(result['title'])
                        print('Success: True')
                        break
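
The `successes` dict above counts, per rank, how often the true parent post surfaced in the top 20. A short follow-up sketch that folds those counts into cumulative success@k figures:

# Among the posts that were found at all, how many appeared within the
# top k results?
found_total = sum(successes.values())
cumulative = 0
for rank in range(20):
    cumulative += successes[rank]
    print("success@%d: %d/%d" % (rank + 1, cumulative, found_total))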
Exemplo n.º 49
0
class WhooshSearchBackend(BaseSearchBackend):
    # Words reserved by Whoosh for special use.
    RESERVED_WORDS = ("AND", "NOT", "OR", "TO")

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        "\\",
        "+",
        "-",
        "&&",
        "||",
        "!",
        "(",
        ")",
        "{",
        "}",
        "[",
        "]",
        "^",
        '"',
        "~",
        "*",
        "?",
        ":",
        ".",
    )

    def __init__(self, connection_alias, **connection_options):
        super().__init__(connection_alias, **connection_options)
        self.setup_complete = False
        self.use_file_storage = True
        # connection_options is a dict, so use .get(); getattr() on a dict
        # would always fall back to the default.
        self.post_limit = connection_options.get("POST_LIMIT", 128 * 1024 * 1024)
        self.path = connection_options.get("PATH")

        if connection_options.get("STORAGE", "file") != "file":
            self.use_file_storage = False

        if self.use_file_storage and not self.path:
            raise ImproperlyConfigured(
                "You must specify a 'PATH' in your settings for connection '%s'."
                % connection_alias
            )

        self.log = logging.getLogger("haystack")

    def setup(self):
        """
        Defers loading until needed.
        """
        from haystack import connections

        new_index = False

        # Make sure the index is there.
        if self.use_file_storage and not os.path.exists(self.path):
            os.makedirs(self.path)
            new_index = True

        if self.use_file_storage and not os.access(self.path, os.W_OK):
            raise IOError(
                "The path to your Whoosh index '%s' is not writable for the current user/group."
                % self.path
            )

        if self.use_file_storage:
            self.storage = FileStorage(self.path)
        else:
            global LOCALS

            if getattr(LOCALS, "RAM_STORE", None) is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        self.content_field_name, self.schema = self.build_schema(
            connections[self.connection_alias].get_unified_index().all_searchfields()
        )
        self.parser = QueryParser(self.content_field_name, schema=self.schema)
        self.parser.add_plugins([FuzzyTermPlugin])

        if new_index is True:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ""

        for _, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(
                        stored=True, field_boost=field_class.boost
                    )
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(
                        stored=True,
                        commas=True,
                        scorable=True,
                        field_boost=field_class.boost,
                    )
            elif field_class.field_type in ["date", "datetime"]:
                schema_fields[field_class.index_fieldname] = DATETIME(
                    stored=field_class.stored, sortable=True
                )
            elif field_class.field_type == "integer":
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=int,
                    field_boost=field_class.boost,
                )
            elif field_class.field_type == "float":
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=float,
                    field_boost=field_class.boost,
                )
            elif field_class.field_type == "boolean":
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(
                    stored=field_class.stored
                )
            elif field_class.field_type == "ngram":
                schema_fields[field_class.index_fieldname] = NGRAM(
                    minsize=3,
                    maxsize=15,
                    stored=field_class.stored,
                    field_boost=field_class.boost,
                )
            elif field_class.field_type == "edge_ngram":
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                    minsize=2,
                    maxsize=15,
                    at="start",
                    stored=field_class.stored,
                    field_boost=field_class.boost,
                )
            else:
                schema_fields[field_class.index_fieldname] = TEXT(
                    stored=True,
                    analyzer=ChineseAnalyzer(),
                    field_boost=field_class.boost,
                    sortable=True,
                )

            if field_class.document is True:
                content_field_name = field_class.index_fieldname
                schema_fields[field_class.index_fieldname].spelling = True

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))
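
    # Illustrative mapping (assumed field definitions): a Haystack
    # CharField(document=True, index_fieldname="text") lands in the TEXT
    # branch above, is recorded as content_field_name, and gets
    # spelling=True; a DateTimeField becomes DATETIME(sortable=True).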

    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            try:
                doc = index.full_prepare(obj)
            except SkipDocument:
                self.log.debug("Indexing for object `%s` skipped", obj)
            else:
                # Really make sure it's unicode, because Whoosh won't have it any
                # other way.
                for key in doc:
                    doc[key] = self._from_python(doc[key])

                # Document boosts aren't supported in Whoosh 2.5.0+.
                if "boost" in doc:
                    del doc["boost"]

                try:
                    writer.update_document(**doc)
                except Exception as e:
                    if not self.silently_fail:
                        raise

                    # We'll log the object identifier but won't include the actual object
                    # to avoid the possibility of that generating encoding errors while
                    # processing the log message:
                    self.log.error(
                        "%s while preparing object for update" % e.__class__.__name__,
                        exc_info=True,
                        extra={"data": {"index": index, "object": get_identifier(obj)}},
                    )

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()
            if writer.ident is not None:
                writer.join()

    def remove(self, obj_or_string, commit=True):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        whoosh_id = get_identifier(obj_or_string)

        try:
            self.index.delete_by_query(q=self.parser.parse('%s:"%s"' % (ID, whoosh_id)))
        except Exception as e:
            if not self.silently_fail:
                raise

            self.log.error(
                "Failed to remove document '%s' from Whoosh: %s",
                whoosh_id,
                e,
                exc_info=True,
            )

    def clear(self, models=None, commit=True):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()

        if models is not None:
            assert isinstance(models, (list, tuple))

        try:
            if models is None:
                self.delete_index()
            else:
                models_to_delete = []

                for model in models:
                    models_to_delete.append("%s:%s" % (DJANGO_CT, get_model_ct(model)))

                self.index.delete_by_query(
                    q=self.parser.parse(" OR ".join(models_to_delete))
                )
        except Exception as e:
            if not self.silently_fail:
                raise

            if models is not None:
                self.log.error(
                    "Failed to clear Whoosh index of models '%s': %s",
                    ",".join(models_to_delete),
                    e,
                    exc_info=True,
                )
            else:
                self.log.error("Failed to clear Whoosh index: %s", e, exc_info=True)

    def delete_index(self):
        # Per the Whoosh mailing list, if wiping out everything from the index,
        # it's much more efficient to simply delete the index files.
        if self.use_file_storage and os.path.exists(self.path):
            shutil.rmtree(self.path)
        elif not self.use_file_storage:
            self.storage.clean()

        # Recreate everything.
        self.setup()

    def optimize(self):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        self.index.optimize()

    def calculate_page(self, start_offset=0, end_offset=None):
        # Guard against Whoosh throwing an error: it requires an end_offset
        # greater than 0.
        if end_offset is not None and end_offset <= 0:
            end_offset = 1

        # Determine the page.
        page_num = 0

        if end_offset is None:
            end_offset = 1000000

        if start_offset is None:
            start_offset = 0

        page_length = end_offset - start_offset

        if page_length and page_length > 0:
            page_num = int(start_offset / page_length)

        # Increment because Whoosh uses 1-based page numbers.
        page_num += 1
        return page_num, page_length
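
    # Worked example: start_offset=20, end_offset=30 gives page_length = 10
    # and page_num = int(20 / 10) + 1 = 3, i.e. Whoosh's third page of ten.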

    @log_query
    def search(
        self,
        query_string,
        sort_by=None,
        start_offset=0,
        end_offset=None,
        fields="",
        highlight=False,
        facets=None,
        date_facets=None,
        query_facets=None,
        narrow_queries=None,
        spelling_query=None,
        within=None,
        dwithin=None,
        distance_point=None,
        models=None,
        limit_to_registered_models=None,
        result_class=None,
        **kwargs
    ):
        if not self.setup_complete:
            self.setup()

        # A zero length query should return no results.
        if len(query_string) == 0:
            return {"results": [], "hits": 0}

        query_string = force_str(query_string)

        # A one-character query (non-wildcard) gets nabbed by a stopwords
        # filter and should yield zero results.
        if len(query_string) <= 1 and query_string != "*":
            return {"results": [], "hits": 0}

        reverse = False

        if sort_by is not None:
            # Determine if we need to reverse the results and if Whoosh can
            # handle what it's being asked to sort by. Reversing is an
            # all-or-nothing action, unfortunately.
            sort_by_list = []
            reverse_counter = 0

            for order_by in sort_by:
                if order_by.startswith("-"):
                    reverse_counter += 1

            if reverse_counter and reverse_counter != len(sort_by):
                raise SearchBackendError(
                    "Whoosh requires all order_by fields"
                    " to use the same sort direction"
                )

            for order_by in sort_by:
                if order_by.startswith("-"):
                    sort_by_list.append(order_by[1:])

                    if len(sort_by_list) == 1:
                        reverse = True
                else:
                    sort_by_list.append(order_by)

                    if len(sort_by_list) == 1:
                        reverse = False

            sort_by = sort_by_list
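            # e.g. sort_by=["-pub_date", "-id"] is normalized to
            # ["pub_date", "id"] with reverse=True; mixed directions were
            # rejected above.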

        group_by = []
        facet_types = {}
        if facets is not None:
            group_by += [
                FieldFacet(facet, allow_overlap=True, maptype=Count) for facet in facets
            ]
            facet_types.update({facet: "fields" for facet in facets})

        if date_facets is not None:
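            # date_facets is expected to look like (hypothetical field name):
            #   {"pub_date": {"start_date": date(2009, 1, 1),
            #                 "end_date": date(2010, 1, 1),
            #                 "gap_by": "month", "gap_amount": 1}}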

            def _fixup_datetime(dt):
                if isinstance(dt, datetime):
                    return dt
                if isinstance(dt, date):
                    return datetime(dt.year, dt.month, dt.day)
                raise ValueError("start_date/end_date must be date or datetime objects")

            for key, value in date_facets.items():
                start = _fixup_datetime(value["start_date"])
                end = _fixup_datetime(value["end_date"])
                gap_by = value["gap_by"]
                gap_amount = value.get("gap_amount", 1)
                gap = RelativeDelta(**{"%ss" % gap_by: gap_amount})
                group_by.append(DateRangeFacet(key, start, end, gap, maptype=Count))
                facet_types[key] = "dates"

        if query_facets is not None:
            warnings.warn(
                "Whoosh does not handle query faceting.", Warning, stacklevel=2
            )

        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, "HAYSTACK_LIMIT_TO_REGISTERED_MODELS", True
            )

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add(
                " OR ".join(["%s:%s" % (DJANGO_CT, rm) for rm in model_choices])
            )

        narrow_searcher = None

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(
                    self.parser.parse(force_str(nq)), limit=None
                )

                if len(recent_narrowed_results) <= 0:
                    return {"results": [], "hits": 0}

                if narrowed_results is not None:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        self.index = self.index.refresh()

        if self.index.doc_count():
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query_string)

            # In the event of an invalid/stopworded query, recover gracefully.
            if parsed_query is None:
                return {"results": [], "hits": 0}

            page_num, page_length = self.calculate_page(start_offset, end_offset)

            search_kwargs = {
                "pagelen": page_length,
                "sortedby": sort_by,
                "reverse": reverse,
                "groupedby": group_by,
            }

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None:
                search_kwargs["filter"] = narrowed_results

            try:
                raw_page = searcher.search_page(parsed_query, page_num, **search_kwargs)
            except ValueError:
                if not self.silently_fail:
                    raise

                return {"results": [], "hits": 0, "spelling_suggestion": None}

            # Guard against Whoosh 2.5.1 returning the wrong page of results
            # when the requested page is past the end. :(
            if raw_page.pagenum < page_num:
                return {"results": [], "hits": 0, "spelling_suggestion": None}

            results = self._process_results(
                raw_page,
                highlight=highlight,
                query_string=query_string,
                spelling_query=spelling_query,
                result_class=result_class,
                facet_types=facet_types,
            )
            searcher.close()

            if hasattr(narrow_searcher, "close"):
                narrow_searcher.close()

            return results
        else:
            if self.include_spelling:
                if spelling_query:
                    spelling_suggestion = self.create_spelling_suggestion(
                        spelling_query
                    )
                else:
                    spelling_suggestion = self.create_spelling_suggestion(query_string)
            else:
                spelling_suggestion = None

            return {
                "results": [],
                "hits": 0,
                "spelling_suggestion": spelling_suggestion,
            }

    def more_like_this(
        self,
        model_instance,
        additional_query_string=None,
        start_offset=0,
        end_offset=None,
        models=None,
        limit_to_registered_models=None,
        result_class=None,
        **kwargs
    ):
        if not self.setup_complete:
            self.setup()

        field_name = self.content_field_name
        narrow_queries = set()
        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, "HAYSTACK_LIMIT_TO_REGISTERED_MODELS", True
            )

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add(
                " OR ".join(["%s:%s" % (DJANGO_CT, rm) for rm in model_choices])
            )

        if additional_query_string and additional_query_string != "*":
            narrow_queries.add(additional_query_string)

        narrow_searcher = None

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(
                    self.parser.parse(force_str(nq)), limit=None
                )

                if len(recent_narrowed_results) <= 0:
                    return {"results": [], "hits": 0}

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        page_num, page_length = self.calculate_page(start_offset, end_offset)

        self.index = self.index.refresh()
        raw_results = EmptyResults()

        searcher = None
        if self.index.doc_count():
            query = "%s:%s" % (ID, get_identifier(model_instance))
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query)
            results = searcher.search(parsed_query)

            if len(results):
                raw_results = results[0].more_like_this(field_name, top=end_offset)

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None and hasattr(raw_results, "filter"):
                raw_results.filter(narrowed_results)

        try:
            raw_page = ResultsPage(raw_results, page_num, page_length)
        except ValueError:
            if not self.silently_fail:
                raise

            return {"results": [], "hits": 0, "spelling_suggestion": None}

        # Guard against Whoosh 2.5.1 returning the wrong page of results
        # when the requested page is past the end. :(
        if raw_page.pagenum < page_num:
            return {"results": [], "hits": 0, "spelling_suggestion": None}

        results = self._process_results(raw_page, result_class=result_class)

        if searcher:
            searcher.close()

        if hasattr(narrow_searcher, "close"):
            narrow_searcher.close()

        return results

    def _process_results(
        self,
        raw_page,
        highlight=False,
        query_string="",
        spelling_query=None,
        result_class=None,
        facet_types=None,
    ):
        from haystack import connections

        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)

        if result_class is None:
            result_class = SearchResult

        spelling_suggestion = None
        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        facets = {}

        if facet_types:
            facets = {
                "fields": {},
                "dates": {},
                "queries": {},
            }
            for facet_fieldname in raw_page.results.facet_names():
                group = raw_page.results.groups(facet_fieldname)
                facet_type = facet_types[facet_fieldname]

                # Extract None item for later processing, if present.
                none_item = group.pop(None, None)

                lst = facets[facet_type][facet_fieldname] = sorted(
                    group.items(), key=(lambda itm: (-itm[1], itm[0]))
                )

                if none_item is not None:
                    # Inject None item back into the results.
                    none_entry = (None, none_item)
                    if not lst or lst[-1][1] >= none_item:
                        lst.append(none_entry)
                    else:
                        for i, value in enumerate(lst):
                            if value[1] < none_item:
                                lst.insert(i, none_entry)
                                break

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result[DJANGO_CT].split(".")
            additional_fields = {}
            model = haystack_get_model(app_label, model_name)

            if model and model in indexed_models:
                for key, value in raw_result.items():
                    index = unified_index.get_index(model)
                    string_key = str(key)

                    if string_key in index.fields and hasattr(
                        index.fields[string_key], "convert"
                    ):
                        # Special-cased due to the nature of KEYWORD fields.
                        if index.fields[string_key].is_multivalued:
                            if value is None or len(value) == 0:
                                additional_fields[string_key] = []
                            else:
                                additional_fields[string_key] = value.split(",")
                        else:
                            additional_fields[string_key] = index.fields[
                                string_key
                            ].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)

                del additional_fields[DJANGO_CT]
                del additional_fields[DJANGO_ID]

                if highlight:
                    sa = StemmingAnalyzer()
                    formatter = WhooshHtmlFormatter("em")
                    terms = [token.text for token in sa(query_string)]

                    whoosh_result = whoosh_highlight(
                        additional_fields.get(self.content_field_name),
                        terms,
                        sa,
                        ContextFragmenter(),
                        formatter,
                    )
                    additional_fields["highlighted"] = {
                        self.content_field_name: [whoosh_result]
                    }

                result = result_class(
                    app_label,
                    model_name,
                    raw_result[DJANGO_ID],
                    score,
                    **additional_fields
                )
                results.append(result)
            else:
                hits -= 1

        if self.include_spelling:
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(query_string)

        return {
            "results": results,
            "hits": hits,
            "facets": facets,
            "spelling_suggestion": spelling_suggestion,
        }

    def create_spelling_suggestion(self, query_string):
        spelling_suggestion = None
        reader = self.index.reader()
        corrector = reader.corrector(self.content_field_name)
        cleaned_query = force_str(query_string)

        if not query_string:
            return spelling_suggestion

        # Clean the string.
        for rev_word in self.RESERVED_WORDS:
            cleaned_query = cleaned_query.replace(rev_word, "")

        for rev_char in self.RESERVED_CHARACTERS:
            cleaned_query = cleaned_query.replace(rev_char, "")

        # Break it down.
        query_words = cleaned_query.split()
        suggested_words = []

        for word in query_words:
            suggestions = corrector.suggest(word, limit=1)

            if len(suggestions) > 0:
                suggested_words.append(suggestions[0])

        spelling_suggestion = " ".join(suggested_words)
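        # e.g. a misspelled query like "pythn serch" would typically come back
        # as "python search", assuming those words occur in the indexed content.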
        return spelling_suggestion

    def _from_python(self, value):
        """
        Converts Python values to a string for Whoosh.

        Code courtesy of pysolr.
        """
        if hasattr(value, "strftime"):
            if not hasattr(value, "hour"):
                value = datetime(value.year, value.month, value.day, 0, 0, 0)
        elif isinstance(value, bool):
            if value:
                value = "true"
            else:
                value = "false"
        elif isinstance(value, (list, tuple)):
            value = ",".join([force_str(v) for v in value])
        elif isinstance(value, (int, float)):
            # Leave it alone.
            pass
        else:
            value = force_str(value)
        return value

    def _to_python(self, value):
        """
        Converts values from Whoosh to native Python values.

        A port of the same method in pysolr, as they deal with data the same way.
        """
        if value == "true":
            return True
        elif value == "false":
            return False

        if value and isinstance(value, str):
            possible_datetime = DATETIME_REGEX.search(value)

            if possible_datetime:
                date_values = possible_datetime.groupdict()

                for dk, dv in date_values.items():
                    date_values[dk] = int(dv)

                return datetime(
                    date_values["year"],
                    date_values["month"],
                    date_values["day"],
                    date_values["hour"],
                    date_values["minute"],
                    date_values["second"],
                )

        try:
            # Attempt to use json to load the values.
            converted_value = json.loads(value)

            # Try to handle most built-in types.
            if isinstance(
                converted_value,
                (list, tuple, set, dict, int, float, complex),
            ):
                return converted_value
        except Exception:
            # If it fails (SyntaxError or its ilk) or we don't trust it,
            # continue on.
            pass

        return value
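
A minimal round trip through the two converters above (a sketch; "backend"
stands in for a configured WhooshSearchBackend instance):

    backend._from_python(True)                 # 'true'
    backend._from_python(['a', 'b'])           # 'a,b'
    backend._to_python('false')                # False
    backend._to_python('2009-02-25T00:00:00')  # datetime(2009, 2, 25, 0, 0)
    backend._to_python('[1, 2, 3]')            # [1, 2, 3]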
Exemplo n.º 50
0
import os

import jieba
from whoosh import fields, index

lines = [
    # (truncated in the source; the input strings to be indexed go here)
]

schema = fields.Schema(keyword=fields.TEXT(stored=True),
                       index=fields.ID(stored=True),
                       content=fields.TEXT(stored=True))

if not os.path.exists("index"):
    os.mkdir('index')

ix = index.create_in('index', schema)  # create (or overwrite) the index
ix = index.open_dir('index')  # reopening immediately is redundant but harmless

writer = ix.writer()
# writer.add_document(keyword='my document', content='this is my document')
# writer.add_document(keyword='my second document', content='this is my second document')

for li, line in enumerate(lines):
    for word in list(jieba.cut(line)):
        print(word)
        writer.add_document(keyword=word, content=line, index=str(li))

writer.commit()

from whoosh.qparser import QueryParser
with ix.searcher() as searcher:
    query = QueryParser('keyword', ix.schema).parse('老人')
    result = searcher.search(query)
    # print("=======================" + result)
    for res in result:
        print(res)
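
A lighter-weight alternative to adding one document per segmented word is
jieba's bundled Whoosh analyzer, which lets Whoosh tokenize Chinese text at
index time (a sketch; assumes the jieba package is installed):

    from jieba.analyse import ChineseAnalyzer
    from whoosh import fields

    schema = fields.Schema(content=fields.TEXT(stored=True,
                                               analyzer=ChineseAnalyzer()))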
Exemplo n.º 51
0
    args_dict = comand_line_set()
    index_path = args_dict.get("indexpath")
    logpath = args_dict.get("logpath")
    user_dict_path = args_dict.get("userdictpath")
    comp_dict_path = args_dict.get("compdictpath")
    stop_word_path = args_dict.get("stopwordpath")
    modelpath = args_dict.get("modelpath")
    get_title_number = args_dict.get("titlenumber")
    get_similar_number = args_dict.get("similarnumber")

    gl.write_log(logpath, 'info', "\n\n")
    loginfo = ' word retrieval service starting...'
    gl.write_log(logpath, 'info', loginfo)

    # preload dicts to save running time
    tml.load_dicts(user_dict_path, logpath)
    tml.load_dicts(comp_dict_path, logpath)
    stopwords = tml.get_stopwords(stop_word_path, logpath)

    ix = open_dir(index_path)  # for read only
    index_searcher = ix.searcher()
    query_parser = QueryParser("segwords", schema=ix.schema)
    loginfo = ' inverted index file %s has been opened.' % index_path
    gl.write_log(logpath, 'info', loginfo)

    # preload similar model to save running time
    similar_model = tws.load_wordVectors(modelpath)

    # port number should be changed when deploying
    app.run(debug=True, host='0.0.0.0', port=8888)
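
The preloaded index_searcher and query_parser would then be used inside a
request handler along these lines (a hypothetical Flask-style handler; the
route, the json import, and the stored-field access are all assumptions):

    @app.route('/retrieve/<term>')
    def retrieve(term):
        q = query_parser.parse(term)
        hits = index_searcher.search(q, limit=int(get_title_number))
        return json.dumps([hit.fields() for hit in hits])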
Exemplo n.º 52
0
class WhooshBoostBackendTestCase(TestCase):
    def setUp(self):
        super(WhooshBoostBackendTestCase, self).setUp()

        # Stow.
        temp_path = os.path.join('tmp', 'test_whoosh_query')
        self.old_whoosh_path = settings.HAYSTACK_CONNECTIONS['whoosh']['PATH']
        settings.HAYSTACK_CONNECTIONS['whoosh']['PATH'] = temp_path

        self.old_ui = connections['whoosh'].get_unified_index()
        self.ui = UnifiedIndex()
        self.wmmi = WhooshBoostMockSearchIndex()
        self.ui.build(indexes=[self.wmmi])
        self.sb = connections['whoosh'].get_backend()
        connections['whoosh']._index = self.ui

        self.sb.setup()
        self.raw_whoosh = self.sb.index
        self.parser = QueryParser(self.sb.content_field_name,
                                  schema=self.sb.schema)
        self.sb.delete_index()
        self.sample_objs = []

        for i in range(1, 5):
            mock = AFourthMockModel()
            mock.id = i

            if i % 2:
                mock.author = 'daniel'
                mock.editor = 'david'
            else:
                mock.author = 'david'
                mock.editor = 'daniel'

            mock.pub_date = date(2009, 2, 25) - timedelta(days=i)
            self.sample_objs.append(mock)

    def tearDown(self):
        if os.path.exists(settings.HAYSTACK_CONNECTIONS['whoosh']['PATH']):
            shutil.rmtree(settings.HAYSTACK_CONNECTIONS['whoosh']['PATH'])

        settings.HAYSTACK_CONNECTIONS['whoosh']['PATH'] = self.old_whoosh_path
        connections['whoosh']._index = self.ui
        super(WhooshBoostBackendTestCase, self).tearDown()

    @unittest.expectedFailure
    def test_boost(self):
        self.sb.update(self.wmmi, self.sample_objs)
        self.raw_whoosh = self.raw_whoosh.refresh()
        searcher = self.raw_whoosh.searcher()
        self.assertEqual(
            len(searcher.search(self.parser.parse(u'*'), limit=1000)), 2)

        results = SearchQuerySet('whoosh').filter(
            SQ(author='daniel') | SQ(editor='daniel'))

        self.assertEqual([result.id for result in results], [
            'core.afourthmockmodel.1',
            'core.afourthmockmodel.3',
        ])
        self.assertEqual(results[0].boost, 1.1)
Exemplo n.º 53
0
# imports
from whoosh.index import open_dir
from whoosh.qparser import QueryParser
from pprint import pprint
from whoosh.fields import Schema, TEXT

# search
busca = input("Busca:")
query = busca.split(" ")
print(query)

try:
    schema = Schema(titulo=TEXT(stored=True),
                    conteudo_tkn=TEXT(stored=True),
                    conteudo_full=TEXT(stored=True))
    parser = QueryParser("conteudo_tkn", schema)
    # Join the user's terms with OR so a document matching any of them hits.
    myquery = parser.parse(" OR ".join(query))

    ix = open_dir("whoosh_index")
    with ix.searcher() as searcher:
        results = searcher.search(myquery, terms=True)
        print("Retrieved: ", len(results), ' documents!')
        for ri in results:
            print('score:', ri.score, 'of document:', ri.docnum)
except Exception as exc:
    # Report the failure and exit cleanly.
    print("Search failed:", exc)

print("Search: done. :D")
Exemplo n.º 54
0
# coding=utf-8
from whoosh import scoring
from whoosh.index import open_dir
from whoosh.qparser import QueryParser
from whoosh import query

idx_dir = 'lagou_idx'
ix = open_dir(idx_dir)
searcher = ix.searcher()

from whoosh import qparser
og = qparser.OrGroup.factory(0.9)
parser = QueryParser("desc", schema=ix.schema, group=og)

# Single field parser.
k = u'Java Python city:上海'
q = parser.parse(k)

results = searcher.search_page(q, 1, pagelen=5)

print(u'{0} results found for keyword {1}, {2} returned: '.format(
    len(results), k, results.scored_length()))
for hit in results[:50]:
    print(hit['id'])
    print(hit['name'])
    # print(hit['city'])
    print(hit['com_name'])
    print('************')
Exemplo n.º 55
0
def do_repository():
    type = request.query.get('type')
    view = request.query.get('view')

    if view == 'property':
        ix = index.open_dir(index_path1)
    elif view == 'seed':
        ix = index.open_dir(index_path2)
    else:
        return HTTPResponse(status=400, body="Invalid Input")

    query = QueryParser('target', schema=ix.schema)

    def collect(target):
        # Run one query against the open index and flatten every hit into a
        # plain dict so the result is JSON-serializable.
        q = query.parse(target)
        with ix.searcher() as s:
            return [{
                "query_id": hit["query_id"],
                "content": hit["content"],
                "graph": hit["network_name"],
                "results": hit["results"],
                "target": hit["target"],
            } for hit in s.search(q, limit=None)]

    if type.lower() == 'node':
        return json.dumps({"node_queries": collect('node')})
    elif type.lower() == 'edge':
        return json.dumps({"edge_queries": collect('edge')})
    elif type.lower() == 'all':
        return json.dumps({
            "node_queries": collect('node'),
            "edge_queries": collect('edge'),
        })
    else:
        return HTTPResponse(status=400, body="Invalid Input")
Exemplo n.º 56
0
    def index(self, repo_name=None):
        c.repo_name = repo_name
        c.formated_results = []
        c.runtime = ''
        c.cur_query = request.GET.get('q', None)
        c.cur_type = request.GET.get('type', 'content')
        c.cur_search = search_type = {
            'content': 'content',
            'commit': 'message',
            'path': 'path',
            'repository': 'repository'
        }.get(c.cur_type, 'content')

        index_name = {
            'content': IDX_NAME,
            'commit': CHGSET_IDX_NAME,
            'path': IDX_NAME
        }.get(c.cur_type, IDX_NAME)

        schema_defn = {
            'content': SCHEMA,
            'commit': CHGSETS_SCHEMA,
            'path': SCHEMA
        }.get(c.cur_type, SCHEMA)

        log.debug('IDX: %s' % index_name)
        log.debug('SCHEMA: %s' % schema_defn)

        if c.cur_query:
            cur_query = c.cur_query.lower()
            log.debug(cur_query)

        if c.cur_query:
            p = safe_int(request.GET.get('page', 1), 1)
            highlight_items = set()
            try:
                idx = open_dir(config['app_conf']['index_dir'],
                               indexname=index_name)
                searcher = idx.searcher()

                qp = QueryParser(search_type, schema=schema_defn)
                if c.repo_name:
                    cur_query = u'repository:%s %s' % (c.repo_name, cur_query)
                try:
                    query = qp.parse(unicode(cur_query))
                    # extract words for highlight
                    if isinstance(query, Phrase):
                        highlight_items.update(query.words)
                    elif isinstance(query, Prefix):
                        highlight_items.add(query.text)
                    else:
                        for i in query.all_terms():
                            if i[0] in ['content', 'message']:
                                highlight_items.add(i[1])

                    matcher = query.matcher(searcher)

                    log.debug('query: %s' % query)
                    log.debug('hl terms: %s' % highlight_items)
                    results = searcher.search(query)
                    res_ln = len(results)
                    c.runtime = '%s results (%.3f seconds)' % (res_ln,
                                                               results.runtime)

                    def url_generator(**kw):
                        q = urllib.quote(safe_str(c.cur_query))
                        return update_params("?q=%s&type=%s" \
                        % (q, safe_str(c.cur_type)), **kw)

                    repo_location = RepoModel().repos_path
                    c.formated_results = Page(WhooshResultWrapper(
                        search_type, searcher, matcher, highlight_items,
                        repo_location),
                                              page=p,
                                              item_count=res_ln,
                                              items_per_page=10,
                                              url=url_generator)

                except QueryParserError:
                    c.runtime = _('Invalid search query. Try quoting it.')
                searcher.close()
            except (EmptyIndexError, IOError):
                log.error(traceback.format_exc())
                log.error('Empty Index data')
                c.runtime = _('There is no index to search in. '
                              'Please run whoosh indexer')
            except (Exception):
                log.error(traceback.format_exc())
                c.runtime = _('An error occurred during this search operation')

        # Return a rendered template
        return render('/search/search.html')
Exemplo n.º 57
0
import os

from whoosh.index import create_in
from whoosh.fields import *

# create_in() expects the target directory to already exist.
if not os.path.exists('idx'):
    os.mkdir('idx')

schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
ix = create_in('idx', schema)
writer = ix.writer()
writer.add_document(title=u'First Document',
                    path=u'/a',
                    content=u'This is the first document we have added!')
writer.add_document(title=u'Second Document',
                    path=u'/b',
                    content=u'The second one is even more interesting!')
writer.commit()

from whoosh.qparser import QueryParser
with ix.searcher() as searcher:
    query = QueryParser('content', ix.schema).parse('first')
    results = searcher.search(query)
    for hit in results:
        print(hit)
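
Because the search above was not given terms=True, it cannot report which
query terms matched each hit; passing the flag enables that (a small sketch
reusing the same index):

    with ix.searcher() as searcher:
        query = QueryParser('content', ix.schema).parse('first')
        results = searcher.search(query, terms=True)
        for hit in results:
            print(hit, hit.matched_terms())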
Exemplo n.º 58
0
    elif Score == "tf":
        myscore = scoring.Frequency()
    elif Score == "multi":
        myscore = scoring.MultiWeighting(scoring.BM25F(), id=scoring.Frequency(), keys=scoring.TF_IDF())
    else:
        myscore = scoring.BM25F()

    #---------------Input Query----------------------
    schema = Schema(title=TEXT(stored=True, analyzer=StemmingAnalyzer()),
                    content=TEXT(stored=True))
    All_Result = []
    ix = open_dir(InputIndexDir)
    sf = torch.nn.Softmax(dim=0)
    alldata = read_json(input_data_file)
    with ix.searcher(weighting=myscore) as searcher:
        parser = QueryParser("title", ix.schema,group=qparser.OrGroup)
        for item in tqdm(alldata):
            search_result= {}
            for keyword,plau_en_mentions in item['plausible_en_mentions'].items():
                per_uris = []
                per_search_result =[]
                for (word, score) in plau_en_mentions[0:Pivots_N]:
                    query = parser.parse(word)
                    results = SearchQuery(searcher, query, Search_N)
                    hit_score = [hit.score for hit in results]
                    new_score = sf(torch.Tensor(hit_score)).tolist()
                    new_score = [score * s for s in new_score]
                    hit_title = [hit['title'] for hit in results]
                    hit_content = [hit['content'] for hit in results]
                    per_search_result.extend(list(zip(hit_title, hit_content, new_score)))
                for c_result in per_search_result:
Exemplo n.º 59
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# remove an article from the index

import sys
import config

input_xml_file_name = config.input_xml_file_name

sys.path.append('..')
from whoosh.qparser import QueryParser
from whoosh.index import open_dir

if len(sys.argv) > 1:
    article_title = sys.argv[1]
else:
    print "Usage: remove_from_index.py article"
    sys.exit(1)

ix = open_dir("index_dir")

query = QueryParser("title", ix.schema).parse("'%s'" % unicode(article_title))
ix.delete_by_query(query)
ix.writer().commit()
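
The same deletion can also be done through an explicit writer, the more common
Whoosh idiom when batching several changes (a sketch):

    writer = ix.writer()
    writer.delete_by_query(query)
    writer.commit()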
Exemplo n.º 60
0
from whoosh.qparser import QueryParser
from whoosh import scoring
from whoosh.index import open_dir
import math
import sys

ix = open_dir("index")

print("==========SUMMARY SEARCH ENGINE==========")
query_str = input("Search: ")
print("\nResults: ")

with ix.searcher(weighting=scoring.TF_IDF()) as searcher:
    query = QueryParser("synopsis", schema=ix.schema).parse(query_str)
    results = searcher.search(query, limit=1000)

    last = len(results)
    # Each record occupies two consecutive slots in results and a page holds
    # ten records, so round the page count up to a whole page.
    lastPage = int(math.ceil(last / 2 / 10.0))
    pageNum = 1

    def openRes(results, choose, pageNum):
        print("Title:", results[choose * 2 - 2]['title'])
        print("Author:", results[choose * 2 - 2]['author'])
        print("Date:", results[choose * 2 - 2]['date'])
        print("Genre:", results[choose * 2 - 2]['genre'])
        print("Summary:\n", results[choose * 2 - 2]['synopsis'])
        action = input("Return to search? (y/n):")
        if (action == 'y' or action == 'Y'):