Example #1
def test_correct_query():
    schema = fields.Schema(a=fields.TEXT(spelling=True), b=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(a=u("alfa bravo charlie delta"))
    w.add_document(a=u("delta echo foxtrot golf"))
    w.add_document(a=u("golf hotel india juliet"))
    w.add_document(a=u("juliet kilo lima mike"))
    w.commit()

    s = ix.searcher()
    qp = QueryParser("a", ix.schema)
    qtext = u('alpha ("brovo november" OR b:dolta) detail')
    q = qp.parse(qtext, ix.schema)

    c = s.correct_query(q, qtext)
    assert c.query.__unicode__() == '(a:alfa AND (a:"bravo november" OR b:dolta) AND a:detail)'
    assert c.string == 'alfa ("bravo november" OR b:dolta) detail'

    qtext = u('alpha b:("brovo november" a:delta) detail')
    q = qp.parse(qtext, ix.schema)
    c = s.correct_query(q, qtext)
    assert c.query.__unicode__() == '(a:alfa AND b:"brovo november" AND a:delta AND a:detail)'
    assert c.string == 'alfa b:("brovo november" a:delta) detail'

    hf = highlight.HtmlFormatter(classname="c")
    assert c.format_string(hf) == '<strong class="c term0">alfa</strong> b:("brovo november" a:delta) detail'
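Note: the test snippets in this collection (Examples 1, 3, 4, 10, and 30) come from Whoosh's own test suite and assume its test imports. A minimal preamble that should make them runnable against a 2.x Whoosh release, offered as a sketch rather than the original files' exact headers:

from whoosh import analysis, fields, highlight, query
from whoosh.compat import u                     # u("s"): unicode literal helper for Python 2
from whoosh.filedb.filestore import RamStorage  # in-memory index storage
from whoosh.qparser import QueryParser
from whoosh.util.testing import TempIndex       # temporary on-disk index
from nose.tools import assert_equal             # used by Example #4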
Example #2
	def search(self, query, page=-1, page_size=10):
		# parse the raw query string separately against each searcher's schema;
		# paged results go through Searcher.search_page
		search_results = list()
		qp_artist = QueryParser('title', self.artist_searcher.schema)
		artist_query = qp_artist.parse(unicode(query))
		if page < 1:
			artist_hits = self.artist_searcher.search(artist_query, limit=None, sortedby='title')
		else:
			artist_hits = self.artist_searcher.search_page(artist_query, page, pagelen=page_size, sortedby='title')
		for hit in artist_hits:
			search_results.append(SearchResult(self._artist_from_document(hit), Type.ARTIST))

		qp_album = QueryParser('title', self.album_searcher.schema)
		album_query = qp_album.parse(unicode(query))
		if page < 1:
			album_hits = self.album_searcher.search(album_query, limit=None, sortedby='title')
		else:
			album_hits = self.album_searcher.search_page(album_query, page, pagelen=page_size, sortedby='title')
		for hit in album_hits:
			search_results.append(SearchResult(self._album_from_document(hit), Type.ALBUM))

		qp_track = QueryParser('title', self.track_searcher.schema)
		track_query = qp_track.parse(unicode(query))
		if page < 1:
			track_hits = self.track_searcher.search(track_query, limit=None, sortedby='title')
		else:
			track_hits = self.track_searcher.search_page(track_query, page, pagelen=page_size, sortedby='title')
		for hit in track_hits:
			search_results.append(SearchResult(self._track_from_document(hit), Type.TRACK))

		return search_results
Example #3
def test_correct_query():
    schema = fields.Schema(a=fields.TEXT(), b=fields.TEXT)
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(a=u"alfa bravo charlie delta")
            w.add_document(a=u"delta echo foxtrot golf")
            w.add_document(a=u"golf hotel india juliet")
            w.add_document(a=u"juliet kilo lima mike")

        with ix.searcher() as s:
            qp = QueryParser("a", ix.schema)
            qtext = u'alpha ("brovo november" OR b:dolta) detail'
            q = qp.parse(qtext, ix.schema)

            c = s.correct_query(q, qtext)
            cq = c.query
            assert isinstance(cq, query.And)
            assert cq[0].text == "alfa"
            assert isinstance(cq[1], query.Or)
            assert isinstance(cq[1][0], query.Phrase)
            assert cq[1][0].words == ["bravo", "november"]

            qtext = u'alpha b:("brovo november" a:delta) detail'
            q = qp.parse(qtext, ix.schema)
            c = s.correct_query(q, qtext)
            assert c.query.__unicode__() == '(a:alfa AND b:"brovo november" AND a:delta AND a:detail)'
            assert c.string == 'alfa b:("brovo november" a:delta) detail'

            hf = highlight.HtmlFormatter(classname="c")
            assert c.format_string(hf) == '<strong class="c term0">alfa</strong> b:("brovo november" a:delta) detail'
Example #4
def test_wildcard_existing_terms():
    s = fields.Schema(key=fields.ID, value=fields.TEXT)
    ix = RamStorage().create_index(s)

    w = ix.writer()
    w.add_document(key=u("a"), value=u("alfa bravo bear charlie delta"))
    w.add_document(key=u("a"), value=u("boggle echo render rendering renders"))
    w.commit()
    r = ix.reader()
    qp = QueryParser("value", ix.schema)

    def words(terms):
        z = []
        for t in terms:
            assert t[0] == "value"
            z.append(t[1])
        return " ".join(sorted(z))

    q = qp.parse(u("b*"))
    ts = q.existing_terms(r)
    assert_equal(ts, set())
    ts = q.existing_terms(r, expand=True)
    assert_equal(words(ts), "bear boggle bravo")

    q = qp.parse(u("[a TO f]"))
    ts = q.existing_terms(r)
    assert_equal(ts, set())
    ts = q.existing_terms(r, expand=True)
    assert_equal(words(ts), "alfa bear boggle bravo charlie delta echo")

    q = query.Variations("value", "render")
    ts = q.existing_terms(r, expand=False)
    assert_equal(ts, set())
    ts = q.existing_terms(r, expand=True)
    assert_equal(words(ts), "render rendering renders")
Example #5
File: find.py Project: apit/rinjani
def find(q):
    ix = Index()
    parser = QueryParser("content", schema=SCHEMA)
    print parser.parse(unicode(q))
    results = ix.find(q)
    if len(results):
        print "Found in %d documents" % len(results)
    else:
        print "Not found"
Example #6
    def _search_tag_groups(self, is_filtering_tags):
        seen = None
        query_parser = QueryParser("tag", self._index.schema)
        options = {"limit": None, "groupedby": sorting.FieldFacet("tag", allow_overlap=True), "maptype": sorting.Count}

        with self._index.searcher() as searcher:
            total = searcher.search(query_parser.parse("*"), **options).groups()
            if not is_filtering_tags:
                seen = searcher.search(query_parser.parse("* AND flags:%s" % Status.SEEN), **options).groups()
        return seen, total
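The pattern above counts how many documents carry each tag: with maptype=sorting.Count, Results.groups() returns a dict mapping each field value to a document count, and allow_overlap=True lets one document contribute to several tag groups. A self-contained sketch of the same faceting call (schema and values are illustrative, not from the original project):

from whoosh import fields, sorting
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import QueryParser

schema = fields.Schema(tag=fields.KEYWORD(stored=True))
ix = RamStorage().create_index(schema)
with ix.writer() as w:
    w.add_document(tag="inbox starred")
    w.add_document(tag="inbox")

with ix.searcher() as s:
    q = QueryParser("tag", ix.schema).parse("*")
    facet = sorting.FieldFacet("tag", allow_overlap=True)
    groups = s.search(q, limit=None, groupedby=facet, maptype=sorting.Count).groups()
    print(groups)  # e.g. {'inbox': 2, 'starred': 1}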
Example #7
class WhooshGuess(object):
    def __init__(self):
        self.storage = RamStorage()
        schema = Schema(key=ID(stored=True),
                ask=BOOLEAN(stored=True),
                content=TEXT(stored=True, analyzer=RegexTokenizer()))
        self.ix = self.storage.create_index(schema)
        self.writer = self.ix.writer()
        self.is_train = False

        for s in greeting.split('\n'):
            self.train(u'matchinggreeting', s)
    
    @property
    def is_ok(self):
        return self.is_train

    def train(self, key, line):
        splits = u' '.join(list(lang.tokenizezh(line)))
        ask = lang.is_question(key)
        #print ask
        #print splits
        self.writer.add_document(key=key, content=splits, ask=ask)

    def train_ok(self):
        self.writer.commit(optimize=True)
        self.searcher = self.ix.searcher()
        self.parser = QueryParser("content", schema=self.ix.schema)
        self.is_train = True

    def guess(self, s, is_ask=None):
        assert(self.is_train)

        keys = list(lang.keyword(s))
        if len(keys) == 0:
            return ''
        
        # MUST contain the keys
        keys = u' '.join(keys)
        splits = u' '.join(list(lang.tokenizezh(s)))
        #q = self.parser.parse(splits + ' OR ' + keys)
        q1 = self.parser.parse(keys)
        q2 = self.parser.parse(splits)
        q = q1 | q2
        #print unicode(q)

        if is_ask is None:
            ask = query.Term(u"ask", lang.is_question(s))
        else:
            ask = query.Term(u"ask", is_ask)
        results = self.searcher.search(q, filter=ask)
        for hit in results:
            return hit['key']
        return ''
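Example #7 merges two parsed queries with q1 | q2. Whoosh query objects overload | and & to build Or and And nodes, so parsed sub-queries can be composed without re-parsing. A quick sketch:

from whoosh import fields
from whoosh.qparser import QueryParser

schema = fields.Schema(content=fields.TEXT)
qp = QueryParser("content", schema)
q1 = qp.parse("hello")
q2 = qp.parse("world")
print(q1 | q2)  # e.g. (content:hello OR content:world)
print(q1 & q2)  # e.g. (content:hello AND content:world)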
Example #8
    def update_changeset_index(self):
        idx = open_dir(self.index_location, indexname=CHGSET_IDX_NAME)

        with idx.searcher() as searcher:
            writer = idx.writer()
            writer_is_dirty = False
            try:
                indexed_total = 0
                repo_name = None
                for repo_name, repo in self.repo_paths.items():
                    # skip indexing if there aren't any revs in the repo
                    num_of_revs = len(repo)
                    if num_of_revs < 1:
                        continue

                    qp = QueryParser('repository', schema=CHGSETS_SCHEMA)
                    q = qp.parse(u"last:t AND %s" % repo_name)

                    results = searcher.search(q)

                    # default to scanning the entire repo
                    last_rev = 0
                    start_id = None

                    if len(results) > 0:
                        # assuming that there is only one result, if not this
                        # may require a full re-index.
                        start_id = results[0]['raw_id']
                        last_rev = repo.get_changeset(revision=start_id).revision

                    # there are new changesets to index or a new repo to index
                    if last_rev == 0 or num_of_revs > last_rev + 1:
                        # delete the docs in the index for the previous
                        # last changeset(s)
                        for hit in results:
                            q = qp.parse(u"last:t AND %s AND raw_id:%s" %
                                            (repo_name, hit['raw_id']))
                            writer.delete_by_query(q)

                        # index from the previous last changeset + all new ones
                        indexed_total += self.index_changesets(writer,
                                                repo_name, repo, start_id)
                        writer_is_dirty = True
                log.debug('indexed %s changesets for repo %s' %
                          (indexed_total, repo_name))
            finally:
                if writer_is_dirty:
                    log.debug('>> COMMITTING CHANGES TO CHANGESET INDEX <<')
                    writer.commit(merge=True)
                    log.debug('>>> FINISHED REBUILDING CHANGESET INDEX <<<')
                else:
                    writer.cancel()
                    log.debug('>> NOTHING TO COMMIT TO CHANGESET INDEX <<')
Example #9
    def GET(self):
        url = web.input().get('url')

        qp = QueryParser('url', schema=ix.schema)
        q = qp.parse(url)
        r = searcher.search(q, limit=1)
        doc = list(r)[0]

        qp = QueryParser('refers_to', schema=ix.schema)
        q = qp.parse(url)
        refs = searcher.search(q, limit=25)

        return render.show(doc, refs, DocumentSearcher(ix))
Example #10
def test_query_terms():
    qp = QueryParser("a", None)

    q = qp.parse("alfa b:(bravo OR c:charlie) delta")
    assert sorted(q.iter_all_terms()) == [("a", "alfa"), ("a", "delta"),
                                          ("b", "bravo"), ("c", "charlie")]

    q = qp.parse("alfa brav*")
    assert sorted(q.iter_all_terms()) == [("a", "alfa")]

    q = qp.parse('a b:("b c" d)^2 e')
    tokens = [(t.fieldname, t.text, t.boost) for t in q.all_tokens()]
    assert tokens == [('a', 'a', 1.0), ('b', 'b', 2.0), ('b', 'c', 2.0),
                      ('b', 'd', 2.0), ('a', 'e', 1.0)]
Example #11
def bm25_retrieve(query, num_res):
	ix = open_dir('index')
	searcher = ix.searcher()
	query_terms = query.split(' ')
	# join terms with OR (the old loop left a dangling ' OR ' at the end)
	bool_query = ' OR '.join(query_terms)
	parser = QueryParser("content", ix.schema)
	real_query = parser.parse(bool_query)
	results = searcher.search(real_query, limit=num_res)
	
	new_results = {}
	res_len = len(results)
	
	#assume that the top 10 results are relevant
	
	ri = {}
	ni = {}
	R = 10
	N = res_len

	for term in query_terms:
		ri[term] = 0
		ni[term] = 0
	
	#for each term in the query, calculate its ri and ni
	for term in query_terms:
		for res in searcher.search(real_query):
			if term in res['content']:
				ri[term] += 1
		parser = QueryParser("content", ix.schema)
		term_query = parser.parse(term)
		ni[term] = len(searcher.search(term_query, limit = 500))
	
	#for each document, calculate its bm25 score
	if num_res > 10:
		for res in results:
			new_results[res['id']] = 0
		for res in results:
			for term in query_terms:
				reg = re.compile(term)
				#fi is the i's term's frequency in the document
				fi = len(reg.findall(res['content']))
				k1 = 1.5
				b = 0.75
				avdl = 200
				K = k1 * (1 - b + b * len(res['content']) / avdl)
				new_results[res['id']] += math.log((ri[term]+0.5)*(N-ni[term]-R+ri[term]+0.5)/(R-ri[term]+0.5)/(ni[term]-ri[term]+0.5)) * (k1 + 1) * fi / (K + fi)
				
	return new_results
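The long scoring line above is the classic Robertson/Sparck-Jones relevance weight multiplied by BM25 term-frequency saturation. Factored out as a helper, a sketch using the same k1, b, avdl, and pseudo-relevance counts the example assumes:

import math

def rsj_bm25_weight(fi, ri, ni, R, N, doc_len, k1=1.5, b=0.75, avdl=200):
    # Robertson/Sparck-Jones weight from (pseudo-)relevance counts:
    # ri of the R relevant docs contain the term; ni of the N retrieved docs contain it
    w = math.log((ri + 0.5) * (N - ni - R + ri + 0.5)
                 / ((R - ri + 0.5) * (ni - ri + 0.5)))
    K = k1 * (1 - b + b * doc_len / avdl)  # document-length normalization
    return w * (k1 + 1) * fi / (K + fi)    # term-frequency saturation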
Example #12
    def contacts(self, query):
        if query:
            to = QueryParser('to', self._index.schema)
            cc = QueryParser('cc', self._index.schema)
            bcc = QueryParser('bcc', self._index.schema)
            with self._index.searcher() as searcher:
                to = searcher.search(to.parse("*%s*" % query), limit=None,
                                     groupedby=sorting.FieldFacet('to', allow_overlap=True)).groups()
                cc = searcher.search(cc.parse("*%s*" % query), limit=None,
                                     groupedby=sorting.FieldFacet('cc', allow_overlap=True)).groups()
                bcc = searcher.search(bcc.parse("*%s*" % query), limit=None,
                                      groupedby=sorting.FieldFacet('bcc', allow_overlap=True)).groups()
                return flatten([to, cc, bcc])

        return []
Example #13
def get_answer(message):

    if '/' in message[0]:
        return None

    rx = r'jova,?\s(.+)$'
    m = re.match(rx, message)
    if not m or len(m.groups(1)) < 1:
        return None

    global ix

    search_terms = m.groups(1)[0]
    parser = QueryParser("content", ix.schema)
    qry = parser.parse(search_terms)

    with ix.searcher() as searcher:
        results = searcher.search(qry)
        result = None
        if len(results) == 0:
            return None
        if len(results) == 1:
            result = results[0]
        else:
            result = random.choice(results)

        if result is None or 'path' not in result:
            return None

        return result['path'], 'plain-text'

    return None
Example #14
    def searchNote(self):
        pattern = self.searchEdit.text()
        qres = []
        with self.ix.searcher() as searcher:
            queryp = QueryParser("content", self.ix.schema)
            queryp.add_plugin(RegexPlugin())
            # r"pattern" is the desired regex term format
            query = queryp.parse('r"' + pattern + '"')
            pathFacet = sorting.FieldFacet("path")
            scores = sorting.ScoreFacet()
            # default limit is 10!
            results = searcher.search(
                query, limit=None, sortedby=[pathFacet, scores])
            for r in results:
                listItem = QListWidgetItem()
                title = r['title']
                text = r['path']
                term = r.highlights("content")
                qres.append([title, text, term])
            html = """
                   <style>
                       body { font-size: 14px; }
                       .path { font-size: 12px; color: #009933; }
                   </style>
                   """
            for ti, te, hi in qres:
                html += ("<p><a href='" + te + "'>" + ti +
                         "</a><br/><span class='path'>" +
                         te + "</span><br/>" + hi + "</p>")
            self.searchView.setHtml(html)
Example #15
def findsnippets(query, daterange=None, page=1, ndocs=PER_PAGE,
                 MAX_SEARCH_RESULTS=MAX_SEARCH_RESULTS, distribution=True):
    ix = open_dir(indexdir)
    res = []
    daycount = {}
    with ix.searcher() as searcher:
        parser = QueryParser("content", ix.schema)
        myquery = parser.parse(query)
        if daterange is not None:
            datequery = DateRange("date", daterange[0], daterange[1])
            results = searcher.search(datequery & myquery, limit=MAX_SEARCH_RESULTS)
        else:
            results = searcher.search(myquery, limit=MAX_SEARCH_RESULTS)
        if distribution:
            myfacet = Facets().add_field("date", maptype=sorting.Count)
            if daterange is not None:
                datequery = DateRange("date", daterange[0], daterange[1])
                daycount_orig = searcher.search(datequery & myquery, groupedby=myfacet, limit=MAX_SEARCH_RESULTS)
            else:
                daycount_orig = searcher.search(myquery, groupedby=myfacet, limit=MAX_SEARCH_RESULTS)
            for day in daycount_orig.groups():
                daycount[day] = daycount_orig.groups()[day]
            for result in results[(page - 1) * ndocs:page * ndocs]:
                doc = PoorDoc(docidentifier=result['identifier'], date=int(result['date'].strftime("%Y%m%d")))
                snippet = result.highlights("content", text=doc.getcontent())
                res.append({'title': result['title'], 'identifier': result['identifier'],
                            'date': result['date'], 'snippet': snippet})
            total_docs = results.estimated_length()
            return res, total_docs, daycount
        else:
            for result in results[(page - 1) * ndocs:page * ndocs]:
                doc = PoorDoc(docidentifier=result['identifier'], date=int(result['date'].strftime("%Y%m%d")))
                snippet = result.highlights("content", text=doc.getcontent())
                res.append({'title': result['title'], 'identifier': result['identifier'],
                            'date': result['date'], 'snippet': snippet})
        total_docs = results.estimated_length()
        return res, total_docs
Example #16
def finddocs(query, daterange=None, page=1, ndocs=PER_PAGE,
             MAX_SEARCH_RESULTS=MAX_SEARCH_RESULTS, distribution=True):
    ix = open_dir(indexdir)
    res = []
    daycount = {}
    with ix.searcher() as searcher:
        parser = QueryParser("content", ix.schema)
        myquery = parser.parse(query)
        if distribution:
            myfacet = Facets().add_field("date", maptype=sorting.UnorderedList)
            if daterange is not None:
                datequery = DateRange("date", daterange[0], daterange[1])
                results = searcher.search(datequery & myquery, groupedby=myfacet, limit=MAX_SEARCH_RESULTS)
            else:
                results = searcher.search(myquery, groupedby=myfacet, limit=MAX_SEARCH_RESULTS)
            doc_cnt = 0
            for day, docs in results.groups().iteritems():
                daycount[day] = len(docs)
                for result in docs:
                    if (page - 1) * ndocs <= doc_cnt < page * ndocs:
                        stored = searcher.stored_fields(result)
                        res.append({'title': stored['title'], 'identifier': stored['identifier'],
                                    'date': stored['date']})
                    doc_cnt += 1
            total_docs = results.estimated_length()
            return res, total_docs, daycount
        else:
            if daterange is not None:
                datequery = DateRange("date", daterange[0], daterange[1])
                results = searcher.search(datequery & myquery, limit=MAX_SEARCH_RESULTS)
            else:
                results = searcher.search(myquery, limit=MAX_SEARCH_RESULTS)
            for result in results[(page - 1) * ndocs:page * ndocs]:
                res.append({'title': result['title'], 'identifier': result['identifier'], 'date': result['date']})
        total_docs = results.estimated_length()
        return res, total_docs
Example #17
    def searchNote(self):
        """ Sorting criteria: "title > path > content"
            Search matches are organized into html source.
        """

        pattern = self.searchEdit.text()
        if not pattern:
            return
        results = []
        print("Searching using", pattern)
        with self.ix.searcher() as searcher:
            matches = []
            for f in ["title", "path", "content"]:
                queryp = QueryParser(f, self.ix.schema)
                queryp.add_plugin(RegexPlugin())
                # r"pattern" is the desired regex term format
                query = queryp.parse('r"' + pattern + '"')
                ms = searcher.search(query, limit=None) # default limit is 10!
                for m in ms:
                    if m not in matches:
                        matches.append(m)

            for r in matches:
                title = r['title']
                path = r['path']
                term = r.highlights("content")
                results.append([title, path, term])

            html = ""
            for title, path, hi in results:
                html += ("<p><a href='" + path + "'>" + title +
                         "</a><br/><span class='path'>" +
                         path + "</span><br/>" + hi + "</p>")
            self.searchView.setHtml(html)
            print("Finished searching", pattern)
Example #18
    def __call__(self, query):
        """search"""
        query = unicode(query)
        query_parser = QueryParser("description", schema=self.ix.schema)
        myquery = query_parser.parse(query)

        # Old code: too strict
        #        extendedquery = Or([myquery] +
        #                           [Term(field, query) for field in self.keywords])

        # New code: too permissive
        #        extendedquery = [myquery]
        excluded = set(["AND", "OR", "NOT"])
        terms = [i for i in query.split() if i not in excluded]
        #        for field in self.keywords:
        #            extendedquery.extend([Term(field, term) for term in terms])
        #        extendedquery = Or(extendedquery)

        # Code should look something like
        # Or([myquery] + [Or(
        # extendedquery = [myquery]
        extendedquery = And(
            [
                Or(
                    [myquery]
                    + [Term("description", term), Term("name", term)]
                    + [Term(field, term) for field in self.keywords]
                )
                for term in terms
            ]
        )

        # perform the search
        searcher = self.ix.searcher()
        return [i["name"] for i in searcher.search(extendedquery, limit=None)]
Example #19
    def populateTable(self, searchterm=None):
        self.infoTable.clear()
        self.infoTable.setHorizontalHeaderLabels( [ "Title", "Authors", "Tags", "Year", "Read" ] )
        self.infoTable.setRowCount(0)
        self.infoTable.horizontalHeader().setResizeMode(0, QHeaderView.Stretch)
        self.infoTable.verticalHeader().hide()

        if searchterm is None or searchterm == "":
            papers = KKDocument.objects.all()
            for p in papers:
                a = ', '.join([x.name for x in p.authors.all()])
                t = ', '.join([x.tag for x in p.tags.all()])
                self.newEntry(p.title, a, t, p.year, p)
            return  # we're done here - all papers printed

        # only if there is a searchterm:
        # search full text with whoosh
        print "FINDING %s" % searchterm
        searcher = self.whoosh_ix.searcher()
        parser = QueryParser("content", schema = self.whoosh_schema)
        query = parser.parse(unicode(searchterm))
        whoosh_results = searcher.search(query)

        print "FOUND", len(whoosh_results), "Objects"

        for r in whoosh_results:
            p = KKDocument.objects.get(localFile=r['path'])
            a = ', '.join([x.name for x in p.authors.all()])
            t = ', '.join([x.tag  for x in p.tags.all()])
            self.newEntry(p.title, a, t, p.year, p)
Example #20
def OnlyOneSearch(queryStr="", index=".index"):
	ix = get_index(index)
	searcher = ix.searcher()
	parser = QueryParser("name", schema=ix.schema)
	query = parser.parse(queryStr)
	results = searcher.search(query)
	return results
Example #21
    def search_datasets(self, search_phrase, limit=None):
        """Search for just the datasets."""
        from collections import defaultdict

        from whoosh.qparser import QueryParser

        parser = QueryParser("doc", schema=self.dataset_index.schema)

        query = parser.parse(search_phrase)

        datasets = defaultdict(SearchResult)

        with self.dataset_index.searcher() as searcher:

            results = searcher.search(query, limit=limit)

            for hit in results:

                vid = hit.get('vid')
                bvid = hit.get('bvid')
                type = hit.get('type')


                datasets[bvid].vid = bvid
                if type == 'b':
                    datasets[bvid].bundle_found = True
                    datasets[bvid].b_score += hit.score
                else:
                    datasets[bvid].p_score += hit.score
                    datasets[bvid].partitions.add(vid)

        return datasets
Example #22
	def get(self):
	
		wikiResults = None
		jobResults = None
		projectResults = None	
	
		if 'searchScope' in request.args and 'searchTerm' in request.args:	
			
			searchTerm = request.args.get('searchTerm')	
			searchScope = request.args.get('searchScope')	
			index = open_dir('app/search/index')
			parser = QueryParser("content", schema=index.schema)
				
			with index.searcher() as searcher:
			
				if searchScope in ['everything', 'wiki']:
					wikiResults = [{'title': result['title'],
					                'url': 'http://jhcwiki.jhc.co.uk/wiki/index.php/' + result['title'].replace(' ', '_')}
					               for result in searcher.search(parser.parse(searchTerm), limit=200)
					               if result['type'] == 'WIKI']

				if searchScope in ['everything', 'jobs']:
					jobResults = [{'title': result['title'], 'url': ''}
					              for result in searcher.search(parser.parse(searchTerm), limit=200)
					              if result['type'] == 'JOB']

				if searchScope in ['everything', 'projects']:
					projectResults = [{'title': result['title'],
					                   'url': url_for('projects.projectDetail',
					                                  projectCode=result['title'].split('-')[0].strip())}
					                  for result in searcher.search(parser.parse(searchTerm), limit=200)
					                  if result['type'] == 'PROJECT']
		else:
			searchTerm = ''	
			searchScope = 'everything'
			
		return render_template('search/search.html', wikiResults=wikiResults, jobResults=jobResults , projectResults=projectResults, searchTerm=searchTerm, searchScope=searchScope, title="Search")	
Example #23
def grid_search(rookie_avg, surround, fragment_char_limit, whoosh_results, corpus, query_string):
    '''
    find best top parameter for whoosh snips. top parameter controls how many ... delimited fragments
    best = minimize distance w/ average size of rookie snip
    '''
    best = None
    best_distance_so_far = 1000000000
    index = open_dir("indexes/{}/".format(corpus))
    corpusid = getcorpusid(corpus) 
    for top in range(1, 5): # basically fixed for now
        with index.searcher() as srch:
            query_parser = QueryParser("content", schema=index.schema)
            qry = query_parser.parse(query_string)
            results = srch.search(qry, limit=None)
            results.fragmenter.surround = surround
            results.fragmenter.maxchars = fragment_char_limit
            sum_ = 0
            for s_ix, a in enumerate(results):
                path = a.get("path").replace("/", "")
                sents = get_preproc_sentences(path, corpusid)
                sss = unicode(" ".join(sents).encode("ascii", "ignore"))
                sss = str(a.highlights("content", text=sss, top=top))
                sum_ += len(sss)
            diff = abs(rookie_avg - sum_/len(results))
            print "top of {} gives diff of {}".format(top, diff)
        if diff < best_distance_so_far:
            best = top
            best_distance_so_far = diff

    print "best top = {}".format(best)
    return best
Example #24
def index_query(environ, **kwargs):
    """
    Return a generator of tiddlers that match
    the provided arguments.
    """
    logging.debug('entering with %s', environ)
    print 'getting called on index_query'
    config = environ['tiddlyweb.config']
    #store = environ['tiddlyweb.store']
    query_parts = []
    for field, value in kwargs.items():
        if field == 'tag':
            field = 'tags'
        query_parts.append('%s:%s' % (field, value))
    query_string = ' '.join(query_parts)

    print 'getting inside on index_query'
    schema = config.get('wsearch.schema', SEARCH_DEFAULTS['wsearch.schema'])
    searcher = get_searcher(config)
    parser = QueryParser('text', schema=Schema(**schema))
    query = parser.parse(query_string)
    logging.debug('query parsed to %s' % query)
    results = searcher.search(query)

    def tiddler_from_result(result):
        print 'r', result
        bag, title = result['id'].split(':', 1)
        tiddler = Tiddler(title, bag)
        return tiddler
        #return store.get(tiddler)

    for result in results:
        yield tiddler_from_result(result)
    return
Example #25
def stage3():
    ix = open_dir(index_directory)
    if not ix:
        print "No index"
        return

    parser = QueryParser("content", ix.schema)
    with ix.searcher() as searcher:

        try:
            while True:
                search_phrase = raw_input('Search phrase: ')
                if not search_phrase: continue

                search_phrase = search_phrase.decode(sys.stdin.encoding)

                myquery = parser.parse(search_phrase)
                results = searcher.search(myquery)

                if results:
                    for result in results:
                        print "%s - %s (%s)" % (result['url'],result['title'], result['company'])

                else:
                    print "No matching results"

                print "\r\n"

        except KeyboardInterrupt:
            print "\nBae..."
            return
Example #26
def search(q, default_field="content"):
    ix = index.open_dir(SEARCH_INDEX)
    searcher = ix.searcher()
    parser = QueryParser(default_field, schema=ix.schema)
    query = parser.parse(q)
    results = searcher.search(query)
    return results
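One caveat with the function above: it returns a lazy Results object while the searcher it depends on is never closed. A sketch of the same function that materializes the hits inside a context manager instead (keeping the original names):

def search(q, default_field="content"):
    ix = index.open_dir(SEARCH_INDEX)
    with ix.searcher() as searcher:
        parser = QueryParser(default_field, schema=ix.schema)
        # copy the stored fields out before the searcher closes
        return [hit.fields() for hit in searcher.search(parser.parse(q))]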
Example #27
def search(request):
    hits = []
    results = []
    query = request.GET.get('q', None)
    newspaper = request.GET.get('newspaper', None)
    if newspaper is not None:
        index_dir = "C:/Django Projects/searcher/modules/index" + newspaper
        ix = index.open_dir(index_dir)
        searcher = ix.searcher()
        if query is not None and query != u"":
            query = query.replace('+', ' AND ').replace(' -', ' NOT ')
            parser = QueryParser("content", schema=ix.schema)
            try:
                qry = parser.parse(query)
            except Exception:
                qry = None
            if qry is not None:
                hits = searcher.search(qry)

        for hit in hits:
            title = hit['title']
            url = hit['url']
            date = hit['date']
            highlights = hit.highlights("content")
            keywords_list = [keyword for keyword, score in searcher.key_terms_from_text("content", hit['content'])]
            keywords = ", ".join(keywords_list)
            results.append(Result(title,url,date,highlights,keywords))
            


    variables = RequestContext(request, {
        'query': query,
        'hits': results
    })
    return render_to_response('search.html', variables)
Example #28
def crearEsquemaCorreo():
    correo1 = "1.txt"

    correoEsquema = Schema(remitente=ID(stored=True), destinatarios=KEYWORD(stored=True),
                           fecha=DATETIME(stored=True), asunto=KEYWORD(stored=True),
                           cuerpo=TEXT(stored=True))

    if not os.path.exists("indexCorreo"):
        os.mkdir("indexCorreo")

    iC = index.create_in("indexCorreo", correoEsquema)
    
    writer = iC.writer()
    fecha = "20101015"
    date_email = datetime.strptime(fecha, "%Y%m%d")
    writer.add_document(remitente=u"unoarrobagmail.com",
                        destinatarios=u"dosarrobagmail.com tresarrobagmail.com",
                        fecha=date_email,
                        asunto=u"Contrato de compraventa con la constructora",
                        cuerpo=u"Estimados socios: ya hemos firmado el contrato de compraventa con el cliente preferencial. Espero noticias vuestras. Un saludo,")
    #writer.add_document(email=u"dosarrobagmail.com", name=u"Pedro Guerra")
    #writer.add_document(email=u"tresarrobagmail.com", name=u"Ana Montero")
    #writer.add_document(email=u"cuatroarrobagmail.com", name=u"Luis Pontes")
    writer.commit()
    
    qp = QueryParser("remitente", schema=iC.schema)
    q = qp.parse(u"unoarrobagmail.com")

    with iC.searcher() as s:
        results = s.search(q)
        print results[0]
Example #29
def search_files(index_dir, content):
    """
	search file content in index 
	if not hit: return False
	if hit: return results
	"""
    index_exist = index.exists_in(index_dir)
    if not index_exist:
        print ("index not exist")
        return False
    ix = index.open_dir(index_dir)
    content = unicode(content)
    with ix.searcher() as searcher:
        parser = QueryParser("content", ix.schema)
        query = parser.parse(content)
        # whoosh.searching.Results
        results = searcher.search(query)
        print (type(results))
        l = len(results)
        print(l)
        for h in results:
            # whoosh.searching.Hit
            print(type(h))
            print(h)
        return results
    return False
Example #30
def test_correct_spell_field():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True))
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(text=u"rendering shading modeling reactions")

        with ix.searcher() as s:
            text = s.schema["text"]
            spell_text = s.schema["spell_text"]

            r = s.reader()
            words = [text.from_bytes(t) for t in r.lexicon("text")]
            assert words == ["model", "reaction", "render", "shade"]

            words = [spell_text.from_bytes(t) for t in r.lexicon("spell_text")]
            assert words == ["modeling", "reactions", "rendering", "shading"]

            qp = QueryParser("text", s.schema)
            qtext = u"renderink"
            q = qp.parse(qtext, s.schema)

            r = s.search(q)
            assert len(r) == 0

            c = s.correct_query(q, qtext)
            assert c.string == "rendering"
            assert c.query == query.Term("text", "rendering")

            hf = highlight.HtmlFormatter(classname="c")
            assert c.format_string(hf) == '<strong class="c term0">rendering</strong>'
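Examples 1, 3, and 30 all go through Searcher.correct_query, which returns a Correction object carrying the corrected query (.query), the corrected query string (.string), and a format_string(formatter) method for marking up the corrected terms. A minimal standalone sketch, assuming a field indexed with spelling=True:

from whoosh import fields, highlight
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import QueryParser

schema = fields.Schema(text=fields.TEXT(spelling=True))
ix = RamStorage().create_index(schema)
with ix.writer() as w:
    w.add_document(text="alfa bravo charlie")

with ix.searcher() as s:
    qtext = "brovo"
    q = QueryParser("text", ix.schema).parse(qtext)
    c = s.correct_query(q, qtext)
    print(c.string)  # 'bravo'
    hf = highlight.HtmlFormatter(classname="c")
    print(c.format_string(hf))  # '<strong class="c term0">bravo</strong>'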
Example #31
def search_song_by_title(title, index):
    results_list = list()
    qp = QueryParser('title', schema=index.schema)
    q = qp.parse(u"{}".format(title))
    with index.searcher() as searcher:
        results = searcher.search(q)
        for result in results:
            data = {
                'title': result['title'],
                'artist': result['artist'],
                'full_lyrics': result['full_lyrics'],
                'lyrics': result['lyrics'],
                'album': result['album']
            }
            results_list.append(data)
    return results_list
Example #32
    def lookup(self, key, field="entity_id"):
        if key == 'entities' or key is None:
            return self._entities()

        key = self._prep_key(key)
        qp = QueryParser("object_id", schema=self.schema)
        q = qp.parse(key)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            for result in results:
                e = self.objects.get(result['object_id'], None)
                if e is not None:
                    lst.add(e)

        return b2u(list(lst))
Example #33
def testBuscarPalabrasClave(autor):
    """
    Busca las palabras clave del texto que acompaña al cuadro

    """
    ix = whoosh.index.open_dir("ficheros/index")
    parser = QueryParser("autor", ix.schema)
    myquery = parser.parse(autor)
    with ix.searcher() as searcher:
        results = searcher.search(myquery)
        print results
        keywords = [
            keyword
            for keyword, score in results.key_terms("descripcion", numterms=4)
        ]
    return keywords
Example #34
def search(ix, search_key):
    qp = QueryParser("body", schema=ix.schema, termclass=MyFuzzyTerm)
    q = qp.parse(search_key)
    try:
        with ix.searcher() as s:
            results = s.search(q)
            data = [res['data'] for res in results]
        return {
            "hits": len(data),
            "data": data,
            "error": None,
            "msg": "Success"
        }
    except Exception as e:
        return {"hits": None, "data": None, "error": str(e),
                "msg": "Invalid search / unknown error"}
Example #35
def search(query):
    """Search the stopindex for `query` using Whoosh."""
    from info import IDX_NAME

    ix = open_dir(IDX_NAME)
    parser = QueryParser("name", ix.schema)
    q = parser.parse(query)

    with ix.searcher() as s:
        results = s.search(q, limit=100)
        if not results:
            resp = []
        else:
            resp = [(r['name'], r['location'], r['sid']) for r in results]

    return resp
Example #36
    def _search(self, query='', field=None, index=None, terms=False, limit=None):
        ''' query (exact match) search '''
        index = index or self._default_index
        ix = self.get_index(index)
        fieldin = field or 'content'

        qp = QueryParser(fieldin, ix.schema)
        qp.add_plugin(ws.qparser.SingleQuotePlugin())
        query = qp.parse(query, normalize=False)
        with ix.searcher() as searcher:
            if terms is True:
                results = searcher.search(query, terms=True, limit=limit).matched_terms()
            else:
                results = list(searcher.search(query, limit=limit).items())

        return results
Example #37
def index():
  search_field = request.args.get('search')
  text = request.args.get('word')
  if text is None:
      text = ""
#  text = "大学"

  schema = Schema(title=TEXT(stored=True), content=TEXT, url=STORED, created_at=STORED, inn=STORED, out=STORED)
  ix = open_dir('daigo_search_dir')

  t = Tokenizer()
  str_output = ""
  for token in t.tokenize(text):
    str_output += token.surface + " "


  with ix.searcher() as searcher:

      query_ = QueryParser("content", ix.schema)
      query = query_.parse(str_output)
      results = searcher.search(query)
      if results.estimated_length() <= 2:
          # too few hits: fall back to an OR search
          query = QueryParser("content", ix.schema, group=OrGroup).parse(str_output)
          results = searcher.search(query)

      titles = []
      timestamps = []
      urls = []

      oldr = ""
      for r in results:
          if r != oldr:
              titles.append(r['title'])
              timestamps.append(r['inn']+"~"+r["out"])
              urls.append(r["url"])
          oldr = r

      df = pd.DataFrame(columns=['動画タイトル', 'URL','この辺りかも'])
      df["動画タイトル"] = titles
      df["この辺りかも"] = timestamps
      df["URL"] = urls

  return render_template('search.html',word=text, search_result=df.to_html(classes='books',index=False,justify="center"))
Example #38
def clusterVars(ind, path):
	chars = []

	# Importance (# lines of dialogue)
	nameParser = QueryParser("name", schema=ind.schema)
	for character in lists.characters:
		query = nameParser.parse(character["Name"][0])
		with ind.searcher() as searcher:
			results = searcher.search(query, limit=None)
			obj = {}
			obj["Name"] = character["Name"][0]
			obj["NumQuotes"] = 0
			for result in results:
				obj["NumQuotes"] += len(result['quote'].split('\n'))
			chars.append(obj)
	
	# Interest in Luke (Luke mentions), Force / Dark Side mentions
	query1 = Or([Phrase("quote", ["red","five"]),Term("quote","luke"),Term("quote","skywalker")])
	query2 = Or([Phrase("quote",["dark","side"]),Term("quote","force")])

	with ind.searcher() as searcher:
		results1 = searcher.search(query1, limit=None)
		results2 = searcher.search(query2, limit=None, terms=True)
		for character in chars:
			character["InterestInLuke"] = 0
			character["ForceMentions"] = 0
			for result1 in results1:
				if result1['name'] == character["Name"].upper():
					character["InterestInLuke"] += 1
			for result2 in results2:
				if result2['name'] == character["Name"].upper():
					character["ForceMentions"] += 1

	# Take proportions
	for character in chars:
		if character["NumQuotes"] > 0:
			character["InterestInLuke"] = (character["InterestInLuke"] + 0.0) / character["NumQuotes"]
			character["ForceMentions"] = (character["ForceMentions"] + 0.0) / character["NumQuotes"]

	# Character sentiment average
	charSents = charSentiment(ind, path)
	for charSent in charSents:
		for character in chars:
			if character["Name"] == charSent:
				character["Sentiment"] = charSents[charSent]

	return chars
Example #39
    def update_index_emailthreads(self, groupsio_token, config):
        """
        Update the search index using the email archives
        of groups.io subgroups. This method uses the Groups.io
        API via methods defined in groupsio_util.py
        """

        # Get the set of indexed ids:
        # ------
        indexed_ids = set()
        p = QueryParser("kind", schema=self.ix.schema)
        q = p.parse("emailthread")
        with self.ix.searcher() as s:
            results = s.search(q,limit=None)
            for result in results:
                indexed_ids.add(result['id'])


        # Get the set of remote ids:
        # ------

        archives = get_mbox_archives(groupsio_token)

        writer = self.ix.writer()
        count = 0

        # archives is a dictionary
        # keys are IDs (urls)
        # values are dictionaries

        # Start by collecting all the things
        remote_ids = set()
        for k in archives.keys():
            remote_ids.add(k)

        # drop indexed_ids
        for drop_id in indexed_ids:
            writer.delete_by_term('id',drop_id)

        # add remote_ids
        for add_id in remote_ids:
            item = archives[add_id]
            self.add_emailthread(writer, item, config, update=False)
            count += 1

        writer.commit()
        print("Done, updated %d Groups.io email threads in the index" % count)
Example #40
    def mostrar_lista(event):
        lb.delete(0, END)

        ix = open_dir(dir_index)
        with ix.searcher() as searcher:
            # bracketed range syntax; en_fecha_fin is an entry widget too
            my_query = ("[" + str(en_fecha_comienzo.get()) + " TO "
                        + str(en_fecha_fin.get()) + "]")
            qp = QueryParser("fechaPublicacion", ix.schema)
            q = qp.parse(my_query)

            results = searcher.search(q)
            for r in results:
                lb.insert(END, "Titulo: " + r['titulo'])
                lb.insert(
                    END, "Fecha publicacion: " +
                    r['fechaPublicacion'].strftime('%Y/%m/%d'))
                lb.insert(END, "")
Example #41
def delete(cached):
    "Remove file from index."
    ix = open_dir(DIRECTORY, NAME)
    with ix.searcher() as searcher, ix.writer() as w:
        qp = QueryParser(u'cached', ix.schema)
        q = qp.parse(unicode(cached))
        results = searcher.search(q)
        if len(results) == 0:
            # Should only happen if user hasn't done run skid-update since
            # adding the paper being deleted.
            print 'Cached file %r not found in index.' % cached
        elif len(results) == 1:
            w.delete_document(results[0].docnum)
            return True
        else:
            assert False, 'This should never happen. ' \
                'Multiple (%s) results for %r found for cached file.' % (len(results), cached)
Example #42
    def test_multi(self):
        import transaction
        from whoosh_tm.datamanager import WhooshDataManager
        from whoosh.qparser import QueryParser
        from whoosh.index import create_in, open_dir
        import threading

        with TempDirectory() as d:
            ix1 = create_in(d.path, dummy_schema)
            ix2 = open_dir(d.path)

            def add_document1():
                dm1 = WhooshDataManager(ix1)
                t = transaction.get()
                t.join(dm1)
                dm1.add_document(
                    title=u"First document",
                    path=u"/a",
                    content=u"This is the first document we've added!")
                transaction.commit()

            def add_document2():
                dm2 = WhooshDataManager(ix2)
                t = transaction.get()
                t.join(dm2)

                dm2.add_document(
                    title=u"Second document",
                    path=u"/b",
                    content=u"The second one is even more interesting!")
                transaction.commit()

            th1 = threading.Thread(target=add_document1)
            th2 = threading.Thread(target=add_document2)
            th1.start()
            th2.start()
            th1.join()
            th2.join()

            ix3 = open_dir(d.path)
            with ix3.searcher() as searcher:
                parser = QueryParser("content", ix3.schema)
                results = searcher.search(parser.parse('second'))
                self.assertEqual(len(results), 1)

                self.assertEqual(results[0]["title"], "Second document")
Example #43
def more_like_this(request, pid):
    ix = index.open_dir(settings.WHOOSH_INDEX)
    searcher = ix.searcher()
    qp = QueryParser("pid", schema=ix.schema)
    qq = qp.parse(pid)
    doc = searcher.search(qq)

    first = doc[0]
    title = "%s: %s" % (first['type'], first['title'])

    res = first.more_like_this("content", numterms=NUM_TERMS)
    res = map(decorate, res)
    ix.close()

    messages.info(request, 'Posts similar to <b>%s</b>' % title)

    return res
Example #44
def ApartadoB(fecha):
    ix = open_dir("index")
    qp = QueryParser("fecha", schema=ix.schema)
    query = unicode("'"+fecha+" to today'")
    qp.add_plugin(DateParserPlugin(free=True))
    q = qp.parse(query)
    print(q)
    s=ix.searcher()
    results = s.search(q)
    print(results)
    for n in results:
        print n.get("fecha")
        print n.get("remitente")
        print n.get("destinatarios")
        print n.get("asunto")
        print("*************\n")
    return results
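Example #44 enables Whoosh's DateParserPlugin with free=True, which lets the default field accept free-form date expressions like '20101015 to today'. A sketch of the same setup against a DATETIME field, mirroring the query string built above:

from datetime import datetime
from whoosh import fields
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import QueryParser
from whoosh.qparser.dateparse import DateParserPlugin

schema = fields.Schema(fecha=fields.DATETIME(stored=True))
ix = RamStorage().create_index(schema)
with ix.writer() as w:
    w.add_document(fecha=datetime(2010, 10, 15))

qp = QueryParser("fecha", ix.schema)
qp.add_plugin(DateParserPlugin(free=True))
q = qp.parse("'20101015 to today'")
with ix.searcher() as s:
    print(len(s.search(q)))  # 1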
Example #45
    def readData(self, data):
        results = None
        # init searcher
        with self.ix.searcher() as searcher:
            parser = QueryParser(data["query_field"],
                                 self.ix.schema,
                                 plugins=[],
                                 group=qparser.OrGroup)
            # parser.add_plugin(qparser.FuzzyTermPlugin())
            myquery = parser.parse(data["query_str"])
            # print("-=-=-=-=-=-=", parser.filters())
            results = [
                dict(d_)
                for d_ in searcher.search(myquery, limit=self.search_limit)
            ]
            # print(parser, myquery)
        return results
Example #46
def get_wiki_articles(answers, ix):
    qp = QueryParser("title", schema=ix.schema)
    #print("Searching For: %s"%answers)
    q = qp.parse(answers)

    with ix.searcher() as s:
        results = s.search(q, limit=1)
        fname = None
        if len(results) > 0:
            for result in results:
                fname = result['file_path']
                title = result['title']
                #print("Found: %s"%result['title'])
                text = get_article_text(fname, title)
        else:
            text = None
    return text
Example #47
    def getMentionGraph(self, coreUsers: list):
        qp = QueryParser("mentionsUsers", schema=self.ix.schema)
        tq = qp.parse("*")  # something in mentionUsers
        ret = defaultdict(lambda: defaultdict(int))
        with self.getSearcher() as s:
            for uid in coreUsers:
                uq = whoosh.query.Term("user", uid)  # limit to uid

                q = whoosh.query.And([uq, tq])
                res = s.search(q, limit=10000000)
                thisUserContrib = ret[uid]
                for r in res:
                    mentions = r["mentionsUsers"]
                    mentions = mentions.split(",")
                    for m in mentions:
                        thisUserContrib[m] += 1
        return ret
Example #48
def full_text_search(q):
    ix = open_dir(const.TRANSCRIBED_WHOOSH_INDEX_DIR_PATH)

    parser = QueryParser("transcript", ix.schema)
    q = q.decode('utf-8')
    query = parser.parse(q)

    results = []

    with ix.searcher() as searcher:
        res = searcher.search(query)

        for r in res:
            results.append(
                (r.fields()["transcription_id"], r.highlights("transcript")))

    return results
Example #49
def search(ix, query, project_ids=[], limit=10):
    query_parser = QueryParser("name", schema=ix.schema)
    whoosh_query = query_parser.parse(query)
    is_project_filter = len(project_ids) > 0
    ids = []
    with ix.searcher() as searcher:
        if is_project_filter:
            project_id_terms = Or(
                [Term("project_id", project_id) for project_id in project_ids])
            results = searcher.search(whoosh_query,
                                      filter=project_id_terms,
                                      limit=limit)
        else:
            results = searcher.search(whoosh_query, limit=limit)
        for result in results:
            ids.append(result["id"])
    return ids
Example #50
class HamsterIndex(object):
    def __init__(self, path):
        if not os.path.exists(path):
            os.mkdir(path)
            self.index = create_in(path, MovieSchema)

        self.index = open_dir(path)
        self.q_parser = QueryParser("title", self.index.schema)

    def index_movie(self, movie):
        u = unicode
        cast = movie["cast"]
        actors = [c['name'] for c in cast]
        director = movie["director"]
        directors = [c['name'] for c in cast]

        writer = self.index.writer()
        writer.add_document(imdb_id=u(movie["_id"]),
                            title=u(movie["title"]),
                            genre=u(";".join(movie["genres"])),
                            plot=u(movie.get("plot", "")),
                            cast=u(";".join(actors)),
                            director=u(";".join(directors)),
                            rating=movie["rating"])
        writer.commit()

    def list_all(self):
        num = 0
        retval = []
        with self.index.searcher() as searcher:
            for res in searcher.reader().all_stored_fields():
                num += 1
                retval.append(res)
        return retval

    def query(self, querystring):
        querystring = unicode(querystring)
        myquery = self.q_parser.parse(querystring)

        with self.index.searcher() as searcher:
            results = searcher.search(myquery)
            retval = []
            for res in results:
                retval.append(res.fields())
            return retval
Example #51
def busquedaTitulos(descripcion):
    ix = open_dir("index")

    #     print ix.doc_count_all()    # tells you how many you have indexed, in case you're not sure
    #     for r in ix.searcher().documents():
    #         print("entry: " + str(r))

    qp = QueryParser("descripcion", schema=ix.schema)
    q = qp.parse(unicode(descripcion))

    s = ix.searcher()
    results = s.search(q)
    titulos = []
    for r in results:
        titulos.append(r.get("titulo"))

    return titulos
Example #52
def query_article_by_key(key=""):
    """
    查询文章内容通过key
    """
    if len(key) <= 0:
        return []

    result = list()
    index = open_dir(INDEX_PATH)

    searcher = index.searcher()
    parser = QueryParser("content", schema=index.schema)

    result_list = searcher.search(parser.parse(key))
    for hit in result_list:
        result.append(hit)

    return result
Example #53
def index_search(search_terms, document_root):
    qp = QueryParser("content", schema=markupserve_index.schema)
    query = qp.parse(safe_unicode(search_terms))

    results = collections.defaultdict(list)

    split_terms = shlex.split(search_terms)

    with markupserve_index.searcher() as searcher:
        query_results = searcher.search(query, limit=None)

        for result in query_results:
            filename = result["path"].decode("utf-8").decode("unicode-escape")
            results[filename].append(
                result.highlights("content").decode("utf-8").decode(
                    "unicode-escape"))

    return results
Example #54
    def search(self, searchStr, lang):
        '''Search for a code snippet based on a search string (searchStr)
        and the language.

        s.search("How to append to a list", "Python")
        '''
        with self.ix.searcher() as searcher:
            qp = QueryParser("description", self.ix.schema, group=OrGroup)

            # Since the index is singularized, we must search using a
            # singularized string.
            query = qp.parse(
                unicode("(%s) AND (lang:%s)" %
                        (self.singularize(searchStr), lang)))
            results = searcher.search(query)
            returnThis = [(x['description'].lower(), x['path'])
                          for x in results]
            return returnThis
Example #55
    def modify(self, name, metadata):
        """
        Modify a document metadata
        """
        # Let's check if the document exist
        # It must be present in the index
        qp = QueryParser("hash", schema=self.index.schema)
        q = qp.parse(name)
        with self.index.searcher() as s:
            # No results
            if len(s.search(q)) == 0:
                raise IOError("Document does not exist")

        # So we do have a document, we just need to update it
        metadata['hash'] = name

        # Write everything into the index
        self.__update_index(metadata)
Example #56
    def test_can_search_id_and_summary_TODO(self):
        #arrange
        self.insert_ticket("test x")
        self.insert_ticket("test 1")

        fieldboosts = dict(
            id=1,
            summary=1,
        )

        mfp = MultifieldPlugin(list(fieldboosts.keys()))
        pins = [WhitespacePlugin, PhrasePlugin, mfp]
        parser = QueryParser(None, WhooshBackend.SCHEMA, plugins=pins)

        parsed_query = parser.parse("1")
        result = self.whoosh_backend.query(parsed_query)
        self.print_result(result)
        self.assertEqual(2, result.hits)
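The parser in this test has no default field; the MultifieldPlugin rewrites each unqualified term to search every listed field. A sketch of that parser setup outside the test harness (the schema here is illustrative, not the backend's actual SCHEMA):

from whoosh import fields
from whoosh.qparser import MultifieldPlugin, PhrasePlugin, QueryParser, WhitespacePlugin

schema = fields.Schema(id=fields.ID(stored=True), summary=fields.TEXT(stored=True))
plugins = [WhitespacePlugin, PhrasePlugin, MultifieldPlugin(["id", "summary"])]
parser = QueryParser(None, schema, plugins=plugins)
print(parser.parse("1"))  # e.g. (id:1 OR summary:1)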
Example #57
    def find(self, text):
        '''
        Find an article by keyword
        :param text: the keyword
        :return: a list of dicts, each including path, title, content
        '''
        searcher = self.ix.searcher()
        ret_list = []
        parser = QueryParser("content", schema=self.ix.schema)
        try:
            word = parser.parse(text)
        except Exception:
            word = None
        if word is not None:
            hits = searcher.search(word, limit=None)
            for hit in hits:
                ret_list.append(dict(hit))
        return ret_list
Example #58
    def search(self, user_query):
        '''Search the index for wikis that relate to the user's query.'''
        # Exchange some speed by searching for variations of what the user
        # queried for to improve search quality.
        parser = QueryParser('content',
                             schema=self._ix.schema,
                             termclass=query.Variations)
        q = parser.parse(u'{0}'.format(user_query))

        results = []
        with self._ix.searcher() as searcher:
            hits = searcher.search(q)
            hits.fragmenter = self._context_fragmenter
            for hit in hits:
                results.append(
                    SearchResult(hit['path'], hit.highlights('content')))

        return results
Example #59
    def find(self, querystring, parser=None, **kwargs):
        """Parses querystring, runs the query in this index, and returns a
        Result object. Any additional keyword arguments are passed to
        Searcher.search() along with the parsed query.

        :querystring: The query string to parse and search for.
        :parser: A Parser object to use to parse 'querystring'.
            The default is to use a standard qparser.QueryParser.
            This object must implement a parse(str) method which returns a
            query.Query instance.
        :*returns*: searching.Results
        """

        if parser is None:
            from whoosh.qparser import QueryParser
            parser = QueryParser(self.schema)

        return self.searcher().search(parser.parse(querystring), **kwargs)
Example #60
    def search_similar(self, entity, skip=[]):
        with self.index.searcher() as searcher:
            qp = QueryParser("fingerprint", schema=self.index.schema)
            # parser.add_plugin(qparser.FuzzyTermPlugin())
            tokens = set()
            for fp in entity.fingerprints:
                tokens.update(fp.split())
            if entity.country:
                tokens.add('country:%s' % entity.country)
            tokens = ' OR '.join(tokens)
            tokens = ['(%s)' % tokens]
            for uid in skip:
                tokens.append('(NOT uid:%s)' % uid)
            q = ' AND '.join(tokens)
            q = qp.parse(q)
            restrict_q = Term("uid", entity.uid)
            for result in searcher.search(q, mask=restrict_q):
                yield result.get('uid')