Example #1
def find(cmd, criteria, reindex=False):
    from whoosh.qparser import MultifieldParser
    if reindex:
        _create_index(cmd.cli_ctx)

    try:
        ix = _get_index(cmd.cli_ctx)
    except ValueError:
        # got a pickle error because the index was written by a different python version
        # recreate the index and proceed
        _create_index(cmd.cli_ctx)
        ix = _get_index(cmd.cli_ctx)

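    # MultifieldParser parses the query once and applies it to each listed field,
    # combining the per-field subqueries with OR.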
    qp = MultifieldParser(
        ['cmd_name', 'short_summary', 'long_summary', 'examples'],
        schema=_get_schema()
    )

    if 'OR' in criteria or 'AND' in criteria:
        # looks more advanced, let's trust them to make a great query
        q = qp.parse(" ".join(criteria))
    else:
        # let's help out with some OR's to provide a less restrictive search
        expanded_query = " OR ".join(criteria) + " OR '{}'".format(" ".join(criteria))
        q = qp.parse(expanded_query)

    with ix.searcher() as searcher:
        from whoosh.highlight import UppercaseFormatter, ContextFragmenter
        results = searcher.search(q)
        results.fragmenter = ContextFragmenter(maxchars=300, surround=200)
        results.formatter = UppercaseFormatter()
        for hit in results:
            _print_hit(hit)
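For reference, a minimal sketch (the criteria values are hypothetical) of what the expansion branch above produces:

criteria = ['web', 'app']
expanded_query = " OR ".join(criteria) + " OR '{}'".format(" ".join(criteria))
print(expanded_query)  # web OR app OR 'web app'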
Example #2
def search_results(ix, search_query, fields):
    import re
    from whoosh import qparser
    from whoosh.qparser import MultifieldParser
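
    # Two passes over the same fields: the default AndGroup parser collects precise
    # all-terms hits first, then the OrGroup parser appends any-term matches that
    # were not already found.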
    qpo = MultifieldParser(fields, schema=ix.schema, group=qparser.OrGroup)
    qpa = MultifieldParser(fields, schema=ix.schema)
    qo = qpo.parse(search_query)
    qa = qpa.parse(search_query)
    data = []
    data_index = 0

    with ix.searcher() as s:
        resultsa = s.search(qa)
        resultso = s.search(qo)
        for hit in resultsa:
            data.append(dict(**hit))
            context = str()
            for field in fields:
                if hit.highlights(field) and hit.highlights(field) not in context:
                    context += re.sub(r"(\(.*[^\)])",r'\1)', hit.highlights(field))
            data[data_index]["context"] = context
            data_index += 1

        for hit in resultso:
            found = False
            for hita in resultsa:
                if hit["id"] == hita["id"]:
                    found = True
            if not found:
                data.append(dict(**hit))
                context = str()
                for field in fields:
                    if hit.highlights(field) and hit.highlights(field) not in context:
                        context += re.sub(r"(\(.*[^\)])",r'\1)', hit.highlights(field))
                data[data_index]["context"] = context
                data_index += 1
    return data
Example #3
File: indexing.py Project: sfermigier/yaka
class Searcher(object):
    """
  Assigned to a Model class as ``search_query``, which enables text-querying.
  """

    def __init__(self, model_class, primary, index):
        self.model_class = model_class
        self.primary = primary
        self.index = index
        self.searcher = index.searcher()
        fields = set(index.schema._fields.keys()) - set([self.primary])
        self.parser = MultifieldParser(list(fields), index.schema)

    def __call__(self, query, limit=None):
        """API similar to SQLAlchemy's queries.
    """
        session = self.model_class.query.session

        results = self.index.searcher().search(self.parser.parse(query), limit=limit)
        keys = [x[self.primary] for x in results]
        if not keys:
            # Dummy request...
            return session.query(self.model_class).filter("uid = -1")
        else:
            primary_column = getattr(self.model_class, self.primary)
            return session.query(self.model_class).filter(primary_column.in_(keys))

    def search(self, query, limit=None):
        """New API: returns both whoosh records and SA models."""
        # TODO: highly suboptimal

        session = self.model_class.query.session
        hits = self.index.searcher().search(self.parser.parse(query), limit=limit)
        for hit in hits:
            yield (hit, session.query(self.model_class).get(hit[self.primary]))
Example #4
File: serv.py Project: el9335/QUAILS_1.0
def getdocs():
	params = dict(request.args.items())
	search_terms = params['NPS'].split(quails.DELIMITER)
	try:
		ix = index.open_dir("indexQ")
		
	except Exception:
		return jsonify(failure="Index not found. Ensure that the index exists and try again.")

	qp = MultifieldParser(["title","body"], schema=ix.schema)

	queries = []
	for term in search_terms:
		queries.append(qp.parse(term))

	docs = OrderedDict()
	hit_list = []
	with ix.searcher() as searcher:
		
		for query in queries:
			
			results=searcher.search(query)	
	
			for result in results: 
				hit_list.append((str(query),result['title']))

	return jsonify(results=hit_list)
Example #5
File: search.py Project: vbatoufflet/plume
def search_documents(filter):

    # Check for existing index
    dir_path = os.path.join(DATA_DIR, 'index')

    if not os.path.exists(dir_path) or not Index.exists_in(dir_path):
        return None

    index = Index.open_dir(dir_path)

    if filter.startswith('tags:'):
        fields = ['tags']
        filter = filter[5:]
    else:
        fields = ['path', 'content']

    parser = MultifieldParser(fields, schema=index.schema)
    search_query = parser.parse(unicode(filter))

    # Try documents search
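    # collapse keeps only the top hit per path/content facet, preferring the highest revision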
    searcher = index.searcher(closereader=False)

    try:
        return searcher.search(search_query,
            collapse=[sorting.FieldFacet('path'), sorting.FieldFacet('content')],
            collapse_order=sorting.FieldFacet('revision', reverse=True),
            sortedby=[sorting.FieldFacet('path'), sorting.FieldFacet('date', reverse=True)]
        )
    finally:
        searcher.close()
Example #6
def parse(text, schema=SCHEMA):
    """
    parse(text[, schema=SCHEMA])
    
    Analisa e trata o texto em ``text`` de acordo com o ``schema``
    do índice de documentos.
     
    .. code-block:: python
    
        >>> from storyline.engine.query import parse
        >>> from storyline.engine.schema import get_schema
        >>>
        >>> SCHEMA = get_schema()
        >>> parse("Mestre", SCHEMA)
        Or([Term('title', u'mestr'), Term('content', u'mestr')])
    
    :param text: Consulta feita pelo usuário.
    :type text: str
    :param schema: Schema do índice de documentos.
    :type schema: Schema
    :returns: Query com termos e operadores.
    """
    
    try:
        from whoosh.qparser import MultifieldParser
    except ImportError:
        print "An error occurred importing the whoosh.qparser module."
        raise

    qp = MultifieldParser(["title", "content"], schema, None)
    
    return qp.parse(text)
Example #7
 def live_search(self, query):
     """live search on ngram field"""
     with self.ix.\
             searcher(weighting=scoring.BM25F(title_B=2)) as searcher:
         qp = MultifieldParser(self.live_search_field + self.search_field,
                               schema=self.ix.schema)
         q = qp.parse(query)
         results = searcher.search(q, limit=25).copy()
         res = {'estimated_length': results.estimated_length(),
                'scored_length': results.scored_length(),
                'runtime': results.runtime,
                'list': []}
         for i, r in enumerate(results):
             if 'id' in r and 'space' in r:
                 url = url_for('document.view', space=r['space'],
                               doc_id=r['id'])
             else:
                 url = None
             res['list'].append({'id': r.get('id', ''),
                                 'space': r.get('space', ''),
                                 'title': r.get('title', ''),
                                 'rank': r.rank,
                                 'url': url,
                                 'score': results.score(i)})
     return res
Example #8
def search(querytext, request, pagenum=1, maxresults=30, staff=False, scope=None,
           orderby='-creation_date'):

    search_engine = get_search_engine('resource')
    search_result = {}

    if pagenum < 1:
        pagenum = 1

    with search_engine.searcher() as searcher:

        parser = MultifieldParser(search_engine.default_search_fields, searcher.schema)

        user_q = querytext and parser.parse(querytext) or Every()
        user_q, search_kwargs = build_search_kwargs(user_q, request, scope, staff, orderby)
        hits = searcher.search(user_q, limit=(pagenum * maxresults) + 1, **search_kwargs)

        if querytext and hits.is_empty():

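            # No hits: let Whoosh spell-correct the query against the indexed terms, then retry.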
            correction_q = parser.parse(querytext)
            corrected = searcher.correct_query(correction_q, querytext)

            if corrected.query != correction_q:
                querytext = corrected.string
                search_result['corrected_q'] = querytext

                user_q, search_kwargs = build_search_kwargs(corrected.query, request, scope, staff, orderby)
                hits = searcher.search(user_q, limit=(pagenum * maxresults), **search_kwargs)

        search_engine.prepare_search_response(search_result, hits, pagenum, maxresults)
        search_result['results'] = add_other_versions(searcher, search_result['results'], request.user, staff)
        add_absolute_urls(search_result['results'], request)

    return search_result
Example #9
	def search(self, query):
		""" general search function for a query string """
		
		hit_docs = []
		index_dir = "D:/bjstinfo_index"		# deprecated: this should come from a variable or a config file
		if not os.path.exists(index_dir):
			print "Error: indexer doesn't exist!"
			sys.exit(1)
		ix = index.open_dir(index_dir)
		
		# For keywords query, we search multi-fields of documents as:
		# Title, Keywords, Abstract. give the query-time fieldsboost:
		# {"Title": 1.2, "Keywords": 1.1, "Abstract": 1.0}
		
		query_fields = ['Title', 'Keywords', 'Abstract']
		field_boosts = {'Title':1.2, 'Keywords':1.1, 'Abstract':1.0}
		qp = MultifieldParser(query_fields, schema=ix.schema, fieldboosts=field_boosts)
		q = qp.parse(query)
		with ix.searcher() as s:
			results = s.search(q, limit=50, terms=True)
#			my_cf = ContextFragmenter(maxchars=100, surround=30)	#custom fragmenter.
#			results.fragmenter = my_cf
#			my_score = StandarDeviationScorer(my_cf)	#custom scorer.
#			results.scorer = my_score
#			results.formatter = HtmlFormatter()
			for hit in results:
#				print hit.fields()
				hit_docs.append(hit.fields())
				
				# why can't the highlight call be made to work here?
#				print hit.highlights('Abstract', top=20)
		
		return hit_docs
Example #10
def search(querystring, language_code):
    ix = LanguageIndex(settings.WHOOSH_INDEX_PATH, language_code, _get_schema()).load()
    # parser = QueryParser('content', ix.schema)
    parser = MultifieldParser(['title', 'keywords', 'content'], ix.schema)  # fieldboosts={'title':5, 'keywords':4, 'content':1})
    parser.remove_plugin_class(WildcardPlugin)  # remove unused feature for better performance
    query = parser.parse(querystring)
    # print(parser, query, querystring)

    result = {
        'results': [],
    }

    with ix.searcher() as searcher:
        results = searcher.search(query)
        # print(results)
        # import pdb; pdb.set_trace()

        # collect results
        for hit in results:
            my_hit = {}
            # my_hit['pos'] = hit.pos
            # my_hit['rank'] = hit.rank
            # my_hit['docnum'] = hit.docnum
            my_hit['score'] = hit.score
            my_hit['object'] = Article.objects.get(code=hit.fields()['code'])
            #.exclude(published=False).exclude(release_date__gte=datetime.today())
            # my_hit['object']['is_visible'] = True
            result['results'].append(my_hit)
            # print(hit.pos, hit.rank, hit.docnum, hit.score, hit)

    return result
Example #11
File: blog.py Project: chrisrsantiago/muse
    def search(self):
        c.terms = request.GET.get('terms', '')
        c.results = []
        if len(c.terms) < 4:
            h.flash(
                _('Search queries must be at least 4 characters in length.'),
                'error'
            )
            redirect(url(controller='blog', action='index'))

        query = MultifieldParser(
            ['title', 'content', 'summary'],
            schema=index.schema
        ).parse(c.terms)
        results = index.searcher().search(query, limit=10)
        for result in results:
            terms = [v for k, v in query.all_terms() if k == 'content']
            url_kwargs = json.loads(result['url'])
            result['url'] = url(**url_kwargs)
            result['highlights'] = highlight(
                result['content'],
                terms,
                search.schema['content'].format.analyzer,
                ContextFragmenter(terms),
                HtmlFormatter(tagname='span', classname='highlight')
            )
            c.results.append(result)
        return render('search.tpl', slacks=True)
Example #12
File: core.py Project: sheimi/online-shop
def search_commodity():
    from shop import app
    ix = open_dir(app.config.get("INDEX_DIR"))
    searcher = ix.searcher()
    mparser = MultifieldParser(["content", "title"], schema=ix.schema)

    query_raw = request.args.get('q', '')
    if query_raw:
        query = mparser.parse(unicode(query_raw.lower()))
        results = searcher.search(query)

        result_id = []
        for result in results:
            result_id.append(int(result['id']))

        result_id = list(set(result_id))
        wq = None
        for rid in result_id:
            if not wq:
                wq = Q(id=rid)
            else:
                wq |= Q(id=rid)
        if wq:
            coms = Commodity.select().where(wq)
        else:
            coms = []
    else:
        coms = Commodity.select()
    category = int(request.args.get('c', '0'))
    if category and category != 1:
        coms = [c for c in coms if c.is_category(category)]
    return render_template('core/com_list.html', commodities=coms)
Example #13
File: index.py Project: timvieira/skid
def search(q, limit=None):
#    q = str(q)
    ix = open_dir(DIRECTORY, NAME)
    with ix.searcher() as searcher:
        qp = MultifieldParser(fieldnames=['title',
                                          'author',
                                          'tags',
                                          'notes',
                                          'text',
                                          'source',
#                                          'cached',
                                          'year'],
                              fieldboosts={'title':  7,
                                           'year':   6,
                                           'author': 10,
                                           'tags':   4,
                                           'notes':  2,
                                           'text':   1},
                              schema=ix.schema)

        # Whoosh chokes on queries with stop words, so remove them.
        q = remove_stopwords(q)

        q = qp.parse(q)
        for hit in searcher.search(q, limit=limit):
            yield hit
Example #14
    def __call__(self, query, limit=None, fields=None, or_=False):
        if fields is None:
            fields = self._all_fields

        group = OrGroup if or_ else AndGroup
        parser = MultifieldParser(fields, self._index.schema, group=group)
        return self._index.searcher().search(parser.parse(query), limit=limit)
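A hedged usage sketch (the `finder` instance name is hypothetical):

# assuming `finder` is an instance of the class defining __call__ above
hits_all = finder(u"whoosh parser")            # AndGroup: every term must match
hits_any = finder(u"whoosh parser", or_=True)  # OrGroup: any term may match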
Example #15
def keywords(request):
    query = request.GET.get('q', '')
    if not query:
        return render(request, 'search/keywords.html', {'page_name': 'search.keywords'})

    qtext = get_tokenized_query(query)
    print qtext

    idx_dir = os.path.join(settings.BASE_DIR, 'search/lagou_idx')
    ix = open_dir(idx_dir)
    searcher = ix.searcher()

    parser = MultifieldParser(["name", "com_name", 'city'], schema=ix.schema)
    q = parser.parse(qtext)

    plen = 100
    results = searcher.search(q, limit=plen)

    total = len(results)
    got = results.scored_length()
    numterms = 100
    if got < 10:
        numterms = 10
    elif got < 100:
        numterms = 50

    keywords = [(kw, score) for kw, score in results.key_terms("desc", docs=got, numterms=numterms)]

    return render(request, 'search/keywords.html',
                  {'page_name': 'search.keywords',
                   'query': query,
                   'total': total,
                   'got': got,
                   'keywords': keywords,
                  })
Example #16
    def search(self, query, *args, **kwargs):
        parser = MultifieldParser(fieldnames=('content', 'title', 'headings', 'url'),
                                  schema=self.ix.schema,
                                  fieldboosts={'content': 1, 'title': 2, 'headings': 3, 'url': 1})
        qry = parser.parse(query)
        search = self.ix.searcher()
#        with self.ix.searcher() as searcher:
        return search.search_page(qry, *args, **kwargs)
Example #17
File: base.py Project: niwinz/needlestack
    def search(self, query_string, index, parser=None, **kwargs):
        index = base._resolve_index(index)
        if parser is None:
            parser = MultifieldParser(fieldnames=index.get_searchable_fieldnames(),
                                      schema=index.get_schema())

        query = parser.parse(query_string)
        return self._search(query, index, **kwargs)
Example #18
File: viewer.py Project: kcl-ddh/digipal
def get_whoosh_parser(index):
    from whoosh.qparser import MultifieldParser, GtLtPlugin

    # TODO: only active columns
    term_fields = ['content', 'unitid']
    parser = MultifieldParser(term_fields, index.schema)
    parser.add_plugin(GtLtPlugin)
    return parser
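A hedged usage sketch (the query values are hypothetical); GtLtPlugin lets a query use comparison prefixes that parse into range queries:

parser = get_whoosh_parser(index)
# 'unitid:>100' parses into an open-ended range query instead of a literal term
query = parser.parse(u'content:charter AND unitid:>100')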
Example #19
    def search(self, term):
        if not self.index:
            self.load_index()

        parser = MultifieldParser(("body", "title", "tags"), schema=self.schema)
        query = parser.parse(term)
        results = self.searcher.search(query, limit=100)  # , sortedby="date", reverse=True)
        return results
Example #20
    def search( self, trans, search_term, page, page_size, boosts ):
        """
        Perform the search on the given search_term

        :param search_term: unicode encoded string with the search term(s)

        :returns results: dictionary containing number of hits, hits themselves and matched terms for each
        """
        tool_index_dir = os.path.join( trans.app.config.whoosh_index_dir, 'tools' )
        index_exists = whoosh.index.exists_in( tool_index_dir )
        if index_exists:
            index = whoosh.index.open_dir( tool_index_dir )
            try:
                # Some literature about BM25F:
                # http://trec.nist.gov/pubs/trec13/papers/microsoft-cambridge.web.hard.pdf
                # http://en.wikipedia.org/wiki/Okapi_BM25
                # __Basically__ the higher number the bigger weight.
                tool_weighting = scoring.BM25F( field_B={
                                                'name_B' : boosts.tool_name_boost,
                                                'description_B' : boosts.tool_description_boost,
                                                'help_B' : boosts.tool_help_boost,
                                                'repo_owner_username_B' : boosts.tool_repo_owner_username_boost } )
                searcher = index.searcher( weighting=tool_weighting )

                parser = MultifieldParser( [
                    'name',
                    'description',
                    'help',
                    'repo_owner_username' ], schema=tool_schema )

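                # Wildcards on both ends allow substring matches, at the cost of slower queries on large indexes.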
                user_query = parser.parse( '*' + search_term + '*' )

                try:
                    hits = searcher.search_page( user_query, page, pagelen=page_size, terms=True )
                except ValueError:
                    raise ObjectNotFound( 'The requested page does not exist.' )

                log.debug( 'searching tools for: #' + str( search_term ) )
                log.debug( 'total hits: ' + str( len( hits ) ) )
                log.debug( 'scored hits: ' + str( hits.scored_length() ) )
                results = {}
                results[ 'total_results'] = str( len( hits ) )
                results[ 'page'] = str( page )
                results[ 'page_size'] = str( page_size )
                results[ 'hits' ] = []
                for hit in hits:
                    hit_dict = {}
                    hit_dict[ 'id' ] = hit.get( 'id' )
                    hit_dict[ 'repo_owner_username' ] = hit.get( 'repo_owner_username' )
                    hit_dict[ 'repo_name' ] = hit.get( 'repo_name' )
                    hit_dict[ 'name' ] = hit.get( 'name' )
                    hit_dict[ 'description' ] = hit.get( 'description' )
                    results[ 'hits' ].append( {'tool': hit_dict, 'matched_terms': hit.matched_terms(), 'score': hit.score } )
                return results
            finally:
                searcher.close()
        else:
            raise exceptions.InternalServerError( 'The search index file is missing.' )
Example #21
def search(ix, query_string, sortedby=None, limit=10):
    mp = MultifieldParser(["title", "summary"], schema=ix.schema)
    
    s = ix.searcher()
    keywords = split_keywords(query_string)
    user_q = mp.parse(' OR '.join(keywords))
    # TODO: add query filter
    results = s.search(user_q, sortedby=sortedby, limit=limit)
    return results
Example #22
File: search.py Project: nano13/nvcli
 def search(self, search_key):
     ix = self.getIndex()
     
     parser = MultifieldParser(["book", "chapter", "verse", "verse_text"], schema=ix.schema)
     query = parser.parse(search_key)
     searcher = ix.searcher()
     result = searcher.search(query, limit=1000)
     
     return self.formatSearchResult(result)
Example #23
 def search( self, query, return_attribute='id' ):
     # Change field boosts for searcher to place more weight on title, description than help.
     searcher = self.index.searcher(
         weighting=BM25F( field_B={ 'title_B': 3, 'description_B': 2, 'help_B': 1 } ) )
     # Set query to search title, description, and help.
     parser = MultifieldParser( [ 'title', 'description', 'help' ], schema = schema )
     results = searcher.search( parser.parse( query ) )
     return [ result[ return_attribute ] for result in results ]
Example #24
def init():
    # Setting my schema ...
    schema_email = Schema(
        path=TEXT(stored=True),
        sender_email=TEXT(stored=True),
        recipient_emails=TEXT,
        date=DATETIME,
        subject=TEXT(stored=True),
        body=TEXT,
    )
    schema_book = Schema(email=TEXT(stored=True), name=TEXT(stored=True))
    schemas = {"index_emails": schema_email, "index_book": schema_book}

    if not os.path.exists(index_path):
        os.mkdir(index_path)

    indexes = {}
    for ixname, schema in schemas.items():
        """
        Esta parte es mejorable, ya que sólo indexa si no existe indice. 
        No tiene en cuenta si los archivos indexados se han modificado o si 
        se han eliminado como se explica aquí:
            @url http://pythonhosted.org/Whoosh/indexing.html#incremental-indexing
        """
        exists = index.exists_in(index_path, indexname=ixname)
        if not exists:
            ix = index.create_in(index_path, schema, indexname=ixname)

            # Indexing ...
            ix = index.open_dir(index_path, indexname=ixname)
            writer = ix.writer()
            if ixname == "index_emails":
                files = read_dir()
                index_emails(files, writer)
            elif ixname == "index_book":
                index_book(writer)
        else:
            ix = index.open_dir(index_path, indexname=ixname)
        indexes[ixname] = ix

    # Main routine
    while True:
        ix = indexes.get("index_emails")
        with ix.searcher() as searcher:
            input_user = str(raw_input("Enter a word from the subject or body (e.g. contract): "))
            mparser = MultifieldParser(["subject", "body"], schema=ix.schema)
            myquery = mparser.parse(unicode(input_user))

            results = searcher.search(myquery)
            print "=================================================="
            for result in results:
                # read_file(result.get("path"))

                print ("Remitente: " + findNameBySender(indexes, result.get("sender_email")))
                print ("Asunto: " + result.get("subject"))
                print "=================================================="
Example #25
def answer_query(query):
    with main_index.searcher() as searcher:
        parser = MultifieldParser(['title', 'summary'], main_index.schema, fieldboosts={'title': 5.0, 'summary': 0.2})
        parser.add_plugin(FuzzyTermPlugin())
        # tilde adds fuzzy parsing for 1 character and /1 requires the first letter to match
        query = parser.parse(unicode(query) + '~/1') 
        
        results = searcher.search(query, limit=100)
        tags = [r['tag'] for r in results]
    return tags
Example #26
def search_my_archive(query_str):
    my_index = open_dir(conf.PATH_INDEX_ARCHIVE)
    with my_index.searcher() as searcher:
        mparser = MultifieldParser(['content','retweet'], schema=my_index.schema)
        query = mparser.parse(query_str)
        results = searcher.search(query)
        result_list = [entry['feed_id'] for entry in results]
        with open(conf.PATH_ARCHIVE_JSON,'r') as f:
            feeds = json.loads(f.read())
            return [feed for feed in feeds if str(feed['id']) in result_list]
Example #27
 def term_search(self, query):
     terms = []
     if query.get('term'):
         parser = MultifieldParser(self.term_fields, schema=self.index.schema)
         terms.append(parser.parse(unicode(query.pop('term')[0])))
     for key in query.keys():
         terms.append(Or([ Term(key, unicode(t)) for t in query.pop(key) ]))
     with self.searcher() as searcher:
         for entry in searcher.search(And(terms), limit=None):
             yield entry.fields()
Example #28
    def lookup(self, term, fuzzy=False, limit=None):
        term = term.strip()
        term = term.lower()

        if limit:
            limit = limit
        else:
            limit = self.RESULTS_LIMIT

        fields = (
            'indice',
            'indice_game',
            'name',
            'name_jp',
            'game',
            'version',
            'classification',
            'element',
            'code',
            'size',
            'damage_min',
            'damage_max',
            'recovery',
            'rarity'
        )
        if fuzzy:
            parser = MultifieldParser(
                fields,
                schema=self.index.schema,
                termclass=FuzzyTerm
            )
        else:
            parser = MultifieldParser(fields, schema=self.index.schema)

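        # Swap the textual operators for symbolic ones: & (AND), | (OR), &! (ANDNOT),
        # &~ (ANDMAYBE), - (NOT); the patterns are regexes, hence the escaping.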
        operators = OperatorsPlugin(
            And="&",
            Or="\\|",
            AndNot="&!",
            AndMaybe="&~",
            Not="\\-"
        )
        parser.replace_plugin(operators)
        query = parser.parse(term)
        results = []
        try:
            searcher = self.index.searcher()
            results = searcher.search(query, limit=limit)

            if not results and not fuzzy:
                # Try a Fuzzy Search.
                return self.lookup(term, fuzzy=True, limit=self.FUZZY_LIMIT)
        except IndexError:
            pass
            
        return results
Example #29
 def render_GET(self, request):
     section_path = '/'.join(request.postpath).strip('/')
     if not section_path:
         defer.returnValue(json.dumps({'status': 'error', 'message': 'unable to search root'}))
     
     section_name = request.postpath[0]
     
     ix = self._get_index(section_path)
     if not ix:
         defer.returnValue(json.dumps({'status': 'error', 'message': 'unknown index for %s' % section_path}))
     
     schema_settings = self._get_schema_settings(section_path)
     schema = schema_settings['schema']
     
     if 'schema' in request.args:
         if section_path in self.currently_indexing:
             yield self.currently_indexing[section_path]
         
         field_choices = schema_settings.get('field_choices', {})
         fields = {}
         
         for field in set(schema.names()):
             if isinstance(schema[field], KEYWORD) and field in field_choices:
                 fields[field] = sorted(x for x in field_choices[field] if x)
         
         defer.returnValue(json.dumps({'status': 'ok', 'schema': fields, 'type': schema_settings['type']}))
     
     if 'q' not in request.args:
         defer.returnValue(json.dumps({'status': 'error', 'message': 'missing q argument in url'}))
     q = unicode(request.args['q'][0])
     
     parser = MultifieldParser(['search_field'], schema=schema)
     parser.add_plugin(GtLtPlugin())
     query = parser.parse(q)
     
     with ix.searcher() as searcher:
         results = yield threads.deferToThread(searcher.search, query, limit=10000)
         #corrected = searcher.correct_query(query, q) # jesus this is bad for titles
         results = [x['linkitem'] for x in results]
     
     section = settings.SECTIONS[section_name]
     rootfolder = RootFolder(parent_path='', name='Search result for: %s' % q, urlname=self.name, date=0)
     rootfolder['content_type'] = section.levels[0].content_type
     
     for result in results:
         rootfolder.add_item(result)
     
     #if corrected.query != query:
     #    retval['suggestion'] = {
     #        'rel': 'suggested_query',
     #        'href': urlparse.urljoin(settings.BASE_URL, '/search/%s' % urllib.quote(section_path)) + '?%s' % urllib.urlencode({'q': corrected.string}),
     #        'suggested_query': corrected.string,
     #    }
     
     defer.returnValue(rootfolder.serialize())
Example #30
 def _create_parser(self, context):
     parser = MultifieldParser(
         self.field_boosts.keys(),
         WhooshBackend.SCHEMA,
         fieldboosts=self.field_boosts
     )
     parser.add_plugin(
         MetaKeywordPlugin(meta_keyword_parsers=self.meta_keyword_parsers,
                           context=context)
     )
     return parser
Example #31
def searchterm():
    search_term=request.form['search_term']
    ix = open_dir("indexdir")
    with ix.searcher() as searcher:
        query = MultifieldParser(["title","author","secondauthor","publication"], ix.schema).parse(search_term)
        results = searcher.search(query, limit=20)
        result_dict = {}
        for result in results:
            headline_jpeg = "imgs/publications/pub" + result['publication'] + "/" + result['path'].split("/")[-1] + ".jpg"
            result_dict[result['path']] = [result['title'],result['author'],result['publication'], headline_jpeg]
            
        
    return render_template('form_action.html', search_terms=result_dict, orig_search = search_term)
Example #32
def preform_whoosh_search(query,
                          ix=None,
                          fields=None,
                          page=None,
                          per_page=None,
                          sortedby=[],
                          reverse=True,
                          **kwargs):
    """
        Query the indexed, looking for a match in the specified fields.
        Results a tuple of results and an open searcher object.
        """

    per_page = per_page or settings.SEARCH_RESULTS_PER_PAGE
    fields = fields or [
        'tags', 'title', 'author', 'author_uid', 'content', 'author_handle'
    ]
    ix = ix or init_index()
    searcher = ix.searcher()

    # Splits the query into words and applies
    # an OR filter, e.g. 'foo bar' == 'foo OR bar'
    orgroup = OrGroup

    parser = MultifieldParser(fieldnames=fields,
                              schema=ix.schema,
                              group=orgroup).parse(query)
    if page:
        # Return a pagenated version of the results.
        results = searcher.search_page(parser,
                                       pagenum=page,
                                       pagelen=per_page,
                                       sortedby=sortedby,
                                       reverse=reverse,
                                       terms=True)
        results.results.fragmenter.maxchars = 100
        # Show more context before and after
        results.results.fragmenter.surround = 100
    else:
        results = searcher.search(parser,
                                  limit=settings.SEARCH_LIMIT,
                                  sortedby=sortedby,
                                  reverse=reverse,
                                  terms=True)
        # Limit fragment size
        results.fragmenter.maxchars = 100
        results.fragmenter.surround = 100

    #logger.info("Preformed index search")

    return results
Example #33
def search():
    print(request.args)
    search = request.args.get('search')
    author = request.args.get('author')
    category = request.args.get('category')
    page = int(request.args.get('page')) if request.args.get('page') is not None else 1
    print(search)

    if search is None and author is None and category is None:
        myquery = Every()
    else:
        if search is None:
            if author is not None:
                myquery = Term('author', author)
                if category is not None:
                    myquery = myquery & Term('category', category)
            else:
                myquery = Term('category', category)
        else:
            myquery = MultifieldParser(["title", "post_content"],
                                       ix.schema,
                                       plugins=[FuzzyTermPlugin()
                                                ]).parse(search)

            if author is not None:
                myquery = myquery & Term('author', author)

            if category is not None:
                myquery = myquery & Term('category', category)

    with ix.searcher() as searcher:
        results = searcher.search_page(myquery,
                                       page,
                                       pagelen=25,
                                       sortedby="date",
                                       reverse=True)
        print(results.is_last_page())
        results_json = json.dumps(
            {
                "results": [dict(i) for i in results],
                "page": page,
                "total_results": results.total
            },
            default=str)

    resp = Response(response=results_json,
                    status=200,
                    mimetype="application/json")

    return resp
Example #34
def search_whoosh_index(query, offset=0, limit=10, *args, **kwargs):
    ix = get_whoosh_index()
    parser = MultifieldParser(['content', 'authors', 'tags', 'title', 'abstract'], ix.schema)
    # user query; match every document when the query is empty
    if not query:
      q = Every()
    else:
      q = parser.parse(query)

    allow_q = And([Term(key, value) for key, value in kwargs.iteritems()])
    # parse remaining args
    res = []
    count = 0
    offset = int(offset)
    limit = int(limit)
    right = offset + limit
    # restrict_q = Or([Term("path", u'%s' % d.id) for d in qs])
    #print 'query', q, allow_q, kwargs
    with ix.searcher() as searcher:
      # From WHOOSH documentation: 
      # > Currently, searching for page 100 with pagelen of 10 takes the same amount of time as using Searcher.search() 
      #   to find the first 1000 results
      results = searcher.search(q, filter=allow_q, limit=right, terms=True)
      count = len(results)

      
      for hit in list(results)[offset:]:
        res.append({
          # 'title': hit['title'],
          'short_url': hit['path'],
          'highlights': hit.highlights("content", top=5)
        })
    # @todo filter by empty highlight strings
    return {
      'results': res,
      'count': count
    }
Example #35
def search(page):
    search = request.args['q']
    storage = FileStorage(conf.INDEX_DIR)
    index = storage.open_index(indexname=conf.INDEX_NAME)
    qp = MultifieldParser(['title', 'text', 'tags'], schema=index.schema)
    q = qp.parse(search)
    results = []
    with index.searcher() as searcher:
        results = searcher.search_page(q, page, pagelen=conf.PAGE_SIZE)
        # Get real posts
        post_ids = ",".join(["'%s'" % p['post_id'] for p in results
                             if not p['post_id'].startswith('static-')])
        if post_ids:
            ghost = get_voyage_connection()
            with ghost.cursor() as ghost_cur:
                query = "SELECT title, feature_image, html, slug FROM posts WHERE id IN (%s) ORDER BY published_at DESC" % post_ids
                ghost_cur.execute(query)
                posts = [{'type': "post",
                          'title': i[0],
                          'image': i[1],
                          'excerpt': excerpt(i[2]),
                          'url': "/blog/" + i[3]} for i in ghost_cur.fetchall()]
            ghost.close()
        else:
            posts = []
        # Get static pages
        for p in results:
            if p['post_id'].startswith('static-'):
                page_key = p['post_id'].replace("static-", "")
                page = conf.STATIC_TPL[page_key]
                with open('templates/' + page['tpl_file'], "r") as p:
                    page_text = p.read()
                posts.append({'type': 'static-page',
                              'title': page['title'],
                              'excerpt': excerpt(page_text),
                              'image': page.get('image'),
                              'url': page['url']})
    return render_template("search.html", posts=posts, search=search)
Example #36
 def search(self, query, page=1, pagelen=20):
     """Return a sorted list of results.
     pagelen specifies the number of hits per page.
     page specifies the page of results to return (first page is 1)
     Set pagelen = None or 0 to retrieve all results.
     """
     query = unicode(query)  # Must be unicode
     population_sort_facet = sorting.FieldFacet("population", reverse=True)
     ix = whoosh_open_dir_32_or_64(self.index_dir)
     with ix.searcher() as searcher:
         # query = QueryParser("ngram_name", ix.schema).parse(query)
         mparser = MultifieldParser(
             ["ngram_name", "admin1_code", "country_code"],
             schema=ix.schema)
         query = mparser.parse(query)
         if pagelen is not None and pagelen != 0:
             try:
                 results = searcher.search_page(query,
                                                page,
                                                pagelen=pagelen)
             except ValueError:  # Invalid page number
                 results = []
         else:
             # pagelen of None or 0 disables paging, so retrieve all results
             results = searcher.search(query)
Example #37
def perform_search(query="*", page=1):
    if query.strip() in {"", "*"}:
        return full(page=page)

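    # Normalize classical Latin orthography: v/u and j/i variants map to one indexed form.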
    query = query.replace("v", "u").replace("V", "u").replace("j", "i").replace("J", "I")
    qp = MultifieldParser(["lemma", "Description", "Variation"], schema=search_index.schema)
    q = qp.parse(query)

    with search_index.searcher() as s:
        results = s.search_page(q, pagenum=page, pagelen=PAGELEN)

        out = {
            "pages": {
                "last": results.pagecount,
                "current": results.pagenum,
                "is_last": results.is_last_page()
            },
            "results": [
                dict(res)
                for res in results
            ]
        }
    return out
Example #38
def search_index(query_string, page):
    """
    Search index based on the query
    :param query_string: query
    :param page: requested page
    :return: tuple: ids of videos on page, results total, page count
    """
    index = open_index()
    with index.searcher() as searcher:
        query = MultifieldParser(settings.INDEX_SEARCH_FIELDS,
                                 index.schema,
                                 group=OrGroup).parse(query_string)
        results = searcher.search_page(query, pagenum=page)
        return [hit['id'] for hit in results], results.total, results.pagecount
Example #39
    def search(self, query):
        """
        Let's send a query to the shelf. The query is a dict of keys, with associated values.
        """
        qp = MultifieldParser(query.keys(), schema=self.index.schema)

        # Let's assemble the query
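        # Each key:value pair becomes Whoosh fielded-term syntax, e.g. "title:foo author:bar".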
        search_terms = ''
        for k in query:
            search_terms += '%s:%s ' % (k, query[k])

        # We need to parse the query
        if u'*' in [query[k] for k in query]:
            # We have a query asking for every doc
            q = Every()
        else:
            q = qp.parse(search_terms)

        # And now, we search
        results = self.index.searcher().search(q, limit=None)

        # We just have to return the results
        return results
Example #40
def searchPost(keyword, ForumId=None):
    """
        Search posts with keyword
        [IN]: keyword
        [OUT]: list of post IDs
    """
    # q = Or([Term("Title", unicode(keyword)), Term("Body", unicode(keyword))])
    parser = MultifieldParser(["Title", "Body", "Tags"], schema=Post_Schema)
	words = ""
	for k in keyword:
		words += k + " AND "	# uppercase AND is whoosh's boolean operator; lowercase "and" would be treated as a term

	q = parser.parse(words[0:-5])
    allow_q = Term("ForumId", str(ForumId)) if ForumId else None
    print q
    ix = open_dir(INDEX_DIR, indexname=POST_INDEX)
    results = []
    with ix.searcher() as searcher:
        hits = searcher.search(q, filter=allow_q, limit=None)
        for hit in hits:
            results.append((hit["ForumId"], hit["Id"]))
    return results
Example #41
def search(indexer, searchTerm, searchColumns):
    with indexer.searcher() as searcher:
        words = searchColumns
        query = MultifieldParser(words,
                                 schema=indexer.schema).parse(searchTerm)
        results = searcher.search(query, limit=None)
        print("Length of results: " + str(len(results)))
        result = [[], [], [], []]
        for line in results:
            result[0].append(line['Access_Name'])
            result[1].append(line['County'])
            result[2].append(line['Type'])
            result[3].append(line['Location'])
        return result
Example #42
File: query.py Project: ideiaglobal/kerko
def build_keywords_query(keywords):
    """
    Build parsers for a query.

    :param MultiDict keywords: The search texts keyed by scope key. If empty,
        the query will match every document.
    """
    queries = []
    if keywords:
        composer = current_app.config['KERKO_COMPOSER']
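        # A deliberately small plugin set: quoted phrases, grouping parentheses,
        # localized boolean operators (via gettext), and ^ boosts.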
        text_plugins = [
            plugins.PhrasePlugin(),
            plugins.GroupPlugin(),
            plugins.OperatorsPlugin(
                And=r"(?<=\s)" + re.escape(gettext("AND")) + r"(?=\s)",
                Or=r"(?<=\s)" + re.escape(gettext("OR")) + r"(?=\s)",
                Not=r"(^|(?<=(\s|[()])))" + re.escape(gettext("NOT")) +
                r"(?=\s)",
                AndNot=None,
                AndMaybe=None,
                Require=None),
            plugins.BoostPlugin(),
        ]
        for key, value in keywords.items(multi=True):
            fields = [
                spec.key for spec in composer.fields.values()
                if key in spec.scopes
            ]
            if not fields:
                raise KeyError  # No known field for that scope key.
            parser = MultifieldParser(fields,
                                      schema=composer.schema,
                                      plugins=text_plugins)
            queries.append(parser.parse(value))
    else:
        queries.append(Every())
    return And(queries)
Example #43
    def post(self):
        song_detail = self.get_argument('query')

        ix = index.open_dir(indexdir_path)
        facet = sorting.FieldFacet("comment_num", reverse=True)
        searcher = ix.searcher()

        qp = MultifieldParser(["artist_name", 'lyrics', 'music_name'],
                              schema=ix.schema)
        q = qp.parse(song_detail)
        results = searcher.search(q, sortedby=facet)

        for r in results:

            print(r.fields())

        song_num = len(results)
        if song_num == 1:
            song_num = '0'

        self.render('section4.html',
                    results=results,
                    song_detail=song_detail,
                    song_num=song_num)
Example #44
def searchWhoosh(request):
    ix = open_dir("whooshdir")
    qp = MultifieldParser(['brand', 'name', 'category', 'price'],
                          schema=ix.schema,
                          group=OrGroup)

    q = qp.parse(request.GET.get('query'))

    with ix.searcher() as searcher:
        #Gets the top X results for the query where X=query_limit
        results = searcher.search(q, limit=int(request.GET.get('query_limit')))
        print("{} products".format(len(results)))
        results_json = []
        for r in results:
            #product = r['brand']+" - "+r['name']+" - "+r['category']+" - "+str(r['price'])+"€"
            product = [
                r['sku'], r['image'], r['brand'], r['name'], r['category'],
                r['price']
            ]
            results_json.append(product)
        print('--------------END SEARCH--------------')
    print(results_json)
    mimetype = 'application/json'
    return HttpResponse(json.dumps(results_json), mimetype)
Example #45
File: whooshQuery.py Project: KhanhCon/FYP
def search(qstring, index_folder="index_fullname"):
    index = whoosh.index.open_dir(os.path.join(dirname, index_folder))
    schema = index.schema
    qp = MultifieldParser(["fullname"], schema=schema)
    q = qp.parse(unicode(qstring))
    with index.searcher() as searcher:
        searchResult = searcher.search(q, limit=20)
        # result = {r["fullname"] for r in searchResult}
        ids = [{
            "rank": index_rank,
            "id": r["id"]
        } for index_rank, r in enumerate(searchResult)]
        # ids = {r for r in searchResult}
        corrector = searcher.corrector("fullname")
        suggestions = []
        if len(ids) == 0:
            suggestions = corrector.suggest(qstring, limit=6)
        # suggestionResults = {s["fullname"] for suggest in suggestions for s in searcher.search(qp.parse(unicode(suggest)), limit=5)}
        # result = result.union(suggestionResults)
        # ids_suggestion = [s["id"] for suggest in suggestions for s in searcher.search(qp.parse(unicode(suggest)), limit=5)]
        # ids_suggestion = {s for suggest in suggestions for s in searcher.search(qp.parse(unicode(suggest)), limit=5)}
        # ids = ids+ids_suggestion

    return {"ids": list(ids), "suggestions": suggestions}
Example #46
def simple_query(query, or_group=False, page = 0, n = 10):
    """
    Performs a simple keyword query using `query` through whoosh. This, by default, will look at all 3 major text based fields. (name, rules text, flavor text.)
    :param str query: the input query
    :param bool or_group: specifies whether to use an AND grouping or an OR grouping.
    :param int page: the page to return.
    :param int n: how many results should be in the return set.
    :return: Exact class TBD, will provide way to iterate over the page's worth of results.
    """
    ix = get_whoosh_index()

    # fix `page` and `n` (they may be string versions of ints)
    page = int(page)
    n = int(n)

    # parse against the main text fields (note: subtypes is here to help aid in "tribe" searches, and boost planeswalker results)
    qparser = MultifieldParser(['rules_text', 'name', 'flavor_text'], ix.schema, group = OrGroup if or_group else AndGroup)

    query = qparser.parse(query.lower()) # all text fields are lowered in whoosh, so do same here

    with ix.searcher() as searcher:
        # Quick note: whoosh expects pages to start with 1, so we'll take page+1
        results = searcher.search_page(query, page+1, pagelen=n)
        return [x['data_obj'] for x in results]
Example #47
def search_clicked():
    fc_btn.grid_remove()
    global current_page
    if len(displayed_results) != 0:
        page_clear(True)
    q = MultifieldParser(['title', 'body'], schema=ix.schema)
    search_keyword = txt.get().lower()
    search_keyword = preProcess(search_keyword)
    r = q.parse(search_keyword)
    with ix.searcher(weighting=wBM25) as searcher:
        results = searcher.search(r, limit=30)
        for r in results:
            url = str(r['url'])
            label_num = Label(window, text='0.')
            button_url = Button(window,
                                text=r['url'][30:],
                                command=callback(url))
            displayed_results.append((label_num, button_url))
    current_page = 0
    select_page()

    if isMispelled(txt.get()):
        fc_btn.grid(columnspan=10, row=2, sticky='ew')
        fc_btn.configure(text='Did you mean: ' + correct(txt.get()))
Example #48
def search_frequency(**kwargs):
    ix = open_dir(INDEX_PATH_BASIC)

    with ix.searcher(weighting=Frequency) as searcher:
        parser = MultifieldParser(['question', 'answer'], ix.schema)

        # myquery = parser.parse(kwargs['query'])
        arr_term = []
        for word in kwargs['query'].split(' '):
            arr_term.append(Or([Term('question', word), Term('answer', word)]))
        myquery = And(arr_term)
        results = searcher.search(myquery, limit=None)
        for result in results:

            yield result['id']
Example #49
def search_in_index(search_kw, index):
    '''
    search_kw: what the user types into the search bar
    index: the opened index (the ix object in the preceding code)
    
    The function returns a dictionary with the keys:
        - results: a list of dictionaries, one per search result, with the
        best result first. Each dictionary has two keys: 'title' with the
        document title, and 'path' with the path (to the text document, for now)
        - suggestions: a dictionary of suggestions proposing a possible
        correction for each word the user entered. How to combine the
        suggestions for the different words into complete suggestions is
        still to be decided.
    '''
    # use a MultifieldParser to search both the title and the content
    parser = MultifieldParser(["content", "title"], index.schema)
    # add a FuzzyMatching plugin so the search can go beyond exact words
    parser.add_plugin(FuzzyTermPlugin())
    searcher = index.searcher()
    # rewrite the user query into the format understood by the FuzzyMatching plugin
    to_parse = ' '.join([i + '~1' for i in search_kw.split(' ')])
    myquery = parser.parse(to_parse)
    # grab the results now so the searcher can be closed afterwards
    r = searcher.search(myquery)
    results = []
    for res in r:
        results.append({'title': res['title'], 'path': res['path']})
    # set up the corrector and store what it suggests for each typed word
    corrector = searcher.corrector("content")
    suggestions = {}
    for kw in search_kw.split(' '):
        suggestions[kw] = corrector.suggest(kw)
    # close the searcher
    searcher.close()
    return {'results': results, 'suggestions': suggestions}
Example #50
def searchh(indexer, searchTerm):
	with indexer.searcher() as searcher:
		words = ['Access_Name']
		query = MultifieldParser(words, schema=indexer.schema).parse(searchTerm)
		results = searcher.search(query)
		print("\nLength of results: " + str(len(results)) + '\n')
		result = []
		scm = ['Access_Name', 'URL', 'imgURL', 'County', 'Type', 'Location', 'Access_Type', 'Path_to_Beach', 'Managed_by']
		scm = scm + ['Parking', 'Fee', 'Bathrooms', 'Handicap_Access', 'Running_Water', 'Showers', 'Camp_Sites', 'Stairs_to_Beach', 'Boat_Ramps', 'Tidepooling']
		scm = scm + ['Surfing', 'Hiking', 'Bicycling', 'Horseback_Riding', 'Road_Vehicle_Access', 'Whale_Watching']
		if(len(results)>0):
			for x in scm:
				result.append(results[0][x])

		return result
Example #51
    def search(self, words):
        ix = open_dir(self.indexdir)
        searcher = ix.searcher()

        # And[Or[Term(title: 'aa'), Term(content:'aa')], Or[Term(title: 'bb'), Term(content:'bb')]]
        mparser = MultifieldParser(self.fieldNames, schema=self.schema)
        q = mparser.parse(" AND ".join(words))

        # qs = []
        # # every search keyword must appear, but it may appear in any field
        # for w in words:
        #     qs.append(Or([Term(fn, w) for fn in self.fieldNames]))
        # q = And(qs)

        # fetch page 1 of the results, 3 hits per page
        results = searcher.search_page(q, 1, pagelen=3)
        # results = searcher.find("title", u"文档")

        # the first result retrieved, as a dict {'title': ..., 'content': ...}
        # firstdoc = results[0].fields()
        #
        # # in python 2, json is needed to print dict contents that include unicode
        # jsondoc = json.dumps(firstdoc, ensure_ascii=False)
        #
        # print(jsondoc)  # print the full content of the retrieved document
        # print(results[0].highlights("title"))  # highlight the query terms in the title
        # print(results[0].score)  # bm25 score

        for r in results:
            doc = r.fields()
            print(doc.values())
            print("bm25分数: %f" % r.score)
            # for field in self.fieldNames:
            #     print(r.highlights(field))

        print("总共记录数: %i" % len(results))
Example #52
def classifyResults(request):
    timeS = time.time()
    request.session['searchQuery'] = ""
    resume = request.GET.get('resumeInput', None)
    indexes = resumeSearch(resume)

    ix = open_dir(settings.WHOOSH_INDEX)
    parser = MultifieldParser([
        "jobtitle", "company", "city", "state", "country", "source", "date",
        "JD", "url", "latitude", "longitude", "relative_time"
    ],
                              ix.schema,
                              group=qparser.OrGroup)

    queryInput = ""
    for i in indexes:
        queryInput = queryInput + "job_id:" + i + " OR "

    # queryExclude = parser.parse("<b>")
    query = parser.parse(queryInput)
    searcher = ix.searcher()
    # results = searcher.search(query, filter=filt)
    results = searcher.search(query)

    numResults = len(results)

    timeE = time.time()
    timeLapse = timeE - timeS
    timeLapse = float("{0:.3f}".format(timeLapse))
    # print (timeLapse)

    return render(request, 'classifyResults.html', {
        "results": results,
        'time': timeLapse,
        'num': numResults
    })
Example #53
def search_video(keyword, page, pagelen):
    with ix_video.searcher() as searcher:
        parser = MultifieldParser(['title', 'pinyin_title'],
                                  schema=ix_video.schema)
        q = parser.parse(keyword)
        results = searcher.search_page(q,
                                       pagenum=int(page),
                                       pagelen=int(pagelen))
        video_list = []
        for hit in results:
            field = hit.fields()
            info = {}
            info['gcid'] = field.get('gcid', '')
            info['movieid'] = field.get('movieid')
            info['uid'] = field.get('uid')
            info['cover_height'] = field.get('cover_height')
            info['cover_width'] = field.get('cover_width')
            info['cover_url'] = field.get('cover_url')
            info['duration'] = field.get('duration')
            info['title'] = field.get('title')
            video_list.append(info)
            #info['pic'] = field.get('pic')
        return constants.CODE_OK, video_list, results.total
Example #54
def query(q='', fields=['content'], **kwargs):
    """
    Query the indexed, looking for a match in the specified fields.
    Results a tuple of results and an open searcher object.
    """

    # Do not perform any queries if the index does not exist.
    if not index_exists():
        return []

    ix = init_index()
    searcher = ix.searcher()

    profile_score = FieldFacet("author_score", reverse=True)
    post_type = FieldFacet("type")
    thread = FieldFacet('thread_votecount')
    content_length = FieldFacet("content_length", reverse=True)
    rank = FieldFacet("rank", reverse=True)
    default = ScoreFacet()

    # Splits the query into words and applies
    # an OR filter, e.g. 'foo bar' == 'foo OR bar'
    orgroup = OrGroup

    # Sort by: post type, then match score, then content length.
    # (Other orderings tried: [post_type, profile_score, rank, default],
    #  [post_type], [profile_score], [rank], [thread].)
    sort_by = [post_type, default, content_length]

    query = MultifieldParser(fieldnames=fields, schema=ix.schema, group=orgroup).parse(q)
    results = searcher.search(query, sortedby=sort_by, limit=settings.SEARCH_LIMIT, terms=True, **kwargs)
    # Allow larger fragments
    results.fragmenter.maxchars = 100
    # results.fragmenter.charlimit = None
    # Show more context before and after
    results.fragmenter.surround = 100

    return results
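The list passed to sortedby can mix field facets with the relevance score; a standalone sketch (assuming an open index `ix` with a 'type' field and a parsed query):

from whoosh.sorting import FieldFacet, ScoreFacet

with ix.searcher() as s:
    q = MultifieldParser(['content'], schema=ix.schema).parse('foo bar')
    # Group hits by post type first, then order within each group by score.
    hits = s.search(q, sortedby=[FieldFacet("type"), ScoreFacet()])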
예제 #55
0
    def search(self, text, table, limit=10):
        """ Searches the index for anything containing the text. """

        schema = self.__get_schema()
        index = self.__get_index(schema, False)

        # Here we use TF-IDF because that is what our Mongo search will use.

        with index.searcher(weighting=scoring.TF_IDF()) as searcher:
            names = schema.names()
            names.remove('prop6')

            query = MultifieldParser(names, schema=index.schema).parse(text)
            results = searcher.search(query, limit=None)

            return self.__build_results(results, table, limit)
예제 #56
0
class WhooshWrap():
    '''
        Wrapper class to make the Whoosh API a little simpler.
        Initialize by pointing at an existing Whoosh index and specifying the searchable fields,
        the maximum number of results, and a search timeout.
        Query by calling self.doSearch with a query string.
        Results of the last search are kept on the object as a Whoosh results object
        (requires an open index to access) and returned as a plain Python dictionary.
    '''
    def __init__(self, MSID_index_dir, Searchable, MaxResults=10, Timeout=0.5):
        ''' Initializes the wrapper object with an index reference and preferences.
            parameter MSID_index_dir = (string) existing Whoosh index directory
            parameter Searchable     = (list of strings) field names of the index to search
            parameter MaxResults     = (numeric) maximum # of results to return
            parameter Timeout        = (numeric) maximum # of seconds to wait before ending a search
        '''
        self.ix = index.open_dir(MSID_index_dir)
        self.qp = MultifieldParser(Searchable, schema=self.ix.schema)    # Search all the specified fields
        #self.qp = QueryParser(Searchable[0], schema=self.ix.schema)     # Search ONLY the first field
        #self.s = self.ix.searcher(weighting=scoring.Frequency)          # Simple scorer
        self.s = self.ix.searcher(weighting=scoring.BM25F)               # Fancy scorer (the default)
        c = self.s.collector(limit=MaxResults)       # The collector allows setting a per-search timeout;
        self.c = TimeLimitCollector(c, Timeout)      # 0.5 seconds is a little long for interactive use.
        self.Searchable = Searchable
        self.LastResults = None
        
    def doSearch(self, qstring, ReturnFields):
        ''' Performs a search on the index with the provided query and returns a dict of results.
            parameter qstring       = (string) search key
            parameter ReturnFields  = (list of strings) field names to include in the results.
                                      NOTE: may differ from Searchable, but the fields must exist in the index.
            returnval ResultsDict   = dict mapping each field name to a list of result strings, i.e.
                                      {'Field 1': [result strings], 'Field 2': [result strings], ...}
        '''
        q = self.qp.parse(qstring)                  # build query from the provided search key
        try:
            self.s.search_with_collector(q, self.c)
        except TimeLimit:                           # assumes: from whoosh.collectors import TimeLimitCollector, TimeLimit
            print("TIMEOUT!")                       # DEBUG output to console if we're timing out a lot
        results = self.c.results()                  # On timeout, still return whatever we've got (partial results)
        self.LastResults = results
        ResultsDict = {}
        for field in ReturnFields:
            ResultsDict[field] = []
            for res in results:
                ResultsDict[field].append(res[field])  # should check that field is in results
        return ResultsDict
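A minimal usage sketch (the index directory and field names here are hypothetical):

ww = WhooshWrap('msid_index', Searchable=['name', 'desc'], MaxResults=5, Timeout=0.25)
hits = ww.doSearch('gyro', ReturnFields=['name'])
print(hits['name'])  # list of 'name' values, possibly partial if the search timed out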
            
예제 #57
0
    def mostrar_lista(event):

        lb.delete(0, END)

        ix = open_dir(dir_index)
        with ix.searcher() as searcher:
            query = MultifieldParser(["titulo", "descripcion"],
                                     ix.schema).parse(
                                         str(en_tituloDescripcion.get()))
            results = searcher.search(query)
            for r in results:
                lb.insert(END, "Antetitulo: " + r['antetitulo'])
                lb.insert(END, "Titulo: " + r['titulo'])
                lb.insert(
                    END, "Fecha publicacion: " +
                    r['fechaPublicacion'].strftime('%Y/%m/%d'))
                lb.insert(END, "")
예제 #58
0
def buscar_tarifas_movil(texto=None):
    # 'texto' (renamed from 'str', which shadowed the builtin) is unused;
    # the search word is read interactively instead.
    query = input("Enter a search word: ")
    ix = open_dir(dir_in, indexname="indice_tarifasMovil")

    with ix.searcher() as searcher:
        myquery = MultifieldParser([
            "internet_movil", "coste_mensual", "minutos", "promociones", "tipo"
        ], ix.schema).parse(query)
        results = searcher.search(myquery)
        for r in results:
            print("Name: " + r['nombre'])
            print("Minutes: " + r['minutos'])
            print("Mobile data: " + r['internet_movil'])
            print("Promotions: " + r['promociones'])
            print("Monthly cost: " + r['coste_mensual'])
            print("Type: " + r['tipo'])
            print("")
예제 #59
0
    def search(self, ix=None):
        # Open the index lazily if the caller did not supply one
        # (the original recursed into self.search(ix), doing the work twice).
        if ix is None:
            ix = open_dir(self.dir_name)

        self.searcher = ix.searcher()
        fields = []
        qs = ''

        if self.Index is True:
            if self.word is not None and len(self.word) > 0:
                qs += u' index_letter:({0})'.format(self.word)
                fields.append("index_letter")
        else:
            qs += u' verb_form:({0})'.format(self._word)

        self.query = MultifieldParser(fields, ix.schema).parse(qs)
예제 #60
0
def preform_whoosh_search(query, fields=None, **kwargs):
    """
    Query the index, looking for a match in the specified fields.
    Returns the hits sorted by last edit date (the searcher stays open).
    """

    # Do not perform a search if the index does not exist
    # or the query is shorter than the configured minimum.
    if not index_exists() or len(query) < settings.SEARCH_CHAR_MIN:
        return []
    fields = fields or ['content', 'title']
    ix = init_index()
    searcher = ix.searcher()

    # profile_score = FieldFacet("author_score", reverse=True)
    # post_type = FieldFacet("type")
    # thread = FieldFacet('thread_votecount')
    # # content_length = FieldFacet("content_length", reverse=True)
    # rank = FieldFacet("rank", reverse=True)
    # default = ScoreFacet()

    # Splits the query into words and applies
    # an OR filter, e.g. 'foo bar' == 'foo OR bar'
    orgroup = OrGroup

    # sort_by = sort_by or [post_type, rank, thread, default, profile_score]
    # sort_by = [lastedit_date]

    parser = MultifieldParser(fieldnames=fields,
                              schema=ix.schema,
                              group=orgroup).parse(query)
    results = searcher.search(parser,
                              limit=settings.SEARCH_LIMIT,
                              terms=True,
                              **kwargs)
    # Allow larger fragments
    results.fragmenter.maxchars = 100
    # results.fragmenter.charlimit = None
    # Show more context before and after
    results.fragmenter.surround = 100

    # Sort results by last edit date.
    results = sorted(results, key=lambda x: x['lastedit_date'], reverse=True)

    logger.info("Preformed index search")

    return results
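Because the search runs with terms=True, each hit can report which query terms matched it; a hedged sketch of a caller (field names follow the snippet above):

for hit in preform_whoosh_search("alignment tool"):
    # matched_terms() is available on Whoosh hits when terms=True was passed to search()
    print(hit['lastedit_date'], hit.matched_terms())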