def query_parser(self, default_fields, idx_name=LATEST_REVS):
    """
    Build a query parser for a list of default fields.
    """
    schema = self.schemas[idx_name]
    if len(default_fields) > 1:
        qp = MultifieldParser(default_fields, schema=schema)
    elif len(default_fields) == 1:
        qp = QueryParser(default_fields[0], schema=schema)
    else:
        raise ValueError("default_fields list must at least contain one field name")
    qp.add_plugin(RegexPlugin())

    def userid_pseudo_field_factory(fieldname):
        """generate a translator function, that searches for the userid
        in the given fieldname when provided with the username
        """
        def userid_pseudo_field(node):
            username = node.text
            users = user.search_users(**{NAME_EXACT: username})
            if users:
                userid = users[0].meta[ITEMID]
                node = WordNode(userid)
                node.set_fieldname(fieldname)
                return node
            return node
        return userid_pseudo_field

    qp.add_plugin(PseudoFieldPlugin(dict(
        # username:JoeDoe searches for revisions modified by JoeDoe
        username=userid_pseudo_field_factory(USERID),
        # assigned:JoeDoe searches for tickets assigned to JoeDoe
        assigned=userid_pseudo_field_factory(ASSIGNED_TO),
    )))
    return qp
def query_search(indexdir, queries, n=10, function='BM25F'):
    ix = index.open_dir(indexdir)
    search_fields = ['resname', 'categories', 'address', 'city', 'state']  # search fields
    og = qparser.OrGroup.factory(0.9)
    qp = MultifieldParser(search_fields, ix.schema, termclass=query.Variations, group=og)
    qp.add_plugin(DateParserPlugin(free=True))
    q = qp.parse(queries)
    result_index = []
    if function == 'BM25F':
        with ix.searcher(weighting=scoring.BM25F(B=0.75, K1=1.2)) as s:
            rates = sorting.FieldFacet('rating', reverse=True)
            scores = sorting.ScoreFacet()
            results = s.search(q, limit=n, sortedby=[scores, rates])
            k = min(len(results), n)
            for i in range(k):
                result_index.append(int(results[i]['ID']))
    if function == 'TF_IDF':
        with ix.searcher(weighting=scoring.TF_IDF()) as s:
            rates = sorting.FieldFacet('rating', reverse=True)
            scores = sorting.ScoreFacet()
            results = s.search(q, limit=n, sortedby=[scores, rates])
            k = min(len(results), n)
            for i in range(k):
                result_index.append(int(results[i]['ID']))
    return result_index
def searchPage(keyword, curPage=1, pagelen=10):
    with ix.searcher() as searcher:
        # res = dict()
        # parser = QueryParser('content', schema=ix.schema)
        hf = HtmlFormatter(tagname="code", classname="match", termclass="term")
        fragmenter = WholeFragmenter(charlimit=None)
        parser = MultifieldParser(["title", "content", 'createAt'], schema=ix.schema)
        parser.add_plugin(DateParserPlugin())
        q = parser.parse(keyword)
        page = searcher.search_page(q, curPage, pagelen)  # , terms=True
        page.results.fragmenter = fragmenter
        # page.results.fragmenter.charlimit = None
        page.results.formatter = hf
        # terms = page.results.matched_terms()
        # key = [e for e in terms][0][1].decode('UTF-8')
        resPage = dict(pagenum=page.pagenum, pagecount=page.pagecount, total=page.total, posts=[])
        for hint in page:
            tmp = dict()
            tmp['title'] = hint.highlights("title", minscore=0)
            tmp['author'] = hint["author"]
            tmp['location'] = hint["location"].replace(os.sep, '/').replace('//', '/')
            if tmp['location'].startswith('/'):
                tmp['location'] = tmp['location'][1:]
            tmp['summary'] = hint.highlights("summary", minscore=0)  # hint["content"].replace(key, "<code>%s</code>" % key)
            resPage['posts'].append(tmp)
        return resPage
def indexquery(name, www):
    if name is None:
        return []
    # print("Name: %s" % name)
    ix = index.open_dir("/var/www/restnames/index")
    qp = MultifieldParser(
        ["commonname", "database", "tags", "name", "name_part", "country", "project", "url"],
        schema=ix.schema,
        termclass=FuzzyTerm)
    qp.add_plugin(qparser.FuzzyTermPlugin())
    q = qp.parse(name)
    # q = Every()
    tempvar = []
    with ix.searcher() as searcher:
        results = searcher.search(q, limit=None)
        for hit in results:
            tempvar.append({
                'name': hit["name"],
                'commonname': hit["commonname"],
                'url': hit["url"]
            })
    if not www:
        return tempvar
    else:
        response = Response(render_template("searchresults.html", resultlist=tempvar))
        response.headers['content-type'] = 'text/html'
        return response
def getSpecialCasesResults(self, value, translation):
    """Takes in a query and compares it to hard-coded special cases.

    The special cases are for the "Miracle Letters"

    :param translation: The requested translation type
    :type translation: str
    :return: A list of ayah matches if there is a match, otherwise returns None
    :rtype: list, None
    """
    matchingAyahList = []
    for case in SPECIAL_CASES:
        if case[0] == value:
            value = case[1]
            matchingAyahList = case[2]
    if len(matchingAyahList) > 0:
        allowedResults = []
        for matchingAyah in matchingAyahList:
            allowedResults.append(
                "surah_num:" + str(matchingAyah[0]) + " AND ayah_num:" + str(matchingAyah[1]))
        parser = MultifieldParser(["surah_num", "ayah_num"], self._ix.schema)
        parser.remove_plugin_class(PhrasePlugin)
        parser.add_plugin(SequencePlugin())
        query = parser.parse(" OR ".join(allowedResults))
        with self._ix.searcher() as searcher:
            results = searcher.search(query, limit=7)
            return self._getResponseObjectFromParams(
                value,
                self._getMatchesFromResults(results, translation),
                [],
                []
            )
    else:
        return None
def search(self, queryEntered, page):
    title = list()
    plot = list()
    poster = list()
    year = list()
    director = list()
    genre = list()
    actors = list()
    tomato_score = list()

    # JY for the sake of demonstrating ranking weight, not going to affect search much visibly.
    # mw = MultiWeighting(BM25F(), tomato_score=FunctionWeighting(custom_weight))
    #      plot=BM25F(B=0.75, plot_B=1.0, K1=2.0), actors=BM25F(B=0.75, actors_B=1.0, K1=1.5), director=TF_IDF()
    with self.indexer.searcher(weighting=BM25F()) as search:
        parser = MultifieldParser(['title', 'plot', 'actors', 'director', 'genre'],
                                  schema=self.indexer.schema, termclass=FuzzyTerm)
        # parser.add_plugin(plugins.FuzzyTermPlugin())
        parser.add_plugin(plugins.SequencePlugin())
        query = parser.parse(queryEntered)
        results = search.search_page(query, page, 20, sortedby={'tomato_score'}, reverse=True)  # 'tomato_score', 'year'
        for x in results:
            title.append(x['title'])
            plot.append(x['plot'])
            poster.append(x['poster'])
            tomato_score.append(x['tomato_score'])
            year.append(x['year'])
            director.append(x['director'])
            actors.append(x['actors'])
            genre.append(x['genre'])
        return title, plot, poster, tomato_score, year, actors, director, genre, results.pagecount if results.pagecount < 23 else 23
def generic(idx, qs=None, q=None, limit=5, parser=None, page=1):
    if qs is q is None:
        raise ValueError('cannot have a null querystring and query')
    if parser is None:
        parser = MultifieldParser(
            ['title', 'keywords', 'summary', 'content', 'author'],
            idx.schema, group=OrGroup)
        # add better date parsing support
        parser.add_plugin(DateParserPlugin())
        parser.remove_plugin_class(WildcardPlugin)
    with idx.searcher() as search:
        # generate the Query object
        if qs:
            query = parser.parse(qs)
        else:
            query = q
        facet = MultiFacet()
        facet.add_score()
        facet.add_field('modified', reverse=True)
        facet.add_field('title')
        results = search.search_page(query, pagenum=page, sortedby=facet, pagelen=limit)
        res = clean_results(idx, results, query)
        # pagination attributes on `search_page` method
        res.page_number = results.pagenum    # current page number
        res.page_total = results.pagecount   # total pages in results
        res.offset = results.offset          # first result of current page
        res.pagelen = results.pagelen        # the number of max results per page
    return res
def get_whoosh_parser(index):
    from whoosh.qparser import MultifieldParser, GtLtPlugin

    # TODO: only active columns
    term_fields = ['content', 'unitid']
    parser = MultifieldParser(term_fields, index.schema)
    parser.add_plugin(GtLtPlugin)
    return parser
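# A minimal usage sketch for get_whoosh_parser above (the index directory and the
# query string are hypothetical; it assumes the schema really has 'content' and
# 'unitid' fields). GtLtPlugin is what allows range syntax such as "unitid:>=100".
from whoosh.index import open_dir

ix = open_dir("indexdir")
parser = get_whoosh_parser(ix)
query = parser.parse(u"content:report unitid:>=100")
with ix.searcher() as searcher:
    for hit in searcher.search(query, limit=10):
        print(hit)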
def _create_parser(self, context):
    parser = MultifieldParser(self.field_boosts.keys(),
                              WhooshBackend.SCHEMA,
                              fieldboosts=self.field_boosts)
    parser.add_plugin(
        MetaKeywordPlugin(meta_keyword_parsers=self.meta_keyword_parsers,
                          context=context))
    return parser
def _query_keys(self, query, limit=None, plugin=None):
    parser = MultifieldParser(self.subscription.table.__searchable__,
                              self.subscription.schema)
    if plugin:
        parser.add_plugin(plugin)
    pk = self.subscription.primary_key
    results = self.subscription.index.searcher().search(parser.parse(query), limit=limit)
    keys = [x[pk.name] for x in results]
    return keys
def search_index(index_dir, schema, attributes, id_name, query):
    ix = index.open_dir(dir + "/" + index_dir)
    mqp = MultifieldParser(attributes, schema=schema, group=OrGroup)
    mqp.add_plugin(FuzzyTermPlugin)
    q = mqp.parse("*%s~3/2*" % (query))
    with ix.searcher() as s:
        results = s.search(q)
        return [r[id_name] for r in results]
def person_query_search(indexdir, queries, user_id, E, n=10, function='BM25F'):
    prediction = user_cf(E, user_id, 3)
    ix = index.open_dir(indexdir)
    search_fields = ['resname', 'categories', 'address', 'city', 'state']  # search fields
    og = qparser.OrGroup.factory(0.9)
    qp = MultifieldParser(search_fields, ix.schema, termclass=query.Variations, group=og)
    qp.add_plugin(DateParserPlugin(free=True))
    q = qp.parse(queries)
    result_index = []
    if function == 'BM25F':
        # with ix.searcher(weighting=scoring.BM25F(B=0.75, resname_B=1.0, categories_B=0.8, K1=1.2)) as s:
        # add weight for the resname and the categories_B
        with ix.searcher(weighting=scoring.BM25F(B=0.75, K1=1.2)) as s:
            scores = sorting.ScoreFacet()
            results = s.search(q, limit=None, sortedby=[scores])
            m = len(results)
            if m != 0:
                relevance = np.zeros(m)
                expected = np.zeros(m)
                for i in range(m):
                    relevance[i] = -results[i].score
                relevance = (relevance - relevance.min()) / (relevance.max() - relevance.min())  # normalized score from 0 to 1
                for i in range(m):
                    expected[i] = relevance[i] * prediction[int(results[i]['ID'])]
                indorder = np.argsort(expected)
                k = min(m, n)
                for i in range(k):
                    result_index.append(int(results[indorder[-1 - i]]['ID']))
    if function == 'TF_IDF':
        with ix.searcher(weighting=scoring.TF_IDF()) as s:
            scores = sorting.ScoreFacet()
            results = s.search(q, limit=None, sortedby=[scores])
            m = len(results)
            if m != 0:
                relevance = np.zeros(m)
                expected = np.zeros(m)
                for i in range(m):
                    relevance[i] = -results[i].score
                relevance = (relevance - relevance.min()) / (relevance.max() - relevance.min())  # normalized score from 0 to 1
                for i in range(m):
                    expected[i] = relevance[i] * prediction[int(results[i]['ID'])]
                indorder = np.argsort(expected)
                k = min(m, n)
                for i in range(k):
                    result_index.append(int(results[indorder[-1 - i]]['ID']))
    return result_index
def answer_query(query):
    with main_index.searcher() as searcher:
        parser = MultifieldParser(['title', 'summary'], main_index.schema,
                                  fieldboosts={'title': 5.0, 'summary': 0.2})
        parser.add_plugin(FuzzyTermPlugin())
        # tilde adds fuzzy parsing for 1 character and /1 requires the first letter to match
        query = parser.parse(unicode(query) + '~/1')
        results = searcher.search(query, limit=100)
        tags = [r['tag'] for r in results]
        return tags
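# A hedged illustration of the fuzzy syntax FuzzyTermPlugin enables in answer_query
# (the sample term is hypothetical):
#   term~      up to one edit away from "term"
#   term~2     up to two edits
#   term~2/3   up to two edits, first three characters must match
#   term~/1    up to one edit, first character must match (the form used above)
print(answer_query(u"pyhton"))  # a misspelling one edit away should still find its tags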
def render_GET(self, request):
    section_path = '/'.join(request.postpath).strip('/')
    if not section_path:
        defer.returnValue(json.dumps({'status': 'error', 'message': 'unable to search root'}))
    section_name = request.postpath[0]
    ix = self._get_index(section_path)
    if not ix:
        defer.returnValue(json.dumps({'status': 'error', 'message': 'unknown index for %s' % section_path}))
    schema_settings = self._get_schema_settings(section_path)
    schema = schema_settings['schema']
    if 'schema' in request.args:
        if section_path in self.currently_indexing:
            yield self.currently_indexing[section_path]
        field_choices = schema_settings.get('field_choices', {})
        fields = {}
        for field in set(schema.names()):
            if isinstance(schema[field], KEYWORD) and field in field_choices:
                fields[field] = sorted(x for x in field_choices[field] if x)
        defer.returnValue(json.dumps({'status': 'ok', 'schema': fields, 'type': schema_settings['type']}))
    if 'q' not in request.args:
        defer.returnValue(json.dumps({'status': 'error', 'message': 'missing q argument in url'}))
    q = unicode(request.args['q'][0])
    parser = MultifieldParser(['search_field'], schema=schema)
    parser.add_plugin(GtLtPlugin())
    query = parser.parse(q)
    with ix.searcher() as searcher:
        results = yield threads.deferToThread(searcher.search, query, limit=10000)
        # corrected = searcher.correct_query(query, q)  # jesus this is bad for titles
        results = [x['linkitem'] for x in results]
    section = settings.SECTIONS[section_name]
    rootfolder = RootFolder(parent_path='', name='Search result for: %s' % q, urlname=self.name, date=0)
    rootfolder['content_type'] = section.levels[0].content_type
    for result in results:
        rootfolder.add_item(result)
    # if corrected.query != query:
    #     retval['suggestion'] = {
    #         'rel': 'suggested_query',
    #         'href': urlparse.urljoin(settings.BASE_URL, '/search/%s' % urllib.quote(section_path)) + '?%s' % urllib.urlencode({'q': corrected.string}),
    #         'suggested_query': corrected.string,
    #     }
    defer.returnValue(rootfolder.serialize())
def query(self, s=None, is_curated=True, is_fiction=True, pagenum=1, allpages=False):
    ''' Search for books using whoosh, or return first page from all '''
    if self.whoosh is None:
        return
    if not s:
        # default to list all authors
        query = Every('author')
    else:
        # create a search by author and title
        qp = MultifieldParser(['author', 'title'], self.whoosh.schema, group=OrGroup)
        # fuzzy query only if wildcard not present
        if '*' not in s:
            qp.add_plugin(FuzzyTermPlugin())
            # setup search terms for fuzzy match
            fuzzy_terms = []
            for w in s.split():
                fuzzy_terms.append('{}~'.format(w))
            s = ' '.join(fuzzy_terms)
        # parse the search terms
        query = qp.parse(s)
    # only filter is_fiction / is_curated when true
    filters = []
    if is_curated is True:
        filters.append(Term('is_curated', is_curated))
    if is_fiction is True:
        filters.append(Term('is_fiction', is_fiction))
    qfilter = And(filters)
    with self.whoosh.searcher() as searcher:
        pagecount = None
        if allpages:
            # special search returning all pages upto pagenum
            results = searcher.search(query, filter=qfilter, limit=(self.pagelen * pagenum))
        else:
            # paginated search for specific page, or to feed infinite scroll
            results = searcher.search_page(query, int(pagenum), filter=qfilter, pagelen=self.pagelen)
            pagecount = results.pagecount
        output = [item.fields() for item in results]
    if pagecount is None:
        pagecount = int(math.ceil(float(len(output)) / self.pagelen))
    return {'results': output, 'pagecount': pagecount}
def _query(self):
    q_str = self.query_params['query']
    qp = MultifieldParser(
        ["content", "title", "correspondent", "tag", "type"],
        self.searcher.ixreader.schema)
    qp.add_plugin(DateParserPlugin())
    q = qp.parse(q_str)

    corrected = self.searcher.correct_query(q, q_str)
    corrected_query = corrected.string if corrected.query != q else None

    return q, corrected_query
def inicia():
    pth = os.path.abspath(os.path.dirname(os.path.realpath(__file__)) + "/indiceJuego")
    if not os.path.exists(pth):
        os.mkdir(pth)
        esquemaJuego = Schema(titulo=KEYWORD(stored=True),
                              descripcion=TEXT,
                              categorias=KEYWORD(stored=True),
                              plataformas=KEYWORD(stored=True),
                              precio=NUMERIC(stored=True))
        indiceJuego = create_in(pth, esquemaJuego)
    else:
        indiceJuego = open_dir(pth)
    parser = MultifieldParser(["titulo"], schema=indiceJuego.schema)
    parser.add_plugin(FuzzyTermPlugin())
    return indiceJuego, parser
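# A hedged usage sketch for inicia() above (the query string is only an example;
# FuzzyTermPlugin allows "~" edit-distance suffixes on terms):
indiceJuego, parser = inicia()
q = parser.parse(u"zelda~2")
with indiceJuego.searcher() as s:
    for hit in s.search(q, limit=10):
        print(hit['titulo'], hit['precio'])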
class DBworldSearcher:
    def __init__(self, indexdir, fieldlist=["subject", "content"]):
        self.indexdir = indexdir
        ix = open_dir(indexdir)
        # self.parser = QueryParser("subject", self.ix.schema)
        self.parser = MultifieldParser(fieldlist, ix.schema)
        self.parser.add_plugin(DateParserPlugin())
        self.searcher = ix.searcher()

    def search(self, querytext, limit):
        myquery = self.parser.parse(querytext)
        results = self.searcher.search(myquery, limit=limit)
        return results
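# A minimal usage sketch for DBworldSearcher (the index directory is a placeholder,
# and the 'date' field in the query assumes the underlying schema has a DATETIME
# field; DateParserPlugin is what makes the human-readable date expression parse):
dbworld = DBworldSearcher("dbworld_index")
for hit in dbworld.search(u"subject:workshop date:'last tuesday'", limit=10):
    print(hit)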
def search_in_index(search_kw, index):
    parser = MultifieldParser(["content", "title"], index.schema)
    parser.add_plugin(FuzzyTermPlugin())
    searcher = index.searcher()
    to_parse = ' '.join([i + '~0' for i in search_kw.split(' ')])
    myquery = parser.parse(to_parse)
    r = searcher.search(myquery)
    results = []
    for res in r:
        results.append(res['path'])
    corrector = searcher.corrector("content")
    suggestions = []
    for kw in search_kw.split(' '):
        suggestions.append(corrector.suggest(kw))
    searcher.close()
    return results, suggestions
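# A hedged example of calling search_in_index above (the index object and keywords
# are placeholders). The '~0' suffix built inside the function allows zero edits,
# so only exact terms match; suggestions come from the spelling corrector on the
# 'content' field.
from whoosh.index import open_dir

ix = open_dir("indexdir")
paths, suggestions = search_in_index("whoosh serch", ix)
print(paths)        # paths of matching documents
print(suggestions)  # e.g. corrections proposed for "serch"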
def query_page(ix, page, querystring, more_like_doc_id, more_like_doc_content):
    searcher = ix.searcher()
    try:
        if querystring:
            qp = MultifieldParser(
                ["content", "title", "correspondent", "tag", "type"],
                ix.schema)
            qp.add_plugin(DateParserPlugin())
            str_q = qp.parse(querystring)
            corrected = searcher.correct_query(str_q, querystring)
        else:
            str_q = None
            corrected = None

        if more_like_doc_id:
            docnum = searcher.document_number(id=more_like_doc_id)
            kts = searcher.key_terms_from_text(
                'content', more_like_doc_content,
                numterms=20,
                model=classify.Bo1Model,
                normalize=False)
            more_like_q = query.Or(
                [query.Term('content', word, boost=weight) for word, weight in kts])
            result_page = searcher.search_page(more_like_q, page, filter=str_q, mask={docnum})
        elif str_q:
            result_page = searcher.search_page(str_q, page)
        else:
            raise ValueError("Either querystring or more_like_doc_id is required.")

        result_page.results.fragmenter = highlight.ContextFragmenter(surround=50)
        result_page.results.formatter = JsonFormatter()

        if corrected and corrected.query != str_q:
            corrected_query = corrected.string
        else:
            corrected_query = None

        yield result_page, corrected_query
    finally:
        searcher.close()
def whoosh_search(self, query, searcher, index, info):
    ret = ''
    # run a whoosh search and display the hits
    # query applies to all fields in the schema
    # special query: ALL, ANY
    limit = int(self.options['limit'] or '1000000')
    if query in ['ALL', 'ANY']:
        from whoosh.query.qcore import Every
        q = Every()
    else:
        from whoosh.qparser import MultifieldParser, GtLtPlugin
        # TODO: only active columns
        term_fields = [item[0] for item in index.schema.items()]
        parser = MultifieldParser(term_fields, index.schema)
        parser.add_plugin(GtLtPlugin)
        q = parser.parse(u'%s' % query)
    if query in ['ANY']:
        limit = 1
    afield = self.options['field']
    res = searcher.search(q, limit=limit)
    vs = {}
    for hit in res:
        if afield:
            # display only the unique value in the requested field
            vs[hit[afield]] = vs.get(hit[afield], 0) + 1
        else:
            # display all field, value in this record
            for k, v in hit.iteritems():
                ret += '\t%-20s %s\n' % (k, repr(v)[0:30])
            ret += '\t' + ('-' * 20) + '\n'
    if vs:
        for v, c in vs.iteritems():
            ret += '\t%6s x %s\n' % (c, repr(v))
    info['results'] = ret
    info['result_size'] = len(res)
    ret += '\n\n%s documents found' % len(res)
    return ret
def search(self, query_list, fields=None):
    with self.ix.searcher() as searcher:
        query_list2 = []
        for qq in query_list:
            if qq == 'AND' or qq == 'OR':
                query_list2.append(qq)
            else:
                query_list2.append(qq.lower())
        query_string = " ".join(query_list2)
        query = None
        if ":" in query_string:
            # If the user DOES specify a field,
            # setting the fields determines what fields
            # are searched with the free terms (no field)
            fields = ['title', 'content', 'owner_name', 'owner_email', 'github_user']
            query = MultifieldParser(fields, schema=self.ix.schema)
            est = pytz.timezone('America/New_York')
            query.add_plugin(DateParserPlugin(free=True, basedate=est.localize(datetime.utcnow())))
            query.add_plugin(GtLtPlugin())
            try:
                query = query.parse(query_string)
            except:
                # Because the DateParser plugin is an idiot
                query_string2 = re.sub(r':(\w+)', ':\'\g<1>\'', query_string)
                try:
                    query = query.parse(query_string2)
                except:
                    print("parsing query %s failed" % (query_string))
                    print("parsing query %s also failed" % (query_string2))
                    query = query.parse('')
        else:
            # If the user does not specify a field,
            # these are the fields that are actually searched
            fields = ['url', 'title', 'content', 'owner_name', 'owner_email', 'github_user']
            query = MultifieldParser(fields, schema=self.ix.schema)
            est = pytz.timezone('America/New_York')
            query.add_plugin(DateParserPlugin(free=True, basedate=est.localize(datetime.utcnow())))
            query.add_plugin(GtLtPlugin())
            try:
                query = query.parse(query_string)
            except:
                print("parsing query %s failed" % (query_string))
                query = query.parse('')
        parsed_query = "%s" % query
        print("query: %s" % parsed_query)
        results = searcher.search(query, terms=False, scored=True, groupedby="kind")
        search_result = self.create_search_result(results)
        return parsed_query, search_result
def get(self, search):
    ix = open_dir("index")
    with ix.searcher() as searcher:
        qp = MultifieldParser(['title', 'content', 'url'], ix.schema)
        qp.add_plugin(DateParserPlugin())
        query = qp.parse(search)
        results = searcher.search(query)
        self.write(
            tornado.escape.json_encode([{
                'title': r.get('title'),
                'url': r.get('url'),
                'date': r.get('date').strftime("%A, %d. %B %Y %I:%M%p"),
                'hash': r.get('hash', 'blank')
            } for r in results[:10]]))
        self.set_header('Content-Type', 'application/json')
def search_for_track(self, querystring):
    if len(querystring) >= 3:
        with self.ix.searcher() as searcher:
            collector = searcher.collector(limit=20)
            tlc = TimeLimitCollector(collector, timelimit=1.4, use_alarm=False)
            parser = MultifieldParser(["artist", "album", "title"], self.ix.schema)
            parser.add_plugin(qparser.FuzzyTermPlugin())
            myquery = parser.parse(querystring)
            try:
                searcher.search_with_collector(myquery, tlc)
                if len(tlc.results()) == 0:
                    myquery = parser.parse(" ".join(word + "~2" for word in querystring.split()))
                    searcher.search_with_collector(myquery, tlc)
            except TimeLimit:
                logging.info("Time Limit for query reached!")
                logging.debug("query time: %s", collector.runtime)
            ret = [self.__tracks[int(result["id"])] for result in tlc.results()]
            return ret
    else:
        return []
def page(self, page, limit):
    with self.engine.index.searcher() as searcher:
        parser = MultifieldParser(
            self.engine.search_fields,
            schema=self.engine.index.schema,
        )
        parser.add_plugin(GtLtPlugin())
        parser.add_plugin(PhrasePlugin())
        parser.add_plugin(FieldsPlugin())
        # parser.remove_plugin_class(WildcardPlugin)
        # parser.add_plugin(WildcardPlugin())
        parser.add_plugin(PrefixPlugin())
        whoosh_query = parser.parse(self.query.toString(self.engine))
        # print "============" + str(whoosh_query)
        results = searcher.search_page(whoosh_query, page, limit, sortedby=self.order)
        self.rows = results.total
        _results = []
        doc_class = self.engine.database.document
        for result in results:
            doc = doc_class(data={field: result.get(field, None) for field in self.engine.stored_fields},
                            restore=True)
            _results.append(doc)
        return _results
def query_page(ix, querystring, page):
    searcher = ix.searcher()
    try:
        qp = MultifieldParser(
            ["content", "title", "correspondent", "tag", "type"],
            ix.schema)
        qp.add_plugin(DateParserPlugin())
        q = qp.parse(querystring)
        result_page = searcher.search_page(q, page)
        result_page.results.fragmenter = highlight.ContextFragmenter(surround=50)
        result_page.results.formatter = JsonFormatter()

        corrected = searcher.correct_query(q, querystring)
        if corrected.query != q:
            corrected_query = corrected.string
        else:
            corrected_query = None

        yield result_page, corrected_query
    finally:
        searcher.close()
def query_parser(self, default_fields, idx_name=LATEST_REVS):
    """
    Build a query parser for a list of default fields.
    """
    schema = self.schemas[idx_name]
    if len(default_fields) > 1:
        qp = MultifieldParser(default_fields, schema=schema)
    elif len(default_fields) == 1:
        qp = QueryParser(default_fields[0], schema=schema)
    else:
        raise ValueError("default_fields list must at least contain one field name")
    qp.add_plugin(RegexPlugin())

    def username_pseudo_field(node):
        username = node.text
        users = user.search_users(**{NAME_EXACT: username})
        if users:
            userid = users[0].meta['userid']
            node = WordNode(userid)
            node.set_fieldname("userid")
            return node
        return node

    qp.add_plugin(PseudoFieldPlugin({'username': username_pseudo_field}))
    return qp
def search_in_index(search_kw, index):
    '''
    search_kw: what the user types into the search bar
    index: the opened index (the ix object from the preceding code)

    The function returns a dictionary with the keys:
    - results: a list of dictionaries, one per search result, best result first.
      Each dictionary has two keys: 'title' with the document title and 'path'
      with the path (to the text document, for now)
    - suggestions: a dictionary of suggestions proposing a possible correction
      for each word entered by the user. Still to be decided how to combine the
      per-word suggestions into complete suggested queries.
    '''
    # use a MultifieldParser to search both the title and the content
    parser = MultifieldParser(["content", "title"], index.schema)
    # add a fuzzy-matching plugin so the search goes beyond exact words
    parser.add_plugin(FuzzyTermPlugin())
    searcher = index.searcher()
    # rewrite the user query into the format understood by the fuzzy-matching plugin
    to_parse = ' '.join([i + '~1' for i in search_kw.split(' ')])
    myquery = parser.parse(to_parse)
    # collect the results so the searcher can be closed afterwards
    r = searcher.search(myquery)
    results = []
    for res in r:
        results.append({'title': res['title'], 'path': res['path']})
    # set up the corrector and store its proposal for each typed word
    corrector = searcher.corrector("content")
    suggestions = {}
    for kw in search_kw.split(' '):
        suggestions[kw] = corrector.suggest(kw)
    # close the searcher
    searcher.close()
    return {'results': results, 'suggestions': suggestions}
def search(self, queries, fuzzy=True, default_fields=[], max_results=None):
    if type(queries) != list:
        queries = [queries]
    if type(default_fields) != list:
        default_fields = [default_fields]
    if fuzzy and len(queries) == 1 and len(queries[0].split()) == 1 and ':' not in queries[0] and '*' not in queries[0]:
        queries = ['*%s*' % (queries[0])]
    for query in queries:
        if type(query) != unicode:
            query = query.decode('utf-8')
        log.msg('search query: %s' % (query))
        with self.ix.searcher() as searcher:
            parser = MultifieldParser(default_fields, self.ix.schema)
            parser.remove_plugin_class(plugins.WildcardPlugin)
            parser.add_plugin(WildcardPlugin)
            query = parser.parse(query)
            log.msg('search query parsed: %s' % (query))
            results = searcher.search(query, limit=None)
            count = 0
            for result in results:
                yield result['oid']
                count += 1
                if max_results and count >= max_results:
                    break
class ConceptSearcher:
    def __init__(self, tree_dict, tree_identity):
        self.ix = None
        self.parser = None
        self.id_ = tree_identity
        self._tree_dict = tree_dict
        self.get_schema()

    def search(self, query_string, limit=50, allowed_nodes: set = None):
        with self.ix.searcher() as searcher:
            query = self.parser.parse(query_string)
            if allowed_nodes is not None:
                allowed_nodes = {
                    doc_num
                    for doc, doc_num in zip(searcher.documents(), searcher.document_numbers())
                    if doc.get('fullname') in allowed_nodes
                }
            results = searcher.search(query, limit=limit, filter=allowed_nodes)
            return [r['fullname'] for r in results]

    def get_schema(self):
        schema_dir = os.path.join(cache_dir, self.id_)
        os.makedirs(schema_dir, exist_ok=True)
        if exists_in(schema_dir) and open_dir(schema_dir).doc_count() != 0:
            self.ix = open_dir(schema_dir)
            print('Existing index cache found. Loaded {} tree nodes. Hooray!'.format(self.ix.doc_count()))
        else:
            print('No valid cache found. Building indexes...')
            now = time.time()
            self.__build_whoosh_index(schema_dir)
            print('Finished in {:.2f} seconds'.format(time.time() - now))
        self.parser = MultifieldParser(self.ix.schema.names(), schema=self.ix.schema)
        self.parser.add_plugin(FuzzyTermPlugin())

    def __build_whoosh_index(self, schema_dir):
        fields = dict(
            node=TEXT(),
            fullname=TEXT(stored=True),
            path=TEXT(),
            type=NGRAM(minsize=4),
            study=NGRAM(field_boost=10.0),
            name=NGRAMWORDS(minsize=3, field_boost=3.0),
            metadata=NGRAMWORDS(minsize=3),
        )
        schema = Schema(**fields)
        self.ix = create_in(schema_dir, schema)
        with self.ix.writer(procs=2, multisegment=True, limitmb=512) as writer:
            for key, value in self._tree_dict.items():
                writer.add_document(node=key.replace('\\', ' ').replace('_', ' '),
                                    path=value.get('conceptPath'),
                                    fullname=key,
                                    type=value.get('type'),
                                    study=str(value.get('studyId')),
                                    name=str(value.get('name')),
                                    metadata=str(value.get('metadata')))
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser
from whoosh.qparser.dateparse import DateParserPlugin

ix = open_dir("index")
with ix.searcher() as searcher:
    qp = MultifieldParser(['title', 'content', 'url'], ix.schema)
    qp.add_plugin(DateParserPlugin())
    query = qp.parse("qoqa")
    results = searcher.search(query, terms=True)
    for r in results:
        print(r.get('hash'))
        print(r.matched_terms())
        # print(r.highlights('title'))
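# Hedged variants of the query above showing what DateParserPlugin adds (they
# assume the schema also has a DATETIME 'date' field; the query strings are only
# examples):
with ix.searcher() as searcher:
    q1 = qp.parse(u"qoqa date:yesterday")
    q2 = qp.parse(u"qoqa date:[20230101 to 20231231]")
    print(q1, q2)
    print(len(searcher.search(q1)))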
_string = sys.argv[1]
_mode = sys.argv[2]
normal = (_mode == "normal")
_distance = 0
if (normal is False):
    _distance = int(sys.argv[3])

with ix.searcher() as searcher:
    # og = qparser.OrGroup.factory(0.9)
    parser = MultifieldParser(["title", "sub_title", "author", "content"], schema=ix.schema)
    # parser = qparser.QueryParser("content", ix.schema)
    parser.remove_plugin_class(qparser.PhrasePlugin)
    parser.add_plugin(qparser.SequencePlugin())
    if (normal):
        string = _string
        query = parser.parse(string)
    else:
        # proximity
        distance = _distance
        proximty_query = "\"" + _string + "\"" + '~' + str((1 + distance) * 3)
        query = parser.parse(proximty_query)
    # sys.stdout.buffer.write(query)
    sys.stdout.buffer.write(">>>>>>OUTPUT start<<<<<<".encode('utf-8'))
    results = searcher.search(query, limit=20)
    results.fragmenter.maxchars = 100  # Show more context before and after
class WhooshEngine(Engine): def __init__(self, config): self.schema = Schema( id=ID(unique=True), title=TEXT(stored=True, field_boost=3.0, analyzer=StandardAnalyzer() | NgramFilter(minsize=2, maxsize=3)), author=TEXT(stored=True), creation_date=DATETIME(stored=True), pages=STORED, content=TEXT(stored=True, analyzer=StandardAnalyzer(stoplist=None)), lang=TEXT(stored=True), size=STORED, tags=KEYWORD(stored=True, commas=True) ) self.index_path = config['WHOOSH_INDEX'] if not os.path.exists(self.index_path): os.mkdir(self.index_path) create_in(self.index_path, self.schema) self.indexer = open_dir(self.index_path) self.parser_content = MultifieldParser(["title", "content"], schema=self.schema) self.parser_content.add_plugin(DateParserPlugin()) self.date_format = { 'last_24h': u'-24h to now', 'last_week': u'last week', 'last_month_to_now': u'-1mo to now', 'last_year_to_now': u"[-2yrs to now]" } def index_document(self, data): """ Index a document :param data: set of data values per attributes to index :return: void """ writer = self.indexer.writer() writer.add_document( id=unicode(data['Id']), author=data['Author'], title=data['Title'], creation_date=data['CreationDate'], content=unicode(data['Content'], 'utf-8'), pages=data['NumPages'], size=data['Size'], lang=data['Lang'], tags=unicode(','.join(data['Tags']), 'utf-8') ) writer.commit() def update_document(self, id, data): """ Update the documents attributes :param id: document's identifier :param data: set of values per attributes :return: void """ writer = self.indexer.writer() data['id'] = unicode(id) writer.update_document(**data) writer.commit() def delete_document(self, id): """ Delete a document by id :param id: document's identifier :return: void """ self.indexer.delete_by_term('id', unicode(id)) self.indexer.commit() def delete_index(self): """Delete de current index""" create_in(self.index_path, schema=self.schema) def search_ngram(self, args): """ Get n-gram result ,when you typing it show the result :param args: The field to query like a title by example :return data: a result list that matched """ criteria = args['criteria'] with self.indexer.searcher() as searcher: simple_parser = QueryParser("title", group=OrGroup, schema=self.schema) query = simple_parser.parse(criteria) response = searcher.search_page(query, pagenum=1) data = [result['title'] for result in response.results] return data def search_document(self, args): """ Search all documents that mach with the query :param args: params from request to query :return: query result with pagination """ with self.indexer.searcher() as searcher: query = self.parser_content.parse(args['criteria']) filters = self.create_filters(args) response = searcher.search_page(query, pagenum=args['page'], filter=filters) return self.normalize_data(response) def create_filters(self, args): """ :param args: params from request to filter :return: Search instance """ allow_list = [] if "creation_date" in args: # :creation date format ex: creation_date:[last week to now] date_query = 'creation_date:' + self.date_format[args["creation_date"]] allow_list.append(date_query) if "lang" in args: # :lang format ex: (lang:es OR lang:en) lang_list = ['lang:' + lang for lang in args['lang']] lang_query = ' OR '.join(lang_list) allow_list.append('(' + lang_query + ')') if "author" in args: # :autor format ex: (author:Antonio) author_query = 'author:' + args['author'] allow_list.append('(' + author_query + ')') if "tags" in args: # :tags format ex: (tag:'history' OR tag:'docker') tag_list = ['tags:' + tag for tag 
in args['tags']] tag_query = ' OR '.join(tag_list) allow_list.append('(' + tag_query + ')') # : all filters query_string = u' AND '.join(allow_list) return self.parser_content.parse(query_string) if query_string else None def normalize_data(self, response): """ Normalize the response adding pagination :param response: Response from elastic search :return: data normalized """ data = {'items': [], 'id_list': []} response.results.fragmenter.surround = 80 #: summary length # page_result.results.fragmenter.maxchars = 300 my_cf = highlight.SentenceFragmenter() # page_result.results.fragmenter = my_cf for result in response.results: # print result.title result_dict = dict(result) result_dict['summary'] = result.highlights("content", top=2) data['items'].append(result_dict) data['id_list'].append(int(result_dict['id'])) data['total'] = response.total data['pages'] = response.pagecount data['page'] = response.pagenum return data def rebuild_index(self): shutil.rmtree(self.index_path) os.mkdir(self.index_path) create_in(self.index_path, schema=self.schema)
# coding=utf-8
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser
from whoosh.qparser import FuzzyTermPlugin

idx_dir = 'lagou_idx'
ix = open_dir(idx_dir)
searcher = ix.searcher()

parser = MultifieldParser(["name", "desc"], schema=ix.schema)
parser.add_plugin(FuzzyTermPlugin())

# Multi-field parser over the name and desc fields.
k = u'搜索 OR Pythn~2 city:上海'
q = parser.parse(k)

results = searcher.search_page(q, 1, pagelen=5)
print(u'{0} results found for keyword {1}, {2} returned: '.format(len(results), k, results.scored_length()))
for hit in results[:50]:
    print(hit['id'])
    print(hit['name'])
    # print(hit['city'])
    print(hit['com_name'])
    print('************')
def main(): """ The main loop for the program """ g = Gui() ix = index.open_dir("indexdir") while True: event, values = g.window.read() g.window["_output_"]('') # close windows if event is None: break if event == '_SEARCH_' and values['TERM'] is not None: # il parametro 'fieldboosts' regola quanta importanza dare ai match nei vari campi qp = MultifieldParser( ["procTitle", "topics", "categories", "procContent"], termclass=Variations, schema=ix.schema, fieldboosts={ "procTitle": 1.5, "categories": 1.3 }) qp.add_plugin(FuzzyTermPlugin()) terms = str(values['TERM']) terms = terms.replace("title", "procTitle").replace("topic", "topics") \ .replace("category", "categories").replace("content", "procContent") # Modifica della query immessa con aggiunta dei sinonimi nel caso l'opzione sia abilitata, con attenzione # al riportare i token booleani senza modifiche ed a tradurre correttamente la definizione dei campi in cui # ricercare i termini se richiesti. if values['syn_search']: with open("utils/wn_s.pl", "r") as f: thesaurus = Thesaurus.from_file(f) termsWithSynonyms = [] for term in terms.split(" "): field = None if ":" in term: field = term.split(":")[0] term = term.split(":")[1] if term not in booleanTokens: termSynonyms = thesaurus.synonyms(term) if field is not None: termSynonyms = [ f"{field}:{t}" for t in termSynonyms ] termSynonyms.append(f"{field}:{term}") else: termSynonyms.append(term) termsWithSynonyms.append(" OR ".join(termSynonyms)) else: termsWithSynonyms.append(term) terms = ' '.join(termsWithSynonyms) print("- Searching for >>> " + str(terms)) # stemming dei termini della query e aggiunta della tilde per ricerca "fuzzy" a quelle effettivamente modificate words = terms.split(' ') stemmedWords = list() for word in words: stemmed = stem(word) if word != stemmed: stemmedWords.append(stemmed + '~') else: stemmedWords.append(stemmed) q = qp.parse(' '.join(stemmedWords)) with ix.searcher() as searcher: if not values['syn_search']: correction = searcher.correct_query(q=q, qstring=terms, maxdist=2) if terms != correction.string: print("- Did you mean >>> " + correction.string) results = searcher.search(q, terms=True) if not values['syn_search'] and results.is_empty(): print( "- No relevant result has been found for query, trying corrected query" ) results = searcher.search(qp.parse(correction.string)) numb = 1 if not results.is_empty(): for elem in results: # print(elem) print( f"Result n.{numb} >>> Title: {str(elem['docTitle'])}\n\tScore: {str(elem.score)}\n" f"\tLink to the page: {str(elem['pageUrl'])}\n") numb += 1 else: print("- No relevant result has been found")
class BockCore(): def __init__(self, articles_path): """Attempt to initialize a folder with Markdown articles. If a git repo, create a search index and populate. Markdown Extension References * http://facelessuser.github.io/pymdown-extensions * https://pythonhosted.org/Markdown/extensions """ self.article_repo = Repo(articles_path) self.articles_path = articles_path self.markdown_extensions = [ 'markdown.extensions.abbr', 'markdown.extensions.attr_list', 'markdown.extensions.def_list', 'markdown.extensions.fenced_code', 'markdown.extensions.footnotes', 'markdown.extensions.tables', 'markdown.extensions.smart_strong', 'markdown.extensions.admonition', 'markdown.extensions.codehilite', 'markdown.extensions.headerid', 'markdown.extensions.sane_lists', 'markdown.extensions.smarty', 'markdown.extensions.toc', 'markdown.extensions.wikilinks', 'pymdownx.betterem', 'pymdownx.caret', 'pymdownx.githubemoji', 'pymdownx.headeranchor', 'pymdownx.magiclink', 'pymdownx.mark', 'pymdownx.smartsymbols', 'pymdownx.tasklist', 'pymdownx.tilde', 'pymdownx.critic', ] self.markdown_extensions_config = { 'markdown.extensions.codehilite': { 'css_class': 'code-highlight' } } self.__search_schema = Schema( title=ID(stored=True, unique=True), path=ID(stored=True), content=TEXT, ) self.__search_parser = MultifieldParser( ['title', 'content'], schema=self.__search_schema, ) self.__search_parser.add_plugin(FuzzyTermPlugin()) self.__search_index = self.create_search_index() self.populate_search_index() # ------------------------ Article Functions ------------------------ def markdown_to_html(self, text): """Converts a given Markdown string to HTML """ return markdown.markdown( text=text, output_format='html5', extensions=self.markdown_extensions, extension_configs=self.markdown_extensions_config, ) def raw_article(self, article_path): """Return the text contents of an article """ with open(self.full_article_path(article_path)) as f: article_content = f.read() return article_content def processed_article(self, article_path): """Return the 'marked-down' HTML version of an article """ return self.markdown_to_html(self.raw_article(article_path)) def article_last_modified(self, article_path): """Return the last modified date of a given article in ISO8601 format """ return str( arrow.get( os.stat( self.full_article_path(article_path) ).st_mtime ) ) def article_last_modified_human(self, article_path): """Return the last modified date of a given article in a human-readable format """ return arrow.get( self.article_last_modified(article_path) ).humanize() def is_article_modified(self, article_path): """Determine if the article is modified """ if not os.path.isfile(self.full_article_path(article_path)): raise FileNotFoundError if article_path in self.list_of_uncommitted_articles: return True return False def get_article(self, article_path): """A convenience method that returns an object with a single article and associated metadata """ return { 'title': self.article_title(article_path), 'html': self.processed_article(article_path), 'raw': self.raw_article(article_path), 'modified': self.article_last_modified(article_path), 'modified_humanized': self.article_last_modified_human( article_path ), 'uncommitted': self.is_article_modified(article_path), } @property def simple_list_of_articles(self): """Return a simple list of articles """ return sorted([ re.sub( r'^/?', '', _.replace(self.articles_path, '').replace('.md', '') ) for _ in glob('{}/**/*.md'.format(self.articles_path)) ]) @property def list_of_articles(self): """Return a 
simple list of articles with information on whether they've been modified """ uncommitted_list = self.list_of_uncommitted_articles simple_list = self.simple_list_of_articles return [ { 'title': _, 'uncommitted': True if _ in uncommitted_list else False } for _ in simple_list ] @property def alphabetized_list_of_articles(self): """Return an alphabetized list of articles with information on whether they've been modified """ alphabetized_list = defaultdict(list) uncommitted_list = self.list_of_uncommitted_articles for _ in self.simple_list_of_articles: alphabetized_list[_[:1].upper()].append( { 'title': _, 'uncommitted': True if _ in uncommitted_list else False } ) return alphabetized_list @property def list_of_uncommitted_articles(self): """Return a list of articles that have been modified """ return [ _.a_path.replace('.md', '') for _ in self.article_repo.index.diff(None) ] def escape_html(self, text): html_escape_table = { '&': '&', '"': '"', "'": ''', '>': '>', '<': '<', } return ''.join(html_escape_table.get(c, c) for c in text) # ------------------------ Path Functions ------------------------ """ Paths come in as folder one/folder two/some article In `helpers`, that path is always referred to as `article_path`. Need functions to turn `article_path` into # Article namespace only folder one/folder two # Article title only some article # Article title with extension some article.md # Full path to article file /path/to/repo/folder one/folder two/some article.md """ def article_namespace(self, article_path): """Return only the article namespace without trailing slashes """ match = re.match(r'^(.*)/.*$', article_path) return match.group(1).lstrip('/') if match else '' def article_title(self, article_path): """Return just the article title without a namespace. """ match = re.match(r'^(.*/)?(.*)$', article_path) # TODO: Improve the regex and avoid this! 
if match.group(2)[-3:].upper() == '.MD': title = match.group(2)[:-3] else: title = match.group(2) return title def article_path_with_extension(self, article_path): """Silly, really: Just add a '.md' to the article's title """ return '{}.md'.format(article_path) def full_article_path(self, article_path): """Return the full path to the article on disk """ return '{}/{}/{}.md'.format( self.articles_path, self.article_namespace(article_path), self.article_title(article_path), ) def article_title_with_extension(self, article_path): """Return the article title from the URL path with the ".md" extension """ return '{}.md'.format(self.article_title(article_path)) # ------------------------ Repository Functions ------------------------ def get_commits(self, article_path): """Returns a list of commits as `Commit` objects for a given article title """ return [ _ for _ in self.article_repo.iter_commits( paths=self.article_path_with_extension(article_path) ) ] def get_commit(self, article_path, sha): """Fetches a single `Commit` object for a given article title and commit SHA """ commit = [ _ for _ in self.get_commits(article_path) if _.hexsha == sha ] return commit[0] if commit else None def get_blob(self, article_path, commit): """Get the git blob for a given commit and article title """ namespaces = article_path.split('/') if len(namespaces) == 1: blob = [ _ for _ in commit.tree.blobs if _.name == self.article_title_with_extension(article_path) ] else: subtree_with_blob = commit.tree[namespaces[0]] for namespace in namespaces[1:-1:]: subtree_with_blob = subtree_with_blob[namespace] blob = [ _ for _ in subtree_with_blob.blobs if _.name == self.article_title_with_extension(article_path) ] return blob[0] if blob else [] def get_revision_list(self, article_path): """Get a list of revision objects for a given article title """ revisions = [] for commit in self.get_commits(article_path): committed_date = arrow.get(commit.committed_date) revisions.append({ 'id': commit.hexsha, 'message': commit.message, 'author': commit.author.name, 'email': commit.author.email, 'committed': str(committed_date), 'committed_humanized': committed_date.humanize() }) return revisions def get_revision(self, article_path, sha): """Get a single revision from a blob object for a given article title and commit ID """ commit = self.get_commit(article_path, sha) if not commit: return None commit_date = arrow.get(commit.committed_date) blob = self.get_blob(article_path, commit) raw_article_content = ( blob.data_stream.read().decode('UTF-8').replace('\u00a0', '') if blob else self.raw_article(article_path) ) return { 'title': self.article_title(article_path), 'html': self.markdown_to_html(raw_article_content), 'raw': raw_article_content, 'committed': str(commit_date), 'committed_humanized': commit_date.humanize(), } def get_diff(self, article_path, a, b): """Return a diff string between two revisions of a given article title. 
""" revision_a = self.get_revision(article_path, a) revision_b = self.get_revision(article_path, b) unified_diff = '\n'.join( list( difflib.unified_diff( revision_a['raw'].splitlines(), revision_b['raw'].splitlines(), fromfile='{}/{}'.format('a', article_path), tofile='{}/{}'.format('b', article_path), lineterm='', ) ) ) diff_template = """diff --git a/{title} b/{title} index {sha_a}..{sha_b} {file_mode} {diff} """ unified_diff = diff_template.format( title=article_path, diff=unified_diff, sha_a=a[0:7], sha_b=b[0:7], file_mode=oct( os.stat(self.full_article_path(article_path)).st_mode )[2:] ) # Escape HTML and "non-breaking space" return self.escape_html(unified_diff) def pull_commits(self): """Pull all changes to the article repository from the default remote. An empty list denotes a successful pull. """ errors = [] try: self.article_repo.remote().pull() except Exception as e: errors.append(str(e)) return errors # ------------------------ Search Functions ------------------------ def create_search_index(self): """Create a search index in the articles path. The folder is named .search_index """ document_path = self.articles_path schema = self.__search_schema index_path = '{}/.search_index'.format(document_path) if not os.path.exists(index_path): os.mkdir(index_path) logger.info('Creating index') search_index = index.create_in(index_path, schema) return search_index def update_index_with(self, entity): """Update the search index with either a single article title or a list of titles """ writer = self.__search_index.writer() if type(entity) is not list: entity = [entity] for _ in entity: with open(self.full_article_path(_)) as f: try: writer.update_document( title=_, path=self.full_article_path(_), content=f.read() ) logger.debug('Updated {}'.format(self.article_title(_))) except ValueError as e: logger.error('Skipping {} ({})'.format(_, str(e))) writer.commit() def delete_from_index(self, article_path): logger.debug('Trying {}'.format(article_path)) writer = self.__search_index.writer() writer.delete_by_term('title', article_path) logger.debug('Removed {}'.format(article_path)) writer.commit() def populate_search_index(self): """Wraps the `update_index_with` function for the entire list of articles """ self.update_index_with(self.simple_list_of_articles) def search_articles(self, query_string): """Searches the index with the given query string and returns an object with search results and metadata """ if len(query_string) < 3: raise ValueError('Search query must be longer than three chars') search_results = { 'query': query_string, 'count': 0, 'results': None } query = self.__search_parser.parse(query_string) with self.__search_index.searcher() as searcher: results = searcher.search(query, terms=True, limit=None) results.fragmenter.maxchars = 400 results.fragmenter.surround = 100 search_results['count'] = len(results) if search_results['count'] > 0: search_results['results'] = [] for hit in results: search_results['results'].append({ 'title': hit['title'], 'content_matches': hit.highlights( 'content', text=open(hit["path"]).read() ) }) return search_results
class fetcher(object): def __init__(self, path): self.idxpath = path self.ix = open_dir(self.idxpath) self.query = MultifieldParser(['content','ctime'], schema=self.ix.schema) self.query.add_plugin(DateParserPlugin()) self.sorter = MultiFacet(["ctime", ScoreFacet()]) self.parser = ttp.Parser(); self.dateparser = parser.parser(); def fetch_thread_by_tid(self, retid): t1 = int(round(time.time() * 1000)) tweets = [] try : searcher = self.ix.searcher() results = searcher.documents(retweetid=retid) for r in results: tweet = json.loads(r['json']) tweet['created_at'] = self.dateparser.parse(tweet['created_at']) tweets.append(tweet) except Exception as e: print 'fetch_tweets error' + str(e) finally: searcher.close() t2 = int(round(time.time() * 1000)) tweets = sorted(tweets, key=lambda x: x['created_at'], reverse=False) print '----> fetch tweets by retweet id ' + str(t2 - t1) + ' ms' return tweets def fetch_tweets_by_uid(self, uid): t1 = int(round(time.time() * 1000)) try : searcher = self.ix.searcher() results = searcher.documents(ownerid=uid) tweets = [] for r in results: tweet = json.loads(r['json']) tweet['user']['retweet_at'] = self.dateparser.parse(tweet['created_at']) tweet['created_at'] = self.dateparser.parse(tweet['created_at']) tweets.append(tweet) except Exception as e: print 'fetch_tweets error' + str(e) finally: searcher.close() t2 = int(round(time.time() * 1000)) print '----> fetch tweets for the specified user costs ' + str(t2 - t1) + ' ms' return tweets def fetch_tweets_by_keyword(self, keyword, start, topk): print 'thread : ' + keyword t1 = int(round(time.time() * 1000)) tweets = [] users = [] tweetids = {} qtext = unicode('ctime:[' + str(start) + ' to] AND ' + 'content:(' + keyword + ')') try : searcher = self.ix.searcher() q = self.query.parse(qtext) results = searcher.search(q) for r in results: t = json.loads(r['json']) tt = t; if 'retweeted_status' in t and t['retweeted_status'] is not None: t = t['retweeted_status'] tid = t['id_str'] if tid not in tweetids: user = { "id":tt['user']['id_str'], "retweet_time":self.dateparser.parse(tt['created_at']).strftime('%Y%m%d%H%M%S'), "screen_name":tt['user']['screen_name'], "profile_image_url":tt['user']['profile_image_url'], "followers_count":tt['user']['followers_count'] }; users.append(user) tweet = {} tweet['id'] = tid tweet['text'] = t['text'] tweet['creator'] = {} tweet['creator']['id'] = t['user']['id_str'] tweet['creator']['creator'] = t['user']['screen_name'] tweet['creator']['creator_img'] = t['user']['profile_image_url'] tweet['retweet_count'] = t['retweet_count'] tweet['created_at'] = self.dateparser.parse(t['created_at']).strftime('%Y%m%d%H%M%S') tweet['retweet_history'] = [user] tweet['rank'] = max(t['user']['followers_count'], tt['user']['followers_count']) * t['retweet_count'] tweetids[tid] = tweet tweets.append(tweet) else : user = { "id":tt['user']['id_str'], "retweet_time":self.dateparser.parse(tt['created_at']).strftime('%Y%m%d%H%M%S'), "screen_name":tt['user']['screen_name'], "profile_image_url":tt['user']['profile_image_url'], "followers_count":tt['user']['followers_count'] }; users.append(user) tweetids[tid]['retweet_history'].append(user) tweetids[tid]['rank'] = max(tweetids[tid]['rank'], tt['user']['followers_count'] * t['retweet_count']) print '--> update retweet history' tweets = sorted(tweets, key=lambda x: x['rank'], reverse=False)[:topk] tweets = sorted(tweets, key=lambda x: self.dateparser.parse(x['created_at']), reverse=False) except Exception as e: print 'error ' + str(e) finally: searcher.close() t2 = 
int(round(time.time() * 1000)) print '----> fetch tweets for the specified user costs ' + str(t2 - t1) + ' ms' return (tweets, users) def fetch_retweeting_behavior(self, uid): tweets = self.fetch_tweets_by_uid(uid) print tweets glyph = {} glyph['threads'] = [] glyph['users'] = [] temp = [] thread_tweets = [] for tweet in tweets: tid = tweet['id'] if 'retweeted_status' in tweet and tweet['retweeted_status'] is not None: tid = tweet['retweeted_status']['id'] thread_retweets = self.fetch_thread_by_tid(unicode(tid)) if(len(thread_retweets) == 0) : continue temp.append(tweet) thread_tweets.append(thread_retweets) tweets = temp behaviors = {} for i in range(len(tweets)): tweet = tweets[i] tid = tweet['id'] if 'retweeted_status' in tweet and tweet['retweeted_status'] is not None: tid = tweet['retweeted_status']['id'] thread = {} thread['content'] = [] for tt in thread_tweets[i]: u = tt['user'] if u['id'] not in behaviors: behaviors[u['id']] = {}; behaviors[u['id']]['id'] = u['id'] behaviors[u['id']]['screen_name'] = u['screen_name'] behaviors[u['id']]['followers_count'] = u['followers_count'] behaviors[u['id']]['profile_image_url'] = u['profile_image_url'] behaviors[u['id']]['behavior'] = [0] * len(tweets); behaviors[u['id']]['time'] = [''] * len(tweets); behaviors[u['id']]['sentiments'] = [0] * len(tweets); behaviors[u['id']]['rank'] = u['followers_count'] glyph['users'].append(behaviors[u['id']]) behaviors[u['id']]['behavior'][i] = 1 behaviors[u['id']]['time'][i] = tt['created_at'].strftime('%Y%m%d%H%M%S') behaviors[u['id']]['sentiments'][i] = sentiment(tweet['text']) thread['content'].append(behaviors[u['id']]) thread['name'] = tweet['text'] thread['sentiment'] = sentiment(tweet['text']) thread['start'] = thread_tweets[i][0]['created_at'].strftime('%Y%m%d%H%M%S') thread['end'] = thread_tweets[i][len(thread_tweets[i]) - 1]['created_at'].strftime('%Y%m%d%H%M%S') glyph['threads'].append(thread) for userid in behaviors: behaviors[userid]['sentiment'] = 1.0 * sum(behaviors[userid]['sentiments']) / len(behaviors[userid]['sentiments']) del behaviors[userid]['sentiments'] glyph['start'] = glyph['threads'][0]['start'] glyph['end'] = glyph['threads'][len(glyph['threads']) - 1]['start'] json.dump(glyph, open('./' + str(uid) + '.retweet' + '.json', 'wb')) return glyph def fetch_topic_behavior(self, uid): tags = {} tweets = self.fetch_tweets_by_uid(uid) for tweet in tweets: res = self.parser.parse(tweet['text']) if(len(res.tags) == 0): continue for tag in res.tags: if tag not in tags: tags[tag] = tweet['created_at'] else : if tags[tag] > tweet['created_at']: tags[tag] = tweet['created_at'] glyph = {} glyph['start'] = '201210040000' glyph['end'] = '201210040600' glyph['threads'] = [] glyph['users'] = [] # construct thread behaviors = {} tid = 0 for tag in tags: thread = {} thread['name'] = tag thread['start'] = tags[tag].strftime('%Y%m%d%H%M%S') (tweets, users) = self.fetch_tweets_by_keyword('#' + tag, thread['start'], 300) thread['content'] = [] for t in tweets: t['time'] = [''] * len(tags.keys()) t['time'][tid] = t['created_at'] thread['content'].append(t) thread['end'] = thread['content'][len(thread['content']) - 1]['created_at'] print thread['end'] glyph['threads'].append(thread) for u in users: if u['id'] not in behaviors: behaviors[u['id']] = {}; behaviors[u['id']]['id'] = u['id'] behaviors[u['id']]['screen_name'] = u['screen_name'] behaviors[u['id']]['followers_count'] = u['followers_count'] behaviors[u['id']]['behavior'] = [0] * len(tags.keys()); glyph['users'].append(behaviors[u['id']]) 
behaviors[u['id']]['behavior'][tid] = 1 tid += 1 glyph['start'] = glyph['threads'][0]['start'] glyph['end'] = glyph['threads'][len(glyph['threads']) - 1]['start'] json.dump(glyph, open('./' + str(uid) + '.topic' + '.json', 'wb'))
query_b = QueryParser('content', ix.schema).parse('cdk4')
with ix.searcher() as srch:
    res_b = srch.search(query_b, limit=10)
    for i in res_b:
        print(i['name'])

ix = open_dir("./indexdir1/")
query_b = QueryParser('name', ix.schema).parse('NCT01692496')
with ix.searcher() as srch:
    res_b = srch.search(query_b, limit=10)
    for i in res_b:
        print(i['name'])

mparser = MultifieldParser(["Title", "content"], schema=schema)
mparser.add_plugin(wp.qparser.FuzzyTermPlugin)
ix = open_dir("./indexdir1/")
q = mparser.parse('"Colon" "cancer" "BRAF" "V600E"')
out = []
with ix.searcher() as srch:
    res_b = srch.search(q, limit=100)
    for i in res_b:
        s = str(i['name'])[-11:]
        out.append(s)
out

eg = open('./topic_19.txt', 'w')
pos = 1
score = 10
class Query(object): def __init__(self, index_name): self.index_name = index_name self.ix = open_dir(index_name) self.indexer = Indexer("./template/cache/") self.multiParser = MultifieldParser(["anchor", "content"], schema=self.indexer.schema) self.singleParser = QueryParser("content", schema=self.indexer.schema) self.site_re = re.compile(r'site:') self.filetype_re = re.compile(r'filetype:') self.type2fn = { "website": self.query_website, "image": self.query_image, "file": self.query_file, "document": self.query_document, "all": self.query_all, } self.searcher = self.ix.searcher(weighting=TFIDF_PR) self.singleParser.add_plugin(EveryPlugin) self.multiParser.add_plugin(EveryPlugin) def query_website(self, sentence, page, filetype=None, site=None): query = And( [self.multiParser.parse(sentence), Term("type", "website")]) if filetype: query = And([query, Term("extension", filetype)]) if site: query = And([query, Prefix("url", site)]) results = self.searcher.search_page(query, page, terms=True) return results def query_image(self, sentence, page, filetype=None, site=None): query = And([self.singleParser.parse(sentence), Term("type", "image")]) if filetype: query = And([query, Term("extension", filetype)]) if site: query = And([query, Prefix("url", site)]) results = self.searcher.search_page(query, page, terms=True) return results def query_file(self, sentence, page, filetype=None, site=None): query = And([self.singleParser.parse(sentence), Term("type", "file")]) if filetype: query = And([query, Term("extension", filetype)]) if site: query = And([query, Prefix("url", site)]) results = self.searcher.search_page(query, page, terms=True) return results def query_document(self, sentence, page, filetype=None, site=None): query = And( [self.singleParser.parse(sentence), Term("type", "document")]) if filetype: query = And([query, Term("extension", filetype)]) if site: query = And([query, Prefix("url", site)]) results = self.searcher.search_page(query, page, terms=True) return results def query_all(self, sentence, page, filetype=None, site=None): query = self.multiParser.parse(sentence) if filetype: query = And([query, Term("extension", filetype)]) if site: query = And([query, Prefix("url", site)]) results = self.searcher.search_page(query, page, terms=True) return results def query(self, sentence, target, page): site = None filetype = None self.log(sentence) if self.site_re.match(sentence): sentence = sentence[5:].strip() site = sentence.split(' ')[0] sentence = sentence[len(site):].strip() if len(urlparse(site)[0]) == 0: site = "%s%s" % ("http://", site) elif self.filetype_re.match(sentence): sentence = sentence[9:].strip() filetype = sentence.split(' ')[0] sentence = sentence[len(filetype):].strip() return self.type2fn[target](sentence, page, filetype, site) def log(self, sentence): with open('logging.txt', 'a') as f: f.write(datetime.datetime.now().strftime("%y%m%d%H%M%S") + " " + sentence)
q_d = MultifieldParser(["title", "content", "extension", "url"], i_d.schema, group=og) q_e = MultifieldParser(["title", "content", "extension", "url"], i_e.schema, group=og) q_f = MultifieldParser(["title", "content", "extension", "url"], i_f.schema, group=og) elif operator == 4: #print ("in oper 4") og = qparser.OrGroup.factory(0.9) q_a = MultifieldParser(["title", "content", "tags", "extension", "url"], i_a.schema, group=og) q_a.add_plugin(qparser.SequencePlugin("!(~(?P<slop>[1-9][0-9]*))?")) q_a.add_plugin(qparser.FuzzyTermPlugin()) q_b = MultifieldParser(["title", "content", "extension", "url"], i_b.schema, group=og) q_b.add_plugin(qparser.SequencePlugin("!(~(?P<slop>[1-9][0-9]*))?")) q_b.add_plugin(qparser.FuzzyTermPlugin()) q_c = MultifieldParser(["title", "content", "extension", "url"], i_c.schema, group=og) q_c.add_plugin(qparser.SequencePlugin("!(~(?P<slop>[1-9][0-9]*))?")) q_c.add_plugin(qparser.FuzzyTermPlugin()) q_d = MultifieldParser(["title", "content", "extension", "url", "url"], i_d.schema, group=og) q_d.add_plugin(qparser.SequencePlugin("!(~(?P<slop>[1-9][0-9]*))?"))