async def fetchArticlBody(*, projectName: str = Path(...), urlItem: str, word: str):
    # Query the articles collection of the project's database.
    # Resolve projectName to projectId.
    projectId = await findProjectIdFromProjectName(
        dbPrefix, 'Project',
        queryDict={'projectName': projectName}, showDict={'_id': 1})
    if not projectId:
        raise HTTPException(status_code=503, detail='projectNotExist')

    # Pagination bounds
    start = 0
    end = 0

    # The ES index to search (the equivalent of a MongoDB database).
    _index = f'kwm-{projectId}.articles'.lower()
    #print('_index', _index)

    s = Search()
    q1 = Q("match_phrase", url=f"\"{urlItem}\"")   # match on url
    q2 = Q('match_phrase', body=f"\"{word}\"")     # match on word
    s = s.query(q1)
    s = s.query(q2)
    s = s.source(includes=[''])  # return no _source fields
    s = s.highlight_options(order='score')
    s = s.highlight_options(
        pre_tags="<strong style=\"background: yellow;color: red\">")
    s = s.highlight_options(post_tags="</strong>")
    s = s.highlight_options(fragment_size=300)
    # s = s.highlight('body')
    s = s[0:10000]  # common setting
    #print(s.to_dict())

    # Execute
    response = await esRun(s.to_dict(), _index)  #s.execute(ignore_cache=True)
    #totalCount = response.hits.total.value
    temp = response.to_dict()['hits']['hits']
    result = []
    for item in temp:
        tt = {'_id': {'$oid': item['_id']}}
        tt.update(item['_source'])
        if item.get('highlight'):
            tt.update({'highlight': item['highlight']})
        if start >= 0 and end > 0:
            tt.update({'id': start + 1})
        result.append(tt)
        start = start + 1
    return result
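# --- Hedged aside (not part of the snippet above): successive calls to
# highlight_options() merge their keyword arguments into one "highlight"
# section of the request body, so splitting the options across several calls,
# as fetchArticlBody does, is equivalent to passing them all at once.
# A minimal sketch that needs no running cluster:
from elasticsearch_dsl import Search

s = Search()
s = s.highlight_options(order='score')
s = s.highlight_options(pre_tags=['<strong>'], post_tags=['</strong>'])
s = s.highlight_options(fragment_size=300)
s = s.highlight('body')  # request highlights for the "body" field

# The merged options sit next to the per-field settings, roughly:
# {'fields': {'body': {}}, 'order': 'score', 'pre_tags': ['<strong>'],
#  'post_tags': ['</strong>'], 'fragment_size': 300}
print(s.to_dict()['highlight'])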
def elasticsearch_pages(context, sort, page):
    result_limit = int(os.environ['RESULT_LIMIT'])
    max_result_limit = int(os.environ['MAX_RESULT_LIMIT'])
    start = (page - 1) * result_limit
    end = start + result_limit

    domain_query = Q("term", is_banned=False)
    if context["is_up"]:
        domain_query = domain_query & Q("term", is_up=True)
    if not context["show_fh_default"]:
        domain_query = domain_query & Q("term", is_crap=False)
    if not context["show_subdomains"]:
        domain_query = domain_query & Q("term", is_subdomain=False)
    if context["rep"] == "genuine":
        domain_query = domain_query & Q("term", is_genuine=True)
    if context["rep"] == "fake":
        domain_query = domain_query & Q("term", is_fake=True)

    limit = max_result_limit if context["more"] else result_limit

    has_parent_query = Q("has_parent", type="domain", query=domain_query)
    query = Search().filter(has_parent_query).query(
        Q("match", body_stripped=context['search']))
    query = query.highlight_options(
        order='score', encoder='html').highlight('body_stripped')[start:end]
    query = query.source(['title', 'domain_id', 'created_at',
                          'visited_at']).params(request_cache=True)
    return query.execute()
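# --- Hedged sketch of the filter-composition idiom above: combining Q
# objects with "&" folds them into a single bool query, so each
# "domain_query = domain_query & Q(...)" step grows one must-clause list.
# Field names are only illustrative.
from elasticsearch_dsl import Q

q = Q("term", is_banned=False) & Q("term", is_up=True)
# {'bool': {'must': [{'term': {'is_banned': False}},
#                    {'term': {'is_up': True}}]}}
print(q.to_dict())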
def GetAuditDataMain(self, data):
    s = Search()
    s = s[0:1000]
    s = s.highlight('*')
    s = s.highlight_options(require_field_match=False)
    t = Q('query_string', query=data) & \
        ~Q('query_string', default_field="AuditType.Generator",
           query="stateagentinspector") & \
        ~Q('query_string', default_field="AuditType.Generator",
           query="w32processes-tree")
    query = s.query(t)

    try:
        r = requests.post(
            self.es_host + ":" + self.es_port + self.index +
            self.type_audit_type + '/_search',
            data=json.dumps(query.to_dict()),
            auth=(self.elastic_user, self.elastic_pass),
            verify=False)
    except ConnectionError as e:
        ret = {"connection_error": e.args[0]}
        return ret

    data = []
    try:
        for x in r.json()['hits']['hits']:
            for y, v in x['highlight'].items():
                data.append({
                    "doc_id": x['_id'],
                    "endpoint": x['_parent'],
                    "audittype": x['_source']['AuditType']['Generator'],
                    "field": y,
                    "response": v
                })
    except KeyError:
        pass
    return data
def get(self, request, *args, **kwargs):
    query = self.request.query_params.get('query')
    country = self.request.query_params.get('country')
    points = self.request.query_params.get('points')
    search = Search(index=constants.ES_INDEX)
    q = {'should': [], 'filter': []}
    if query:
        q['should'] = [
            Match(variety={'query': query, 'boost': 3.0}),
            Match(winery={'query': query, 'boost': 2.0}),
            Match(description={'query': query, 'boost': 1.0})
        ]
        q['minimum_should_match'] = 1
        search = search.highlight_options(number_of_fragments=0,
                                          pre_tags=['<mark>'],
                                          post_tags=['</mark>'])
        search = search.highlight('variety', 'winery', 'description')
    if country:
        q['filter'].append(Term(country=country))
    if points:
        q['filter'].append(Term(points=points))
    response = search.query('bool', **q).params(size=100).execute()
    if response.hits.total.value > 0:
        return Response(data=[{
            'id': hit.meta.id,
            'country': hit.country,
            'description': (hit.meta.highlight.description[0]
                            if 'highlight' in hit.meta
                            and 'description' in hit.meta.highlight
                            else hit.description),
            'points': hit.points,
            'price': hit.price,
            'variety': (hit.meta.highlight.variety[0]
                        if 'highlight' in hit.meta
                        and 'variety' in hit.meta.highlight
                        else hit.variety),
            'winery': (hit.meta.highlight.winery[0]
                       if 'highlight' in hit.meta
                       and 'winery' in hit.meta.highlight
                       else hit.winery)
        } for hit in response])
    else:
        return Response(data=[])
def search_close(self, origin_timestamp, channel, qterm, number_results):
    """
    Find log entries close to origin timestamp, filter by channel,
    highlight qterm and return them sorted by date.

    :param origin_timestamp: origin timestamp to find logs around
    :param channel: Channel to be filtered
    :param qterm: Term to be highlighted
    :param number_results: how many results
    :return: List of sorted log entries (Elasticsearch response)
    :rtype: ``list``
    """
    # Prepare query
    s = DslSearch(using=self._es, index=self._index_prefix.format('*'))

    # Function score
    # Boost is only used for highlighting, not for scoring, so give it a
    # very low significance.
    main_query_boosting = 1e-15
    pos = MatchPhrase(msg={'query': qterm, 'boost': main_query_boosting}) | \
        Match(**{'username': {'query': qterm, 'boost': main_query_boosting}}) | \
        Match(channel={'query': qterm, 'boost': main_query_boosting}) | \
        Match(msg={'query': qterm, 'boost': main_query_boosting})
    main_query = (pos | Q('match_all'))
    function_score_query = Q(
        'function_score',
        query=main_query,
        functions=[
            SF('exp', **{
                '@timestamp': {
                    "origin": origin_timestamp,
                    "scale": "1m",
                    "decay": 0.999
                }
            })
        ])
    s = s.query(function_score_query)

    # Filter channel
    s = s.filter('term', **{'channel.keyword': channel})

    # Number of results
    s = s[0:number_results]

    # Highlight
    s = s.highlight_options(order='score')
    s = s.highlight('msg', number_of_fragments=0)
    s = s.highlight('username')
    s = s.highlight('channel')

    # Execute
    response = s.execute()

    # Sort results
    response_sorted = sorted(response, key=lambda hit: hit['@timestamp'])

    return response_sorted
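# --- A self-contained sketch of the function_score/decay pattern that
# search_close() builds: an "exp" decay on "@timestamp" ranks documents by
# proximity to an origin timestamp. The origin value here is made up.
from elasticsearch_dsl import Q
from elasticsearch_dsl.query import SF

decay = SF('exp', **{'@timestamp': {'origin': '2020-01-01T00:00:00',
                                    'scale': '1m',
                                    'decay': 0.999}})
q = Q('function_score', query=Q('match_all'), functions=[decay])
print(q.to_dict())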
def portalSearch(expression, start=0, end=25):
    client = Elasticsearch()
    ret = {'nodes': [], 'Counts': {}}
    q = Q("bool", must=[Q('match', _all=expression)])
    s = Search(using=client, index="neo4j-inquisite-node",
               doc_type="Repository,Data").query(q)
    q_total = s.count()
    s = s[0:q_total]
    s = s.highlight_options(require_field_match=False)
    s = s.highlight('*', fragment_size=45)
    res = s.execute()
    data = {}
    uuids = []
    pub_uuids = {}
    if res:
        for r in res:
            d = r.to_dict()
            if r.meta.doc_type == 'Repository':
                if int(d['published']) == 0:
                    continue
                repo_id = r.meta.id
                ret['nodes'].append({
                    "id": r.meta.id,
                    "type": "Repository",
                    "name": d['name'],
                    "description": d['readme']
                })
                repo_uuids = SearchManager._getDataUUIDsForRepo(repo_id)
                pub_uuids[repo_id] = repo_uuids
            else:
                hits = []
                highs = r.meta.highlight.to_dict()
                for high_field, high_value in highs.items():
                    hits.append({high_field: high_value})
                data[r.meta.id] = {'id': r.meta.id, "hits": hits}
                uuids.append(r.meta.id)
        qString = ("MATCH (r:Repository)--(t:SchemaType)--(d:Data) "
                   "WHERE d.uuid IN {uuids} AND r.published = '1' "
                   "RETURN d.uuid as uuid, r.name as repo_name, "
                   "r.uuid as repo_id")
        pub_data = db.run(qString, {"uuids": uuids})
        data_max = 0
        for checked in pub_data:
            if data_max >= 32:
                break
            ret['nodes'].append({
                "id": checked['uuid'],
                "type": "Data",
                "repo_id": checked['repo_id'],
                "repo_name": checked['repo_name'],
                "hits": data[checked['uuid']]['hits']
            })
            data_max += 1
        return ret
    else:
        return ret
def match_phrase_in_text(phrase):
    s = Search(using=client, index="sample_film_index")
    q = Q('match_phrase', text=phrase)
    s = s.query(q)
    s = s.highlight_options(pre_tags='<mark>', post_tags='</mark>')  # for html
    s = s.highlight('text', fragment_size=999999999, number_of_fragments=1)
    response = s.execute()
    print("Num hits for", phrase, len(response.to_dict()['hits']['hits']))
    for hit in response:
        print(hit.meta.score)      # doc score
        print(hit.meta.highlight)  # highlighted snippet
def search_content(keyword, limit=50):
    client = Elasticsearch()
    q = Q("multi_match", query=keyword, fields=['title', 'content'])
    s = Search(using=client)
    # s = Search(using=client, index="pet-index").query("match", content="金毛")
    s = Search(using=client, index="pet-index").query(q)
    s = s[0:limit]
    s = s.highlight_options(order='score')
    s = s.highlight('content')
    response = s.execute()
    return response
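# --- Hedged usage sketch for a response like the one search_content()
# returns: a hit only carries "highlight" metadata when the highlighted
# field actually matched, so fall back to the stored field otherwise.
# Assumes a reachable cluster and a "pet-index" with a "content" field.
for hit in search_content("golden retriever"):
    if 'highlight' in hit.meta and 'content' in hit.meta.highlight:
        snippet = hit.meta.highlight.content[0]  # first highlighted fragment
    else:
        snippet = hit.content                    # plain stored field
    print(hit.meta.score, snippet)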
def free_search_in_title(word):
    s = Search(using=client, index="sample_film_index")
    # Q is a shortcut for constructing a query object
    q = Q('match', title=word)
    # At some point, q has to be added to the search object.
    s = s.query(q)
    s = s.highlight_options(pre_tags='<mark>', post_tags='</mark>')  # for html
    s = s.highlight('title', fragment_size=999999999, number_of_fragments=1)
    response = s.execute()
    print("Num hits for", word, len(response.to_dict()['hits']['hits']))
    for hit in response:
        print(hit.meta.score)      # doc score
        print(hit.meta.highlight)  # highlighted snippet
def get_queryset(self):
    if not self.index_manager.connected_to_es:
        messages.warning(self.request, _(u'Impossible de se connecter à Elasticsearch'))
        return []

    if self.search_query:
        # find forums the user is allowed to visit
        self.authorized_forums = get_authorized_forums(self.request.user)

        search_queryset = Search()

        # setting the different querysets (according to the selected models, if any)
        part_querysets = []
        chosen_groups = self.search_form.cleaned_data['models']

        if chosen_groups:
            models = []
            for group in chosen_groups:
                if group in settings.ZDS_APP['search']['search_groups']:
                    models.append(settings.ZDS_APP['search']['search_groups'][group][1])
        else:
            models = [v[1] for k, v in settings.ZDS_APP['search']['search_groups'].items()]

        models = reduce(operator.concat, models)

        for model in models:
            part_querysets.append(getattr(self, 'get_queryset_{}s'.format(model))())

        queryset = part_querysets[0]
        for query in part_querysets[1:]:
            queryset |= query

        # weighting:
        weight_functions = []
        for _type, weights in settings.ZDS_APP['search']['boosts'].items():
            if _type in models:
                weight_functions.append({'filter': Match(_type=_type), 'weight': weights['global']})

        scored_queryset = FunctionScore(query=queryset, boost_mode='multiply', functions=weight_functions)
        search_queryset = search_queryset.query(scored_queryset)

        # highlighting:
        search_queryset = search_queryset.highlight_options(
            fragment_size=150, number_of_fragments=5, pre_tags=['[hl]'], post_tags=['[/hl]'])
        search_queryset = search_queryset.highlight('text').highlight('text_html')

        # executing:
        return self.index_manager.setup_search(search_queryset)

    return []
def highlight(search: Search) -> Search:
    """
    Apply hit highlighting to the search, before execution.

    Parameters
    ----------
    search : :class:`.Search`

    Returns
    -------
    :class:`.Search`
        The search object that was originally passed, updated to include
        requests for hit highlighting.

    """
    # Highlight class .search-hit defined in search.sass
    search = search.highlight_options(pre_tags=[HIGHLIGHT_TAG_OPEN],
                                      post_tags=[HIGHLIGHT_TAG_CLOSE])
    search = search.highlight('title', type='plain', number_of_fragments=0)
    search = search.highlight('title.english', type='plain',
                              number_of_fragments=0)
    search = search.highlight('title.tex', type='plain',
                              number_of_fragments=0)
    search = search.highlight('comments', number_of_fragments=0)
    # Highlight any field the name of which begins with "author".
    search = search.highlight('author*')
    search = search.highlight('owner*')
    search = search.highlight('submitter*')
    search = search.highlight('journal_ref', type='plain')
    search = search.highlight('acm_class', number_of_fragments=0)
    search = search.highlight('msc_class', number_of_fragments=0)
    search = search.highlight('doi', type='plain')
    search = search.highlight('report_num', type='plain')
    # Setting number_of_fragments to 0 tells ES to highlight the entire
    # abstract.
    search = search.highlight('abstract', type='plain', number_of_fragments=0)
    search = search.highlight('abstract.tex', type='plain',
                              number_of_fragments=0)
    search = search.highlight('abstract.english', type='plain',
                              number_of_fragments=0)
    search = search.highlight('primary_classification*', type='plain',
                              number_of_fragments=0)
    return search
def find(query, company_id, proposal_id):
    client = get_client()
    index = current_app.config["ES_IMPORT_INDEX"]
    s = Search(using=client, index=index)
    s = s.filter("term", company_id=company_id)
    # s = s.filter(~Q("term", proposal_id=proposal_id))
    # Weighting title more than the content since a user writing an exact
    # title should yield that section rather than the same query in a content
    s = s.query(Q("multi_match", query=query, fields=["title^4", "content"]))
    s = s.highlight_options(order="score",
                            pre_tags=["<span class='search-highlight'>"],
                            post_tags=["</span>"])
    s = s.highlight("title", "content")
    # Only get the first 20 results
    response = s[:20].execute()
    return response.hits
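# --- Standalone sketch of two idioms find() relies on: a single highlight()
# call may name several fields that share the same options, and slicing the
# Search object sets the from/size window in one step. The index and query
# here are illustrative.
from elasticsearch_dsl import Search, Q

s = Search(index="proposals")
s = s.query(Q("multi_match", query="pricing", fields=["title^4", "content"]))
s = s.highlight_options(order="score",
                        pre_tags=["<span class='search-highlight'>"],
                        post_tags=["</span>"])
s = s.highlight("title", "content")  # both fields, same options
s = s[:20]                           # size=20, i.e. the first 20 hits
print(s.to_dict())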
async def get(self):
    """Get the results from Elasticsearch."""
    q = self.request.query.get("q")
    if not q:
        return web.json_response([])
    es = Elasticsearch(
        hosts=[self.request.app["settings"].ELASTICSEARCH_URL],
        timeout=ELASTICSEARCH_TIMEOUT,
        verify_certs=ELASTICSEARCH_VERIFY_CERTS,
    )
    mapping = es.indices.get_mapping(ELASTICSEARCH_INDEX,
                                     include_type_name=True)
    search = Search(index=ELASTICSEARCH_INDEX, using=es)
    search = search.highlight_options(
        pre_tags=[PRE_HIGHLIGHT_TAG],
        post_tags=[POST_HIGHLIGHT_TAG],
    )
    query = self.queries(mapping, q)
    search = search.query(query)
    highlights = self.build_highlight(
        mapping[ELASTICSEARCH_INDEX]["mappings"]["_doc"]["properties"])
    for highlight in highlights:
        search = search.highlight(highlight, type="plain")
    search = search.extra(
        from_=0,
        size=MAX_RESULTS,
    )
    values = []
    for hit in search.execute():
        hit._d_.pop(META, None)
        if HIGHLIGHT and hasattr(hit.meta, "highlight"):
            highlight = hit.meta.highlight
            query = DictQuery(hit._d_)
            for key in highlight:
                path = key.split(".")[:-1]
                value = highlight[key][0]
                query.set("/".join(path), value)
            values.append(query)
        else:
            values.append(hit._d_)
    return web.json_response(values)
def highlight(search: Search) -> Search:
    """
    Apply hit highlighting to the search, before execution.

    Parameters
    ----------
    search : :class:`.Search`

    Returns
    -------
    :class:`.Search`
        The search object that was originally passed, updated to include
        requests for hit highlighting.

    """
    # Highlight class .search-hit defined in search.sass
    search = search.highlight_options(pre_tags=[HIGHLIGHT_TAG_OPEN],
                                      post_tags=[HIGHLIGHT_TAG_CLOSE])
    search = search.highlight("title", type="plain", number_of_fragments=0)
    search = search.highlight("title.english", type="plain",
                              number_of_fragments=0)
    search = search.highlight("title.tex", type="plain",
                              number_of_fragments=0)
    search = search.highlight("comments", number_of_fragments=0)
    # Highlight any field the name of which begins with "author".
    search = search.highlight("author*")
    search = search.highlight("owner*")
    search = search.highlight("announced_date_first")
    search = search.highlight("submitter*")
    search = search.highlight("journal_ref", type="plain")
    search = search.highlight("acm_class", number_of_fragments=0)
    search = search.highlight("msc_class", number_of_fragments=0)
    search = search.highlight("doi", type="plain")
    search = search.highlight("report_num", type="plain")
    # Setting number_of_fragments to 0 tells ES to highlight the entire field.
    search = search.highlight("abstract", number_of_fragments=0)
    search = search.highlight("abstract.tex", type="plain",
                              number_of_fragments=0)
    search = search.highlight("abstract.english", number_of_fragments=0)
    return search
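# --- Hedged sketch of how the per-field keyword arguments above end up on
# the wire: every highlight() call adds an entry under highlight.fields that
# keeps its own settings, and wildcard names select whole field families.
from elasticsearch_dsl import Search

s = Search()
s = s.highlight_options(pre_tags=['<span class="search-hit">'],
                        post_tags=['</span>'])
s = s.highlight('title', type='plain', number_of_fragments=0)
s = s.highlight('author*')  # any field whose name begins with "author"
# {'highlight': {'fields': {'title': {'type': 'plain',
#                                     'number_of_fragments': 0},
#                           'author*': {}},
#                'pre_tags': ['<span class="search-hit">'],
#                'post_tags': ['</span>']}}
print(s.to_dict())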
def search_by_keywords(self, keywords, subject):
    search = Search(using=self.es, index='arxiv-index')
    query_content = Q()
    keywords = re.sub('[^A-Za-z0-9 ]+', '', keywords).lower()
    for keyword in keywords.split(' '):
        query_content = query_content + \
            (Q('wildcard', pdf='*' + keyword + '*') |
             Q('wildcard', abstract='*' + keyword + '*') |
             Q('wildcard', authors='*' + keyword + '*'))
    query_subject = Q()
    query_other = Q()
    if subject and subject != 'all':
        query_subject = Q('wildcard', subject='*' + subject + '.*')
        query_other = Q('wildcard', other_subjects='*' + subject + '.*')
    final_query = Q('bool', must=[query_content],
                    should=[query_subject, query_other],
                    minimum_should_match=1)
    search = search.query(final_query)
    search = search.source([
        'title', 'authors', 'subject', 'other_subjects', 'abstract',
        'abstract_url', 'pdf_url', 'submit_date'
    ])
    search = search.highlight_options(order='score')
    search = search.highlight('abstract', fragment_size=400)
    total = search.count()
    search = search[0:total]
    search = self._extend_query(search, keywords)
    request = search.execute()
    for hit in request:
        response = hit.to_dict()
        if 'highlight' in hit.meta:
            response.update({'fragment': hit.meta.highlight.abstract})
        else:
            response.update({'fragment': []})
        yield response
def esearch(username="", gender="", address="", email="", photo=""):
    client = Elasticsearch()
    q = Q("bool",
          should=[
              Q("match", username=username),
              Q("match", photo=photo),
              Q("match", address=address),
              Q("match", email=email),
              Q("match", gender=gender)
          ],
          minimum_should_match=1)
    s = Search(using=client, index="users").query(q)
    s = s.highlight_options(order='score',
                            require_field_match='false',
                            fields={
                                "*": {
                                    "pre_tags": ["<font color='red'>"],
                                    "post_tags": ["</font>"]
                                }
                            })
    # s = s.highlight('username', fragment_size=50)
    response = s.execute()
    search = get_results(response)
    return search
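# --- Hedged note on the variant above: esearch() passes the raw "fields"
# mapping straight through highlight_options(), mirroring the REST body.
# A more conventional elasticsearch-dsl spelling of the same intent requests
# the wildcard field via highlight() and keeps the tag options global;
# a sketch, with the index name carried over from the snippet:
from elasticsearch_dsl import Search

s = Search(index="users")
s = s.highlight_options(order='score', require_field_match='false',
                        pre_tags=["<font color='red'>"],
                        post_tags=["</font>"])
s = s.highlight('*')  # highlight whichever field matched
print(s.to_dict())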
def get_queryset(self):
    if not self.index_manager.connected_to_es:
        messages.warning(self.request, _('Impossible de se connecter à Elasticsearch'))
        return []

    if self.search_query:
        # Searches forums the user is allowed to visit
        self.authorized_forums = get_authorized_forums(self.request.user)

        search_queryset = Search()

        # Restrict (sub)category if any
        if self.search_form.cleaned_data['category']:
            self.content_category = self.search_form.cleaned_data['category']
        if self.search_form.cleaned_data['subcategory']:
            self.content_subcategory = self.search_form.cleaned_data['subcategory']

        # Mark that contents must come from library if required
        self.from_library = False
        if self.search_form.cleaned_data['from_library'] == 'on':
            self.from_library = True

        # Setting the different querysets (according to the selected models, if any)
        part_querysets = []
        chosen_groups = self.search_form.cleaned_data['models']

        if chosen_groups:
            models = []
            for group in chosen_groups:
                if group in settings.ZDS_APP['search']['search_groups']:
                    models.append(settings.ZDS_APP['search']['search_groups'][group][1])
        else:
            models = [v[1] for k, v in settings.ZDS_APP['search']['search_groups'].items()]

        models = reduce(operator.concat, models)

        for model in models:
            part_querysets.append(getattr(self, 'get_queryset_{}s'.format(model))())

        queryset = part_querysets[0]
        for query in part_querysets[1:]:
            queryset |= query

        # Weighting:
        weight_functions = []
        for _type, weights in list(settings.ZDS_APP['search']['boosts'].items()):
            if _type in models:
                weight_functions.append({'filter': Match(_type=_type), 'weight': weights['global']})

        scored_queryset = FunctionScore(query=queryset, boost_mode='multiply', functions=weight_functions)
        search_queryset = search_queryset.query(scored_queryset)

        # Highlighting:
        search_queryset = search_queryset.highlight_options(
            fragment_size=150, number_of_fragments=5, pre_tags=['[hl]'], post_tags=['[/hl]'])
        search_queryset = search_queryset.highlight('text').highlight('text_html')

        # Executing:
        return self.index_manager.setup_search(search_queryset)

    return []
def get_queryset(self):
    if not self.index_manager.connected_to_es:
        messages.warning(self.request, _("Impossible de se connecter à Elasticsearch"))
        return []

    if self.search_query:
        # Searches forums the user is allowed to visit
        self.authorized_forums = get_authorized_forums(self.request.user)

        search_queryset = Search()

        # Restrict (sub)category if any
        if self.search_form.cleaned_data["category"]:
            self.content_category = self.search_form.cleaned_data["category"]
        if self.search_form.cleaned_data["subcategory"]:
            self.content_subcategory = self.search_form.cleaned_data["subcategory"]

        # Mark that contents must come from library if required
        self.from_library = False
        if self.search_form.cleaned_data["from_library"] == "on":
            self.from_library = True

        # Setting the different querysets (according to the selected models, if any)
        part_querysets = []
        chosen_groups = self.search_form.cleaned_data["models"]

        if chosen_groups:
            models = []
            for group in chosen_groups:
                if group in settings.ZDS_APP["search"]["search_groups"]:
                    models.append(settings.ZDS_APP["search"]["search_groups"][group][1])
        else:
            models = [v[1] for k, v in settings.ZDS_APP["search"]["search_groups"].items()]

        models = reduce(operator.concat, models)

        for model in models:
            part_querysets.append(getattr(self, f"get_queryset_{model}s")())

        queryset = part_querysets[0]
        for query in part_querysets[1:]:
            queryset |= query

        # Weighting:
        weight_functions = []
        for _type, weights in list(settings.ZDS_APP["search"]["boosts"].items()):
            if _type in models:
                weight_functions.append({
                    "filter": Match(_type=_type),
                    "weight": weights["global"]
                })

        scored_queryset = FunctionScore(query=queryset,
                                        boost_mode="multiply",
                                        functions=weight_functions)
        search_queryset = search_queryset.query(scored_queryset)

        # Highlighting:
        search_queryset = search_queryset.highlight_options(
            fragment_size=150, number_of_fragments=5,
            pre_tags=["[hl]"], post_tags=["[/hl]"])
        search_queryset = search_queryset.highlight("text").highlight("text_html")

        # Executing:
        return self.index_manager.setup_search(search_queryset)

    return []
def search_keyword(self, keyword, doc_filter=None, size=10):
    '''
    Create the search object and get the number of hits.
    '''
    s = Search(index='lucid').using(self.client)
    print(doc_filter)
    if 'divtype' in doc_filter:
        for i, types in enumerate(doc_filter['divtype']):
            if i == 0:
                filt = Q("match", divtype=types)
            else:
                filt = filt | Q("match", divtype=types)
        s = s.filter(filt)
    n_hits = s.count()
    if 'docsource' in doc_filter:
        for i, types in enumerate(doc_filter['docsource']):
            if i == 0:
                filt = Q("match", docsource=types)
            else:
                filt = filt | Q("match", docsource=types)
        s = s.filter(filt)
    flag = 0
    if 'end' in doc_filter:
        flag = 1
        end_year = datetime.datetime(int(doc_filter['end']), 12, 31)
    else:
        end_year = datetime.datetime.now()
    if 'start' in doc_filter:
        flag = 0
        start_year = datetime.datetime(int(doc_filter['start']), 1, 1)
        s = s.filter('range', publishdate={'gte': start_year,
                                           'lte': end_year})
    if flag:
        s = s.filter('range', publishdate={'lte': end_year})

    # the search object. -p indicates sort by order=desc on p
    # -------------------------------- query --------------------------------
    q1 = Q("multi_match", query=keyword,
           fields=["title", "keywords", "doc"],
           type="best_fields", cutoff_frequency=0.0007,
           operator="and", fuzziness="AUTO")
    q2 = Q("multi_match", query=keyword,
           fields=["title", "keywords", "doc"],
           type="phrase")
    q3 = Q("bool", must=[q1], should=[q2])
    s = s.query(q3)
    s = s.suggest("didYouMean", keyword, phrase={'field': 'did_you_mean'})
    s = s.highlight_options(order="score", pre_tags=["<mark>"],
                            post_tags=["</mark>"], fragment_size=80,
                            no_match_size=0)
    s = s.highlight('title', number_of_fragments=0)
    s = s.highlight('keywords', number_of_fragments=10)
    s = s.highlight('doc', number_of_fragments=10)
    # ------------------------------------------------------------------------
    n_hits = s.count()
    print("hits = ", n_hits)
    hits_start = 0
    return s, n_hits
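# --- Standalone sketch of the did-you-mean pattern used above: suggest()
# attaches a named suggester to the request, and the candidates come back on
# response.suggest under the same name. The index and the "did_you_mean"
# field are assumptions carried over from the snippet.
from elasticsearch_dsl import Search

s = Search(index='lucid')
s = s.suggest('didYouMean', 'negligance', phrase={'field': 'did_you_mean'})
print(s.to_dict()['suggest'])
# After response = s.execute(), candidates would be read like:
# for option in response.suggest.didYouMean[0].options:
#     print(option.text, option.score)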
def search(search_params, index, page_size, ip, request,
           filter_dead, page=1) -> Tuple[List[Hit], int, int]:
    """
    Given a set of keywords and an optional set of filters, perform a ranked
    paginated search.

    :param search_params: Search parameters. See
     :class: `ImageSearchQueryStringSerializer`.
    :param index: The Elasticsearch index to search (e.g. 'image')
    :param page_size: The number of results to return per page.
    :param ip: The user's hashed IP. Hashed IPs are used to anonymously but
    uniquely identify users exclusively for ensuring query consistency across
    Elasticsearch shards.
    :param request: Django's request object.
    :param filter_dead: Whether dead links should be removed.
    :param page: The results page number.
    :return: Tuple with a List of Hits from elasticsearch, the total count of
    pages and results.
    """
    s = Search(index=index)
    # Apply term filters. Each tuple pairs a filter's parameter name in the
    # API with its corresponding field in Elasticsearch. "None" means that
    # the names are identical.
    filters = [
        ('extension', None),
        ('categories', None),
        ('aspect_ratio', None),
        ('size', None),
        ('source', 'provider'),
        ('license', 'license__keyword'),
        ('license_type', 'license__keyword')
    ]
    for tup in filters:
        api_field, elasticsearch_field = tup
        s = _apply_filter(s, search_params, api_field, elasticsearch_field)

    # Get suggestions for any route
    s = s.suggest('get_suggestion', '', term={'field': 'creator'})

    # Exclude mature content unless explicitly enabled by the requester
    if not search_params.data['mature']:
        s = s.exclude('term', mature=True)
    # Hide data sources from the catalog dynamically.
    filter_cache_key = 'filtered_providers'
    filtered_providers = cache.get(key=filter_cache_key)
    if not filtered_providers:
        filtered_providers = models.ContentProvider.objects\
            .filter(filter_content=True)\
            .values('provider_identifier')
        cache.set(key=filter_cache_key, timeout=CACHE_TIMEOUT,
                  value=filtered_providers)
    to_exclude = [f['provider_identifier'] for f in filtered_providers]
    s = s.exclude('terms', provider=to_exclude)

    # Search either by generic multimatch or by "advanced search" with
    # individual field-level queries specified.
    search_fields = ['tags.name', 'title', 'description']
    if 'q' in search_params.data:
        query = _quote_escape(search_params.data['q'])
        s = s.query('simple_query_string', query=query, fields=search_fields)
        # Get suggestions for term query
        s = s.suggest('get_suggestion', query, term={'field': 'creator'})
    else:
        if 'creator' in search_params.data:
            creator = _quote_escape(search_params.data['creator'])
            s = s.query('simple_query_string', query=creator,
                        fields=['creator'])
            # Get suggestions for creator
            s = s.suggest('get_suggestion', creator,
                          term={'field': 'creator'})
        if 'title' in search_params.data:
            title = _quote_escape(search_params.data['title'])
            s = s.query('simple_query_string', query=title, fields=['title'])
            # Get suggestions for title
            s = s.suggest('get_suggestion', title, term={'field': 'title'})
        if 'tags' in search_params.data:
            tags = _quote_escape(search_params.data['tags'])
            s = s.query('simple_query_string', fields=['tags.name'],
                        query=tags)
            # Get suggestions for tags
            s = s.suggest('get_suggestion', tags,
                          term={'field': 'tags.name'})

    # Boost by popularity metrics
    if POPULARITY_BOOST:
        queries = []
        factors = ['comments', 'views', 'likes']
        boost_factor = 100 / len(factors)
        for factor in factors:
            rank_feature_query = Q('rank_feature', field=factor,
                                   boost=boost_factor)
            queries.append(rank_feature_query)
        s = Search().query(
            Q('bool', must=s.query, should=queries, minimum_should_match=1))

    # Use highlighting to determine which fields contribute to the selection
    # of top results.
    s = s.highlight(*search_fields)
    s = s.highlight_options(order='score')
    s = s.extra(track_scores=True)  # Search is immutable; keep the result

    # Route users to the same Elasticsearch worker node to reduce
    # pagination inconsistencies and increase cache hits.
    s = s.params(preference=str(ip), request_timeout=7)
    # Paginate
    start, end = _get_query_slice(s, page_size, page, filter_dead)
    s = s[start:end]
    try:
        search_response = s.execute()
        log.info(f'query={s.to_dict()}, es_took_ms={search_response.took}')
    except RequestError as e:
        raise ValueError(e)
    results = _post_process_results(s, start, end, page_size,
                                    search_response, request, filter_dead)

    suggestion = _query_suggestions(search_response)
    result_count, page_count = _get_result_and_page_count(
        search_response, results, page_size)
    return results, page_count, result_count, suggestion
def get(self, q, from_hit=0, hits_returned=20, type='all', **kwargs):
    if type == 'all':
        types = ['statutes', 'regulations', 'advisory_opinions', 'murs']
    else:
        types = [type]

    parsed_query = parse_query_string(q)
    terms = parsed_query.get('terms')
    phrases = parsed_query.get('phrases')

    hits_returned = min([200, hits_returned])
    results = {}
    total_count = 0

    for type in types:
        must_query = [Q('term', _type=type)]
        text_highlight_query = Q()

        if len(terms):
            term_query = Q('match', _all=' '.join(terms))
            must_query.append(term_query)
            text_highlight_query = text_highlight_query & term_query

        if len(phrases):
            phrase_queries = [Q('match_phrase', _all=phrase)
                              for phrase in phrases]
            must_query.extend(phrase_queries)
            text_highlight_query = text_highlight_query & \
                Q('bool', must=phrase_queries)

        query = Search().using(es) \
            .query(Q('bool', must=must_query,
                     should=[Q('match', no=q),
                             Q('match_phrase', _all={"query": q,
                                                     "slop": 50})])) \
            .highlight('description', 'name', 'no', 'summary', 'text') \
            .source(exclude='text') \
            .extra(size=hits_returned, from_=from_hit) \
            .index('docs')

        if type == 'advisory_opinions':
            query = query.query("match", category="Final Opinion")

        if text_highlight_query:
            query = query.highlight_options(
                highlight_query=text_highlight_query.to_dict())

        es_results = query.execute()

        formatted_hits = []
        for hit in es_results:
            formatted_hit = hit.to_dict()
            formatted_hit['highlights'] = []
            formatted_hits.append(formatted_hit)

            if 'highlight' in hit.meta:
                for key in hit.meta.highlight:
                    formatted_hit['highlights'].extend(hit.meta.highlight[key])

        count = es_results.hits.total
        total_count += count

        results[type] = formatted_hits
        results['total_%s' % type] = count

    results['total_all'] = total_count
    return results
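# --- Minimal sketch of the highlight_query trick above: by default ES
# highlights what the main query matched, but passing a separate
# highlight_query through highlight_options() restricts highlighting to
# chosen clauses (here the text clause, not the boosting ones). Field names
# mirror the snippet and are otherwise illustrative.
from elasticsearch_dsl import Search, Q

text_q = Q('match', _all='solar energy')
s = Search(index='docs').query(
    Q('bool', must=[text_q], should=[Q('match', no='2012-01')]))
s = s.highlight('name', 'text')
s = s.highlight_options(highlight_query=text_q.to_dict())
print(s.to_dict()['highlight'])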
def results(page):
    global tmp_text
    global tmp_title
    global tmp_star
    global tmp_min
    global tmp_max
    global tmp_director
    global tmp_lan
    global tmp_country
    global tmp_loc
    global tmp_minyear
    global tmp_maxyear
    global tmp_cats
    global gresults

    # convert the <page> parameter in url to integer.
    if type(page) is not int:
        page = int(page.encode('utf-8'))

    # if the method of request is post (for initial query), store query in
    # local global variables; if the method of request is get (for "next"
    # results), extract query contents from client's global variables
    if request.method == 'POST':
        # if has query, strip() all whitespace
        text_query = request.form['query'].strip()
        star_query = request.form['starring'].strip()
        mintime_query = request.form['mintime'].strip()
        if len(mintime_query) != 0:
            mintime_query = int(mintime_query)
        maxtime_query = request.form['maxtime'].strip()
        if len(maxtime_query) != 0:
            maxtime_query = int(maxtime_query)
        director_query = request.form['director'].strip()
        lan_query = request.form['language'].strip()
        country_query = request.form['country'].strip()
        loc_query = request.form['location'].strip()
        minyear_query = request.form['minplottime'].strip()
        if len(minyear_query) != 0:
            minyear_query = int(minyear_query)
        maxyear_query = request.form['maxplottime'].strip()
        if len(maxyear_query) != 0:
            maxyear_query = int(maxyear_query)
        cats_query = request.form['categories'].strip()

        # update global variable template data
        tmp_text = text_query
        tmp_star = star_query
        tmp_min = mintime_query
        tmp_max = maxtime_query
        tmp_director = director_query
        tmp_lan = lan_query
        tmp_country = country_query
        tmp_loc = loc_query
        tmp_minyear = minyear_query
        tmp_maxyear = maxyear_query
        tmp_cats = cats_query
    else:
        # use the current values stored in global variables.
        text_query = tmp_text
        star_query = tmp_star
        mintime_query = tmp_min
        maxtime_query = tmp_max
        director_query = tmp_director
        lan_query = tmp_lan
        country_query = tmp_country
        loc_query = tmp_loc
        minyear_query = tmp_minyear
        maxyear_query = tmp_maxyear
        cats_query = tmp_cats

    # store query values to display in search boxes in UI
    shows = {}
    shows['text'] = text_query
    shows['star'] = star_query
    shows['maxtime'] = maxtime_query
    shows['mintime'] = mintime_query
    shows['director'] = director_query
    shows['lan'] = lan_query
    shows['country'] = country_query
    shows['loc'] = loc_query
    shows['minyear'] = minyear_query
    shows['maxyear'] = maxyear_query
    shows['cats'] = cats_query

    # keep a copy of original text query, in case we cull out explicit
    # phrases later
    full_text_query = text_query

    # Create a search object to query our index
    s = Search(index=index_name)

    # Build up the elasticsearch query piecemeal based on the user's
    # parameters. The search API is "chainable": each call to search.query
    # adds criteria to our growing elasticsearch query.

    # set flag to default to indicate all terms have been matched
    all_matched = True

    # compile a Regex pattern to extract explicit phrases enclosed by ""
    pattern = re.compile(r'(?:\B\")(.*?)(?:\b\")')
    phrases = pattern.findall(text_query)
    # get the rest free terms
    text_query = pattern.sub('', text_query).strip()

    # First do a conjunctive search over multiple fields (title and text)
    # using the text_query and phrases passed in
    if len(text_query) + len(phrases) > 0:
        # save deep copies for disjunctive search later
        tmp_s = s.__copy__()
        tmp_phrases = phrases.copy()
        # conjunctive search for text_query AND phrases, with boosted
        # field weight
        if len(text_query) > 0:
            s = s.query('multi_match', query=text_query, type='cross_fields',
                        fields=['title^2', 'text'], operator='and')
        while len(phrases) > 0:
            s = s.query('multi_match', query=phrases.pop(),
                        type='phrase_prefix', fields=['title^2', 'text'])
        # if conjunctive search has no result, do a disjunctive search
        # (text_query OR phrases)
        if s.count() == 0:
            # indicate not all terms are matched
            all_matched = False
            if len(text_query) > 0:
                q = Q('multi_match', query=text_query, type='cross_fields',
                      fields=['title^2', 'text'], operator='or')
            else:
                q = Q('multi_match', query=tmp_phrases.pop(),
                      type='phrase_prefix', fields=['title^2', 'text'])
            while len(tmp_phrases) > 0:
                q |= Q('multi_match', query=tmp_phrases.pop(),
                       type='phrase_prefix', fields=['title^2', 'text'])
            s = tmp_s.query(q)

    # search for multiple fields using chained query (AND)
    # (the time/year bounds are ints once set above, so compare against the
    # empty string rather than calling len() on them)
    if mintime_query != '':
        s = s.query('range', runtime={'gte': mintime_query})
    if maxtime_query != '':
        s = s.query('range', runtime={'lte': maxtime_query})
    if minyear_query != '':
        s = s.query('range', runtime={'gte': minyear_query})
    if maxyear_query != '':
        s = s.query('range', runtime={'lte': maxyear_query})
    if len(star_query) > 0:
        s = s.query('match', starring=star_query)
    if len(director_query) > 0:
        s = s.query('match', director=director_query)
    if len(lan_query) > 0:
        s = s.query('match', language=lan_query)
    if len(country_query) > 0:
        s = s.query('match', country=country_query)
    if len(loc_query) > 0:
        s = s.query('match', location=loc_query)
    if len(cats_query) > 0:
        s = s.query('match', categories=cats_query)

    # highlight
    s = s.highlight_options(pre_tags='<mark>', post_tags='</mark>')
    s = s.highlight('text', fragment_size=999999999, number_of_fragments=1)
    s = s.highlight('title', fragment_size=999999999, number_of_fragments=1)
    s = s.highlight('starring', fragment_size=999999999, number_of_fragments=1)
    s = s.highlight('director', fragment_size=999999999, number_of_fragments=1)
    s = s.highlight('language', fragment_size=999999999, number_of_fragments=1)
    s = s.highlight('country', fragment_size=999999999, number_of_fragments=1)
    s = s.highlight('location', fragment_size=999999999, number_of_fragments=1)
    s = s.highlight('categories', fragment_size=999999999, number_of_fragments=1)

    # determine the subset of results to display (based on current <page> value)
    start = 0 + (page - 1) * 10
    end = 10 + (page - 1) * 10

    # execute search and return results in specified range.
    response = s[start:end].execute()

    # insert data into response
    resultList = {}
    for hit in response.hits:
        result = {}
        result['score'] = hit.meta.score

        if 'highlight' in hit.meta:
            if 'title' in hit.meta.highlight:
                result['title'] = hit.meta.highlight.title[0]
            else:
                result['title'] = hit.title

            if 'text' in hit.meta.highlight:
                result['text'] = hit.meta.highlight.text[0]
            else:
                result['text'] = hit.text

            if 'starring' in hit.meta.highlight:
                result['starring'] = hit.meta.highlight.starring[0]
            else:
                result['starring'] = hit.starring

            if 'director' in hit.meta.highlight:
                result['director'] = hit.meta.highlight.director[0]
            else:
                result['director'] = hit.director

            if 'language' in hit.meta.highlight:
                result['language'] = hit.meta.highlight.language[0]
            else:
                result['language'] = hit.language

            if 'country' in hit.meta.highlight:
                result['country'] = hit.meta.highlight.country[0]
            else:
                result['country'] = hit.country

            if 'location' in hit.meta.highlight:
                result['location'] = hit.meta.highlight.location[0]
            else:
                result['location'] = hit.location

            if 'categories' in hit.meta.highlight:
                result['categories'] = hit.meta.highlight.categories[0]
            else:
                result['categories'] = hit.categories
        else:
            result['title'] = hit.title
            result['text'] = hit.text
            result['starring'] = hit.starring
            result['director'] = hit.director
            result['language'] = hit.language
            result['country'] = hit.country
            result['location'] = hit.location
            result['categories'] = hit.categories

        resultList[hit.meta.id] = result

    # make the result list available globally
    gresults = resultList

    # get the total number of matching results
    result_num = response.hits.total

    # if we find the results, extract title and text information from
    # doc_data, else do nothing
    if result_num > 0:
        return render_template('page_SERP.html', results=resultList,
                               res_num=result_num, page_num=page,
                               queries=shows, all_matched=all_matched)
    else:
        message = []
        if len(full_text_query) > 0:
            message.append('Unknown search term: ' + full_text_query)
        if len(star_query) > 0:
            message.append('Cannot find star: ' + star_query)
        if len(director_query) > 0:
            message.append('Cannot find director: ' + director_query)
        if len(lan_query) > 0:
            message.append('Cannot find language: ' + lan_query)
        if len(country_query) > 0:
            message.append('Cannot find country: ' + country_query)
        if len(loc_query) > 0:
            message.append('Cannot find location: ' + loc_query)
        if len(cats_query) > 0:
            message.append('Cannot find categories: ' + cats_query)
        return render_template('page_SERP.html', results=message,
                               res_num=result_num, page_num=page,
                               queries=shows)
def search_tblescalation_symptoms(value):
    es = Elasticsearch()
    query = Search(using=es, index="tblescalation-index").query(
        "match", symptoms=value)
    s = query.highlight_options(order='score')
    response = s.execute()
    return response
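# --- Hedged follow-up to the snippet above: highlight_options() only
# configures the highlight section; it is the highlight() call that requests
# fragments for a field, so a fuller version might read as follows
# (index and field names carried over from the snippet):
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

es = Elasticsearch()
s = Search(using=es, index="tblescalation-index").query(
    "match", symptoms="timeout")
s = s.highlight_options(order='score')
s = s.highlight('symptoms')  # without this, no fragments come back
response = s.execute()
for hit in response:
    if 'highlight' in hit.meta:
        print(hit.meta.highlight.symptoms)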
def search_elastic(term='', user=None, sort='id', order='desc',
                   category='0_0', quality_filter='0', page=1,
                   rss=False, admin=False, logged_in_user=None,
                   per_page=75, max_search_results=1000):
    # This function can easily be memcached now

    if page > 4294967295:
        flask.abort(404)

    es_client = Elasticsearch()

    es_sort_keys = {
        'id': 'id',
        'size': 'filesize',
        # 'name': 'display_name',  # This is slow and buggy
        'comments': 'comment_count',
        'seeders': 'seed_count',
        'leechers': 'leech_count',
        'downloads': 'download_count'
    }

    sort_ = sort.lower()
    if sort_ not in es_sort_keys:
        flask.abort(400)

    es_sort = es_sort_keys[sort]

    order_keys = {'desc': 'desc', 'asc': 'asc'}

    order_ = order.lower()
    if order_ not in order_keys:
        flask.abort(400)

    # Only allow ID, desc if RSS
    if rss:
        sort = es_sort_keys['id']
        order = 'desc'

    # funky, es sort is default asc, prefixed by '-' if desc
    if 'desc' == order:
        es_sort = '-' + es_sort

    # Quality filter
    quality_keys = [
        '0',  # Show all
        '1',  # No remakes
        '2',  # Only trusted
        '3'   # Only completed
    ]

    if quality_filter.lower() not in quality_keys:
        flask.abort(400)

    quality_filter = int(quality_filter)

    # Category filter
    main_category = None
    sub_category = None
    main_cat_id = 0
    sub_cat_id = 0

    if category:
        cat_match = re.match(r'^(\d+)_(\d+)$', category)
        if not cat_match:
            flask.abort(400)

        main_cat_id = int(cat_match.group(1))
        sub_cat_id = int(cat_match.group(2))

        if main_cat_id > 0:
            if sub_cat_id > 0:
                sub_category = models.SubCategory.by_category_ids(
                    main_cat_id, sub_cat_id)
                if not sub_category:
                    flask.abort(400)
            else:
                main_category = models.MainCategory.by_id(main_cat_id)
                if not main_category:
                    flask.abort(400)

    # This might be useless since we validate users
    # before coming into this method, but just to be safe...
    if user:
        user = models.User.by_id(user)
        if not user:
            flask.abort(404)
        user = user.id

    same_user = False
    if logged_in_user:
        same_user = user == logged_in_user.id

    s = Search(using=es_client,
               index=app.config.get('ES_INDEX_NAME'))  # todo, sukebei prefix

    # Apply search term
    if term:
        # Do some preprocessing on the search terms for literal "" matching
        s = _parse_es_search_terms(s, term)

    # User view (/user/username)
    if user:
        s = s.filter('term', uploader_id=user)

        if not admin:
            # Hide all DELETED torrents if regular user
            s = s.filter('term', deleted=False)
            # If logged in user is not the same as the user being viewed,
            # show only torrents that aren't hidden or anonymous.
            #
            # If logged in user is the same as the user being viewed,
            # show all torrents including hidden and anonymous ones.
            #
            # On RSS pages in user view, show only torrents that
            # aren't hidden or anonymous no matter what
            if not same_user or rss:
                s = s.filter('term', hidden=False)
                s = s.filter('term', anonymous=False)
    # General view (homepage, general search view)
    else:
        if not admin:
            # Hide all DELETED torrents if regular user
            s = s.filter('term', deleted=False)
            # If logged in, show all torrents that aren't hidden unless they
            # belong to you. On RSS pages, show all public torrents and
            # nothing more.
            if logged_in_user and not rss:
                hiddenFilter = Q('term', hidden=False)
                userFilter = Q('term', uploader_id=logged_in_user.id)
                combinedFilter = hiddenFilter | userFilter
                s = s.filter('bool', filter=[combinedFilter])
            else:
                s = s.filter('term', hidden=False)

    if main_category:
        s = s.filter('term', main_category_id=main_cat_id)
    elif sub_category:
        s = s.filter('term', main_category_id=main_cat_id)
        s = s.filter('term', sub_category_id=sub_cat_id)

    if quality_filter == 0:
        pass
    elif quality_filter == 1:
        s = s.filter('term', remake=False)
    elif quality_filter == 2:
        s = s.filter('term', trusted=True)
    elif quality_filter == 3:
        s = s.filter('term', complete=True)

    # Apply sort
    s = s.sort(es_sort)

    # Only show first RESULTS_PER_PAGE items for RSS
    if rss:
        s = s[0:per_page]
    else:
        max_page = min(page,
                       int(math.ceil(max_search_results / float(per_page))))
        from_idx = (max_page - 1) * per_page
        to_idx = min(max_search_results, max_page * per_page)
        s = s[from_idx:to_idx]

    highlight = app.config.get('ENABLE_ELASTIC_SEARCH_HIGHLIGHT')
    if highlight:
        s = s.highlight_options(tags_schema='styled')
        s = s.highlight("display_name")

    # Return query, uncomment print line to debug query
    # from pprint import pprint
    # print(json.dumps(s.to_dict()))
    return s.execute()
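# --- Sketch of the tags_schema option used above: "styled" is a built-in
# highlighter schema that wraps matches in <em class="hlt1">...</em> tags
# instead of custom pre/post tags. Index and query are illustrative.
from elasticsearch_dsl import Search

s = Search(index='nyaa').query('match', display_name='utsu')
s = s.highlight_options(tags_schema='styled')
s = s.highlight('display_name')
# {'fields': {'display_name': {}}, 'tags_schema': 'styled'}
print(s.to_dict()['highlight'])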
def search(search_params, index, page_size, ip, request,
           filter_dead, page=1) -> Tuple[List[Hit], int, int]:
    """
    Given a set of keywords and an optional set of filters, perform a ranked
    paginated search.

    :param search_params: Search parameters. See
     :class: `ImageSearchQueryStringSerializer`.
    :param index: The Elasticsearch index to search (e.g. 'image')
    :param page_size: The number of results to return per page.
    :param ip: The user's hashed IP. Hashed IPs are used to anonymously but
    uniquely identify users exclusively for ensuring query consistency across
    Elasticsearch shards.
    :param request: Django's request object.
    :param filter_dead: Whether dead links should be removed.
    :param page: The results page number.
    :return: Tuple with a List of Hits from elasticsearch, the total count of
    pages and results.
    """
    s = Search(index=index)

    # Add requested filters.
    if 'li' in search_params.data:
        s = _filter_licenses(s, search_params.data['li'])
    elif 'lt' in search_params.data:
        s = _filter_licenses(s, search_params.data['lt'])

    if 'provider' in search_params.data:
        provider_filters = []
        for provider in search_params.data['provider'].split(','):
            provider_filters.append(Q('term', provider=provider))
        s = s.filter('bool', should=provider_filters,
                     minimum_should_match=1)
    if 'extension' in search_params.data:
        extension = search_params.data['extension']
        extension_filter = Q('term', extension=extension)
        s = s.filter('bool', should=extension_filter,
                     minimum_should_match=1)

    # It is sometimes desirable to hide content providers from the catalog
    # without scrubbing them from the database or reindexing.
    filter_cache_key = 'filtered_providers'
    filtered_providers = cache.get(key=filter_cache_key)
    if not filtered_providers:
        filtered_providers = ContentProvider.objects\
            .filter(filter_content=True)\
            .values('provider_identifier')
        cache.set(
            key=filter_cache_key,
            timeout=CACHE_TIMEOUT,
            value=filtered_providers
        )
    for filtered in filtered_providers:
        s = s.exclude('match', provider=filtered['provider_identifier'])

    # Search either by generic multimatch or by "advanced search" with
    # individual field-level queries specified.
    search_fields = ['tags.name', 'title', 'description']
    if 'q' in search_params.data:
        query = _quote_escape(search_params.data['q'])
        s = s.query(
            'query_string',
            query=query,
            fields=search_fields,
            type='most_fields'
        )
    else:
        if 'creator' in search_params.data:
            creator = _quote_escape(search_params.data['creator'])
            s = s.query(
                'query_string',
                query=creator,
                default_field='creator'
            )
        if 'title' in search_params.data:
            title = _quote_escape(search_params.data['title'])
            s = s.query(
                'query_string',
                query=title,
                default_field='title'
            )
        if 'tags' in search_params.data:
            tags = _quote_escape(search_params.data['tags'])
            s = s.query(
                'query_string',
                default_field='tags.name',
                query=tags
            )

    # Use highlighting to determine which fields contribute to the selection
    # of top results.
    s = s.highlight(*search_fields)
    s = s.highlight_options(order='score')
    s = s.extra(track_scores=True)  # Search is immutable; keep the result

    # Route users to the same Elasticsearch worker node to reduce
    # pagination inconsistencies and increase cache hits.
    s = s.params(preference=str(ip))

    # Paginate
    start, end = _get_query_slice(s, page_size, page, filter_dead)
    s = s[start:end]
    search_response = s.execute()
    results = _post_process_results(
        s,
        start,
        end,
        page_size,
        search_response,
        request,
        filter_dead
    )

    result_count, page_count = _get_result_and_page_count(
        search_response,
        results,
        page_size
    )
    return results, page_count, result_count
async def get_urls(*, projectName: str = Path(...),
                   dateRange: Optional[List[str]] = Query(['', '']),
                   urlPart: Optional[str] = '',
                   UrlId: Optional[str] = '',
                   statusFilter: Optional[List[str]] = Query(['']),
                   categoryFilter: Optional[List[str]] = Query(['']),
                   highlight: Optional[List[str]] = Query(['']),
                   showReturn: Optional[List[str]] = Query(['']),
                   currentPage: Optional[int] = 1,
                   pageSize: Optional[int] = 10):
    # Query the urls collection.
    # print(projectName, dateRange, urlPart, UrlId, currentPage, pageSize,
    #       statusFilter, categoryFilter)

    # Resolve projectName to projectId.
    projectId = await findProjectIdFromProjectName(
        dbPrefix, 'Project',
        queryDict={'projectName': projectName}, showDict={'_id': 1})
    #print(projectId)
    if not projectId:
        raise HTTPException(status_code=503, detail='projectNotExist')

    # Pagination bounds
    start = 0
    end = 0

    # The ES index to search (the equivalent of a MongoDB database).
    _index = f'kwm-{projectId}.urls'.lower()
    #print('_index', _index)

    # First, the index mapping could be updated with {'fielddata': True} so
    # that the word and topicWord fields can be operated on as a whole:
    #xindex = Index(_index, using=esconnection)
    #xindex.put_mapping(using=esconnection,
    #                   body={"properties": {"rootUrl": {"type": "text",
    #                                                    "fielddata": True}}})

    s = Search()

    # urlPart: wildcard matching, with the URL split on its separators ('/')
    if urlPart:
        urlPart = urlPart.replace(':', '')
        urlParts = urlPart.split('/')
        all = []
        for ele in urlParts:
            all.extend(ele.split('.'))
        q = ''
        for urlPart in all:
            if urlPart:
                q += f'Q("wildcard", rootUrl=f"*{urlPart.strip()}*") &'
        #q = Q("wildcard", rootUrl=f"*{urlPart.strip()}*") & Q("wildcard", rootUrl=f"*{urlPart.strip()}*")
        q = q.rstrip('&')
        s = s.query(eval(q))

    # category
    if categoryFilter != ['']:
        categoryFilter = unquote(categoryFilter[0], 'utf-8').split(',')
        # print(categoryFilter)
        categoryFilter = '\"' + '\" \"'.join(categoryFilter) + '\"'
        #print('ccc', categoryFilter)
        q = Q("query_string", query=categoryFilter, fields=['category'])
        s = s.query(q)

    # status filter
    if statusFilter != ['']:
        statusFilter = unquote(statusFilter[0], 'utf-8').split(',')
        statusFilter = '\"' + '\" \"'.join(statusFilter) + '\"'
        #print('ccc', statusFilter)
        q = Q("query_string", query=f"{statusFilter}", fields=['status'])
        s = s.query(q)

    # dateRange: its encoding is awkward, so test it twice (before and
    # after unquoting).
    if dateRange != ['', '']:
        dateRange = unquote(dateRange[0], 'utf-8').split(',')
        #print('dateRange', dateRange)
        if dateRange != ['', '']:
            # s.query('range', **{'timestamp': {...}}) would also work;
            # Q is used here for consistency.
            r = Q('range', **{'modifiedTime': {'gte': dateRange[0],
                                               'lt': dateRange[1]}})
            s = s.query(r)

    # Sort settings: build the sort expression, if sorting is requested
    s = s.source(includes=[])

    # Which fields to return
    if showReturn != ['']:
        showReturn = unquote(showReturn[0], 'utf-8').split(',')
        s = s.source(includes=showReturn)
    else:
        s = s.source(includes=[])

    # Which fields to highlight
    if highlight != ['']:
        #highlight = ['rootUrl']
        highlight = unquote(highlight[0], 'utf-8').split(',')
        #print(highlight)
        s = s.highlight_options(order='score')
        s = s.highlight_options(pre_tags="<strong>")
        s = s.highlight_options(post_tags="</strong>")
        for ele in highlight:
            # add each highlighted field one by one
            s = s.highlight(ele)

    # Pagination
    if currentPage == 0 and pageSize == 0:
        # Return everything. 10000 is hard-coded here; ES errors beyond it.
        # A scan would be the proper fix, but scan does not sort, so that is
        # left for later.
        s = s[0:10000]
    else:
        start = (currentPage - 1) * pageSize
        end = start + pageSize
        s = s[start:end]

    # common setting
    #print(s.to_dict())

    # Execute
    try:
        response = await esRun(s.to_dict(), _index)  #s.execute(ignore_cache=True)
    except Exception as e:
        print(e)
        return ({'count': 0, 'content': []})
    else:
        totalCount = response.hits.total.value
        temp = response.to_dict()['hits']['hits']
        result = []
        for item in temp:
            tt = {'_id': {'$oid': item['_id']}}
            tt.update(item['_source'])
            if item.get('highlight'):
                tt.update({'highlight': item['highlight']})
            if start >= 0 and end > 0:
                tt.update({'id': start + 1})
            result.append(tt)
            start = start + 1
        #print('final', result)
        return ({'count': totalCount, 'content': result})
def results(page):
    global tmp_name
    global tmp_pinyin
    global tmp_zodiac
    global tmp_difficulty
    global tmp_sentiment
    global tmp_char_num
    global gresults
    # Convert the <page> parameter in the URL to an integer.
    if type(page) is not int:
        page = int(page)
    # If the request is a POST (initial query), store the query in the module-level globals.
    if request.method == 'POST':
        name_query = request.form['name']
        pinyin_query = request.form['pinyin']
        zodiac_query = request.form['zodiac']
        difficulty_query = request.form['difficulty']
        sentiment_query = request.form['sentiment']
        char_num_query = request.form['char_num']
        tmp_name = name_query
        tmp_pinyin = pinyin_query
        tmp_zodiac = zodiac_query
        tmp_difficulty = difficulty_query
        tmp_sentiment = sentiment_query
        tmp_char_num = char_num_query
    else:
        # On pagination (GET), reuse the previously stored query.
        name_query = tmp_name
        pinyin_query = tmp_pinyin
        zodiac_query = tmp_zodiac
        difficulty_query = tmp_difficulty
        sentiment_query = tmp_sentiment
        char_num_query = tmp_char_num
    shows = {
        'name': name_query,
        'pinyin': pinyin_query,
        'zodiac': zodiac_query,
        'difficulty': difficulty_query,
        'sentiment': sentiment_query,
        'char_num': char_num_query,
    }
    s = Search(index='idioms_search')
    if len(name_query) > 0:
        s = s.query('multi_match', query=name_query, type='cross_fields',
                    fields=['name^4', 'english^4', 'desc_segmentation',
                            'desc_translation', 'synonym^2',
                            'source_translation', 'source_segmentation^2',
                            'story_translation', 'story_segmentation',
                            'usage_translation', 'usage_segmentation'],
                    operator='and')
    if len(pinyin_query) > 0:
        s = s.query(Q('match', pinyin={'query': pinyin_query, 'operator': 'and'}))
    if len(zodiac_query) > 0:
        s = s.query(Q('match', zodiac=zodiac_query))
    if len(difficulty_query) > 0:
        s = s.query(Q('match', difficulty=difficulty_query))
    if len(sentiment_query) > 0:
        s = s.query(Q('match', sentiment=sentiment_query))
    if len(char_num_query) > 0:
        s = s.query(Q('match', char_num=char_num_query))
    s = s.highlight_options(pre_tags='<mark>', post_tags='</mark>')
    for field in ('name', 'pinyin', 'english', 'zodiac',
                  'desc_segmentation', 'desc_translation',
                  'source_segmentation', 'source_translation',
                  'usage_segmentation', 'usage_translation'):
        s = s.highlight(field, fragment_size=999999999, number_of_fragments=5)
    for field in ('story_segmentation', 'story_translation'):
        s = s.highlight(field, fragment_size=999999999, number_of_fragments=20)
    for field in ('difficulty', 'sentiment'):
        s = s.highlight(field, fragment_size=999999999, number_of_fragments=1)
    # Determine the subset of results to display (based on the current <page> value).
    start = (page - 1) * 10
    end = start + 10
    response = s[start:end].execute()
    # if response.hits.total == 0:
    #     # if conjunction failed, make the query disjunctive for text field
    #     search = Search(index='idioms_search')
    resultList = {}
    for hit in response.hits:
        result = dict()
        result['score'] = hit.meta.score
        # Reset per hit so highlight fragments do not leak between results.
        translation_hits = []
        source_translation_hits = []
        story_translation_hits = []
        usage_translation_hits = []
        if 'highlight' in hit.meta:
            # Prefer the first highlight fragment; fall back to the stored field.
            for field in ('name', 'english', 'pinyin', 'difficulty', 'sentiment'):
                if field in hit.meta.highlight:
                    result[field] = hit.meta.highlight[field][0]
                else:
                    result[field] = getattr(hit, field)
            # zodiac keeps the full fragment list when highlighted.
            if 'zodiac' in hit.meta.highlight:
                result['zodiac'] = hit.meta.highlight.zodiac
            else:
                result['zodiac'] = hit.zodiac
            if 'desc_translation' in hit.meta.highlight:
                result['desc_translation'] = hit.meta.highlight.desc_translation[0]
                translation_hits = [re.sub(r'</?mark>', '', t)
                                    for t in hit.meta.highlight.desc_translation]
            if 'source_translation' in hit.meta.highlight:
                result['source_translation'] = hit.meta.highlight.source_translation[0]
                source_translation_hits = [re.sub(r'</?mark>', '', t)
                                           for t in hit.meta.highlight.source_translation]
            if 'story_translation' in hit.meta.highlight:
                result['story_translation'] = hit.meta.highlight.story_translation[0]
                story_translation_hits = [re.sub(r'</?mark>', '', t)
                                          for t in hit.meta.highlight.story_translation]
            if 'usage_translation' in hit.meta.highlight:
                result['usage_translation'] = hit.meta.highlight.usage_translation[0]
                usage_translation_hits = [re.sub(r'</?mark>', '', t)
                                          for t in hit.meta.highlight.usage_translation]
        else:
            result['name'] = hit.name
            result['pinyin'] = hit.pinyin
            result['english'] = hit.english
            # result['description'] = hit.description
            result['zodiac'] = hit.zodiac
            result['difficulty'] = hit.difficulty
            result['sentiment'] = hit.sentiment
        # Look up segmentation and sentence codes for each text section.
        sgmt = json_data[hit.meta.id]['Description_Segmentation']
        sent_code = json_data[hit.meta.id]['Description_Sentence_Code']
        sgmt_dict = dict()
        src_sgmt = json_data[hit.meta.id]['Source_Segmentation']
        src_sent_code = json_data[hit.meta.id]['Source_Sentence_Code']
        src_sgmt_dict = dict()
        story_sgmt = json_data[hit.meta.id]['Story_Segmentation']
        story_sent_code = json_data[hit.meta.id]['Story_Sentence_Code']
        story_sgmt_dict = dict()
        usage_sgmt = json_data[hit.meta.id]['Usage_Segmentation']
        usage_sent_code = json_data[hit.meta.id]['Usage_Sentence_Code']
        usage_sgmt_dict = dict()
        translation_index_hits = find_translations(translation_hits, sent_code)
        translation_src_hits = find_translations(source_translation_hits, src_sent_code)
        translation_story_hits = find_translations(story_translation_hits, story_sent_code)
        translation_usage_hits = find_translations(usage_translation_hits, usage_sent_code)
        make_sgmt_dict(sgmt, sgmt_dict)
        make_sgmt_dict(src_sgmt, src_sgmt_dict)
        make_sgmt_dict(story_sgmt, story_sgmt_dict)
        make_sgmt_dict(usage_sgmt, usage_sgmt_dict)
        result['desc_segmentation'] = sgmt_dict
        result['desc_sentence_code'] = sent_code
        result['translation_hits'] = translation_index_hits
        result['source_segmentation'] = src_sgmt_dict
        result['source_sentence_code'] = src_sent_code
        result['source_translation_hits'] = translation_src_hits
        result['story_segmentation'] = story_sgmt_dict
        result['story_sentence_code'] = story_sent_code
        result['story_translation_hits'] = translation_story_hits
        result['usage_segmentation'] = usage_sgmt_dict
        result['usage_sentence_code'] = usage_sent_code
        result['usage_translation_hits'] = translation_usage_hits
        resultList[hit.meta.id] = result
    # Make the result list available globally.
    gresults = resultList
    # Total number of matching results.
    result_num = response.hits.total
    # If results were found, render them; otherwise report the failed lookup.
    message = []
    if result_num > 0:
        if result_num > 500:
            message.append('Over 500 search results! We recommend you narrow your search.')
        return render_template('page_SERP.html', results=resultList,
                               res_num=result_num, page_num=page, queries=shows,
                               zodiac=zodiac, sentiment=sentiment,
                               difficulty=difficulty, char_num=char_num,
                               warning=message, json_data=json_data)
    else:
        warning = None
        message.append('One of the fields you typed in cannot be found.')
        return render_template('page_SERP.html', results=message,
                               res_num=result_num, page_num=page, queries=shows,
                               warning=warning, zodiac=zodiac,
                               sentiment=sentiment, difficulty=difficulty,
                               char_num=char_num, json_data=json_data)
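
# results() relies on two helpers defined elsewhere in the app. The shapes of
# sent_code and sgmt are not shown in this file, so the following is only a
# minimal sketch of plausible implementations, assuming sent_code maps a
# sentence code to its English translation and sgmt maps the same codes to
# segmented source sentences; the real versions may differ.
def find_translations(translation_hits, sent_code):
    """Hypothetical: return the codes of sentences whose translation text
    contains one of the (de-highlighted) matched fragments."""
    return [code for code, sentence in sent_code.items()
            if any(fragment in sentence for fragment in translation_hits)]

def make_sgmt_dict(sgmt, sgmt_dict):
    """Hypothetical: split each segmented sentence into a token list,
    keyed by its sentence code, for template-side alignment."""
    for code, sentence in sgmt.items():
        sgmt_dict[code] = sentence.split(' ')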
async def getBasicWords(*, projectName: str = Path(...),
                        dateRange: Optional[List[str]] = Query(['', '']),
                        basicWordItemId: Optional[str] = None,
                        highlight: Optional[List[str]] = Query(['']),
                        showReturn: Optional[List[str]] = Query(['']),
                        statusFilter: Optional[List[str]] = Query(['']),
                        lengthFilter: Optional[List[str]] = Query(['']),
                        weightFilter: Optional[List[str]] = Query(['']),
                        categoryFilter: Optional[List[str]] = Query(['']),
                        wordPart: Optional[str] = None,
                        sortDict: Optional[str] = '{}',
                        fullMatch: Optional[bool] = False,
                        currentPage: Optional[int] = 1,
                        pageSize: Optional[int] = 10):
    # Query all matching basic words under this project name, via ES.
    # Resolve projectName to projectId.
    projectId = await findProjectIdFromProjectName(
        dbPrefix, 'Project',
        queryDict={'projectName': projectName}, showDict={'_id': 1})
    if not projectId:
        raise HTTPException(status_code=503, detail='projectNotExist')
    # Pagination bounds.
    start = 0
    end = 0
    # The ES index to search (equivalent to a database in Mongo).
    _index = f'kwm-{projectId}.basicwords'.lower()
    s = Search()
    # wordPart
    if wordPart:
        q = Q('multi_match', query=wordPart.strip(), fields=['word'])
        s = s.query(q)
    # category
    if categoryFilter != ['']:
        categoryFilter = unquote(categoryFilter[0], 'utf-8').split(',')
        categoryFilter = '"' + '" "'.join(categoryFilter) + '"'
        q = Q('query_string', query=categoryFilter, fields=['category'])
        s = s.query(q)
    # status
    if statusFilter != ['']:
        statusFilter = unquote(statusFilter[0], 'utf-8').split(',')
        statusFilter = '"' + '" "'.join(statusFilter) + '"'
        q = Q('query_string', query=statusFilter, fields=['status'])
        s = s.query(q)
    # dateRange arrives URL-encoded as a single element, so it is checked
    # both before and after decoding.
    if dateRange != ['', '']:
        dateRange = unquote(dateRange[0], 'utf-8').split(',')
        if dateRange != ['', '']:
            r = Q('range', **{'timestamp': {'gte': dateRange[0], 'lt': dateRange[1]}})
            s = s.query(r)
    # Length filter: map each code to a [gte, lt) bucket and OR the ranges together.
    if lengthFilter != ['']:
        lengthFilter = unquote(lengthFilter[0], 'utf-8').split(',')
        lengthDict = {
            '1': [0, 3], '2': [3, 5], '3': [5, 8],
            '4': [8, 13], '5': [13, 18], '6': [18, 25],
        }
        q = None
        for ele in lengthFilter:
            r = Q('range', **{'Length': {'gte': lengthDict[ele][0],
                                         'lt': lengthDict[ele][1]}})
            q = r if q is None else q | r
        s = s.query(q)
    # Weight filter, same pattern.
    if weightFilter != ['']:
        weightFilter = unquote(weightFilter[0], 'utf-8').split(',')
        weightDict = {
            '1': [0, 0.3], '2': [0.3, 0.5], '3': [0.5, 1], '4': [1, 5],
            '5': [5, 10], '6': [10, 20], '7': [20, 50],
        }
        q = None
        for ele in weightFilter:
            r = Q('range', **{'weight': {'gte': weightDict[ele][0],
                                         'lt': weightDict[ele][1]}})
            q = r if q is None else q | r
        s = s.query(q)
    # Sorting: build sort expressions if the frontend sent any.
    sortMap = {'desc': -1, 'asc': 1}
    if sortDict != '{}':
        # The frontend sent sort info; drop entries whose direction is 'normal'.
        sortDict = json.loads(sortDict)
        for ele in list(sortDict.keys()):
            if sortDict[ele] == 'normal':
                sortDict.pop(ele)
        if sortDict != {}:
            sortDict = [(ele, sortMap[sortDict[ele]]) for ele in sortDict]
        else:
            sortDict = []
        # Build the sort directives: a '-' prefix means descending.
        sorts = []
        for ss in sortDict:
            if ss[1] == 1:
                sorts.append(ss[0])
            else:
                sorts.append('-' + ss[0])
        s = s.sort(*sorts)
    else:
        s = s.sort('_id')
    # Which fields to return.
    if showReturn != ['']:
        showReturn = unquote(showReturn[0], 'utf-8').split(',')
        s = s.source(includes=showReturn)
    else:
        s = s.source(includes=[])
    # Which fields to highlight.
    if highlight != ['']:
        highlight = unquote(highlight[0], 'utf-8').split(',')
        s = s.highlight_options(order='score')
        s = s.highlight_options(pre_tags="<strong>")
        s = s.highlight_options(post_tags="</strong>")
        for ele in highlight:
            s = s.highlight(ele)
    # Pagination.
    if currentPage == 0 and pageSize == 0:
        # Return everything. Hard-coded cap of 10000: anything beyond errors
        # out. scan() would avoid the cap but does not sort; revisit later.
        s = s[0:10000]
    else:
        start = (currentPage - 1) * pageSize
        end = start + pageSize
        s = s[start:end]
    # Execute.
    try:
        response = await esRun(s.to_dict(), _index)
    except Exception as e:
        print(e)
        return {'count': 0, 'content': []}
    else:
        totalCount = response.hits.total.value
        temp = response.to_dict()['hits']['hits']
        result = []
        for item in temp:
            tt = {'_id': {'$oid': item['_id']}}
            tt.update(item['_source'])
            if item.get('highlight'):
                tt.update({'highlight': item['highlight']})
            if start >= 0 and end > 0:
                tt.update({'id': start + 1})
            result.append(tt)
            start = start + 1
        return {'count': totalCount, 'content': result}
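
# esRun is an async helper defined elsewhere. A minimal sketch, assuming an
# AsyncElasticsearch client wrapped in elasticsearch_dsl's Response so callers
# can use both attribute access (response.hits.total.value) and .to_dict();
# the client variable `es` and its configuration are assumptions.
from elasticsearch import AsyncElasticsearch
from elasticsearch_dsl import Search
from elasticsearch_dsl.response import Response

es = AsyncElasticsearch()  # hypothetical client; real setup lives elsewhere

async def esRun(query_body: dict, index: str) -> Response:
    """Execute a raw query dict against `index` and wrap the raw result."""
    raw = await es.search(index=index, body=query_body)
    return Response(Search(), raw)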
def search_by_fields(self, title, authors, abstract, content, subject):
    search = Search(using=self.es, index='arxiv-index')
    query_title = Q()
    query_authors = Q()
    query_subject = Q()
    query_other = Q()
    query_abstract = Q()
    query_content = Q()
    if title:
        title = re.sub('[^A-Za-z0-9 ]+', '', title).lower()
        for word in title.split(' '):
            query_title = query_title + Q('wildcard', title='*' + word + '*')
    if authors:
        authors = re.sub('[^A-Za-z0-9 ]+', '', authors).lower()
        for author in authors.split(' '):
            query_authors = query_authors + Q('wildcard', authors='*' + author + '*')
    if subject and subject != 'all':
        query_subject = Q('wildcard', subject='*' + subject + '.*')
        query_other = Q('wildcard', other_subjects='*' + subject + '.*')
    if abstract:
        abstract = re.sub('[^A-Za-z0-9 ]+', '', abstract).lower()
        for word in abstract.split(' '):
            query_abstract = query_abstract + Q('wildcard', abstract='*' + word + '*')
    if content:
        content = re.sub('[^A-Za-z0-9 ]+', '', content).lower()
        for word in content.split(' '):
            query_content = query_content + Q('wildcard', pdf='*' + word + '*')
    final_query = Q('bool',
                    must=[query_title, query_authors, query_subject],
                    should=[query_abstract, query_content, query_other],
                    minimum_should_match=2)
    # Fetch every matching document in a single page.
    total = search.count()
    search = search[0:total]
    search = search.query(final_query)
    search = search.source([
        'title', 'authors', 'subject', 'other_subjects', 'abstract',
        'abstract_url', 'pdf_url', 'submit_date'
    ])
    if content:
        search = self._extend_query(search, content)
    if abstract:
        search = self._extend_query(search, abstract)
    search = search.highlight_options(order='score')
    search = search.highlight('abstract', fragment_size=400)
    request = search.execute()
    for hit in request:
        response = hit.to_dict()
        if 'highlight' in hit.meta:
            response.update({'fragment': hit.meta.highlight.abstract})
        else:
            response.update({'fragment': []})
        yield response
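
# Because search_by_fields is a generator of plain dicts, a caller can stream
# results as they arrive. A hypothetical driver; the ArxivSearcher wrapper
# class and the exact field values are assumptions, not part of the original.
searcher = ArxivSearcher()  # hypothetical object exposing search_by_fields
for paper in searcher.search_by_fields(title='attention', authors='vaswani',
                                       abstract='', content='transformer',
                                       subject='cs'):
    print(paper['title'], paper['pdf_url'])
    if paper['fragment']:
        # First highlighted abstract fragment, if the abstract matched.
        print('  ...', paper['fragment'][0])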
def run_query(q, model, size, offset=0, facets={}, fuzzy=False,
              connection="default", page=None):
    """Run an Elasticsearch query.

    Arguments:
        q (str): the string to search
        model (str): one of 'chem', 'puc', 'product', or 'datadocument'
        size (int): the number of objects to return
        offset (optional int): the value to start at [default=0]
        page (optional int): the Django paginator page to return [default=None]
        facets (optional dict): a key, value pair to filter on. value can be
            a str or a list of strings [default={}]
            e.g. {'datadocument_grouptype': 'CO'}
            or {'datadocument_grouptype': ['CO', 'FU']}
        fuzzy (optional bool): enable fuzzy search [default=False]
        connection (optional str): which Elasticsearch instance to use
            [default="default"]

    Returns:
        {
            'hits': a list of results,
            'facets': a dictionary of facets,
            'took': time in seconds of search,
            'total': total results found
        }
    """
    # make sure the model is valid
    validate_model(model)
    # get the index to search based on the ELASTICSEARCH setting and connection
    index = settings.ELASTICSEARCH.get(connection, {}).get("INDEX", "_all")
    # get the search object
    s = Search(using=connection, index=index)
    # filter on the facets
    for term, filter_array in facets.items():
        s = s.filter("terms", **{term: filter_array})
    # pull relevant fields
    id_field = get_id_field(model)
    fields = FIELD_DICT[id_field]
    # filter out null ids
    s = s.filter("exists", field=id_field)
    # enable highlighting
    s = s.highlight_options(order="score")
    s = s.highlight("*")
    # add the query, with optional fuzziness
    if fuzzy:
        s = s.query(MultiMatch(query=q, fields=fields, fuzziness="AUTO"))
    else:
        s = s.query(MultiMatch(query=q, fields=fields))
    # collapse on id_field, counting distinct related ids via inner hits
    dict_update = {}
    inner_hits = []
    for f in list(FIELD_DICT.keys()) + ["rawchem_id"]:
        inner_hits.append({"name": f, "collapse": {"field": f}, "size": 0})
    dict_update.update({"collapse": {"field": id_field, "inner_hits": inner_hits}})
    # set the size of the result
    if page is not None:
        dict_update.update({"size": 0, "from": 0})
    else:
        dict_update.update({"size": size, "from": offset})
    s.update_from_dict(dict_update)
    # aggregate facets
    for facet in FACETS:
        a = A("terms", field=facet)
        a.metric("unique_count", "cardinality", field=id_field)
        s.aggs.bucket(facet, a)
    # add a cardinality aggregation on id_field to get the unique total count
    s.aggs.bucket(TOTAL_COUNT_AGG, A("cardinality", field=id_field))
    # execute the search
    response = s.execute().to_dict()
    # gather the hits
    results_hits = []
    for h in response["hits"]["hits"]:
        results_hits.append({
            "id": h["_source"][id_field],
            "num_rawchem": h["inner_hits"]["rawchem_id"]["hits"]["total"]["value"],
            "num_truechem": h["inner_hits"]["truechem_dtxsid"]["hits"]["total"]["value"],
            "num_datadocument": h["inner_hits"]["datadocument_id"]["hits"]["total"]["value"],
            "num_product": h["inner_hits"]["product_id"]["hits"]["total"]["value"],
            "num_puc": h["inner_hits"]["puc_id"]["hits"]["total"]["value"],
            "highlights": h["highlight"],
            "source": h["_source"],
        })
    # available facets
    results_facets = {}
    response_aggs = response["aggregations"]
    for facet in FACETS:
        results_facets[facet] = [
            {"key": b["key"], "count": b["unique_count"]["value"]}
            for b in response_aggs[facet]["buckets"]
        ]
    # get the unique total count
    length = response_aggs[TOTAL_COUNT_AGG]["value"]
    # replace hits with a paginator page when one was requested; pass the
    # caller's connection through instead of hardcoding "default"
    if page is not None:
        espaginator = ElasticPaginator(length, q, model, facets, fuzzy,
                                       connection=connection)
        results_hits = Paginator(espaginator, size).get_page(page)
    return {
        "hits": results_hits,
        "facets": results_facets,
        "took": response["took"] / 1000,
        "total": length,
    }
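
# ElasticPaginator is defined elsewhere. Django's Paginator only needs a
# sliceable object with a length, so a minimal sketch might look like the
# following; re-running run_query per requested slice is one plausible
# strategy, not necessarily the project's actual implementation.
class ElasticPaginator:
    """Hypothetical lazy object_list backed by run_query."""

    def __init__(self, length, q, model, facets, fuzzy, connection="default"):
        self.length = length
        self.q = q
        self.model = model
        self.facets = facets
        self.fuzzy = fuzzy
        self.connection = connection

    def __len__(self):
        # Paginator uses this to compute num_pages without fetching hits.
        return self.length

    def __getitem__(self, item):
        # Paginator slices the object_list when a Page is materialized.
        return run_query(self.q, self.model, item.stop - item.start,
                         offset=item.start, facets=self.facets,
                         fuzzy=self.fuzzy, connection=self.connection)["hits"]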
def query(word: str, page: int, size: int, post_type: str, boards: list,
          sort: str, order: str, start: datetime = None, end: datetime = None,
          pos: bool = False, window_size: int = 10) -> dict:
    """Query word."""
    s = Search(using=client, index='ptt')
    must = [Q('match', content=word)]
    if isinstance(post_type, str):
        must.append(Q('match', post_type=int(post_type)))
    s.query = Q(
        'bool',
        must=must,
        should=[Q('match', board=board) for board in boards],
        minimum_should_match=1,
    )
    # sort and order
    s = s.sort({sort: {'order': order}})
    # filter date range
    s = s.filter('range', published={'gte': start, 'lte': end})
    # highlight
    s = s.highlight_options(number_of_fragments=0)
    s = s.highlight('content')
    total = s.count()
    left_bound = page * size
    right_bound = left_bound + size
    data = []
    if total:
        for i in s[left_bound:right_bound]:
            d = i.to_dict()
            if pos:
                # POS-tag every token, mark the query word, and slice a
                # window around the first match.
                segments = j.seg(d['content'], pos=True)
                left = right = None
                for idx, (char, tag) in enumerate(segments):
                    segments[idx] = f'{char}|{tag}'
                    if char == word:
                        segments[idx] = f'<em>{segments[idx]}</em>'
                        left = max(idx - window_size, 0)
                        right = idx + window_size + 1
                        break
                # Guard against the query word not appearing in the
                # segmentation, which would otherwise leave left/right unbound.
                if left is not None:
                    d['concordance'] = (
                        ' '.join(segments[left:idx]),
                        segments[idx],
                        ' '.join(f'{char}|{tag}'
                                 for (char, tag) in segments[idx + 1:right]),
                    )
            else:
                concordance = i.meta.highlight.content[0].replace('\n ', '')
                concordance = concordance.split(' ')
                for idx, token in enumerate(concordance):
                    if token.startswith('<em>'):
                        left = max(idx - window_size, 0)
                        right = idx + window_size + 1
                        d['concordance'] = (
                            ' '.join(concordance[left:idx]),
                            concordance[idx],
                            ' '.join(concordance[idx + 1:right]),
                        )
                        break
            data.append(d)
    return {
        'total': total,
        'page': page,
        'size': size,
        'data': data,
    }
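
# A hypothetical call; the index name and module-level `client`/`j` segmenter
# come from the code above, but the board names, dates, and query word here
# are illustrative only. Assumes `from datetime import datetime` is in scope.
results = query(word='地震', page=0, size=20, post_type='1',
                boards=['Gossiping', 'Baseball'], sort='published',
                order='desc', start=datetime(2020, 1, 1),
                end=datetime(2020, 12, 31), pos=False)
print(results['total'], len(results['data']))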