def index_clusters():
    """ Index clusters in elasticsearch """
    es = get_es()
    # Large position_increment_gap keeps phrase queries from matching
    # across separate keyword/item entries in the same array field.
    mapping = {
        __c_doc_type__: {
            'properties': {
                'keywords': {"type": "string", "position_increment_gap": 100},
                'items': {"type": "string", "position_increment_gap": 100},
            }
        }
    }
    es.indices.put_mapping(index=__index_name__, doc_type=__c_doc_type__,
                           body=mapping)

    clusters = LoadClusters()
    for query in clusters.get_queries()['queries']:
        for cluster in clusters.get_clusters(query)['children']:
            # One document per cluster: its name, keywords, and the
            # names of the listings it groups.
            item_names = [child['name'] for child in cluster['children']]
            es.index(
                index=__index_name__,
                doc_type=__c_doc_type__,
                body={
                    'name': '{query}-{cluster}'.format(
                        query=query, cluster=cluster['name']),
                    'keywords': cluster['keywords'],
                    'items': item_names,
                },
            )
def get_news(day):
    """Fetch the news published on a single day (technology section).

    Queries the ``es_news_pub`` index for documents whose ``datetime``
    term equals *day*, keeps the hits whose ``category`` is
    ``technology_news``, and returns them as a JSON response.

    Fixes: the original placed the docstring AFTER the import, making it
    a dead string statement; the bare ``except:`` is narrowed to
    ``except Exception`` so SystemExit/KeyboardInterrupt still propagate.
    """
    # Function-scope import preserved from the original.
    from es import get_es
    try:
        es, index, doc_type = get_es('es_news_pub')
        body = {
            "query": {
                "term": {
                    "datetime": day
                }
            }
        }
        res = es.search(index=index, doc_type=doc_type, body=body)
        # "Technology news" section: filter the hits by category.
        technology_news = []
        for r in res['hits']['hits']:
            tmp = r['_source']
            print(tmp)
            if tmp.get("category") == "technology_news":
                technology_news.append(tmp)
        package = {
            "technology_news": technology_news
        }
        return jsonify(
            success=True,
            data=package
        )
    except Exception:
        # Deliberately best-effort: log the traceback and fall through
        # (implicitly returns None), matching the original contract.
        traceback.print_exc()
def main():
    """Index every parsed article into both the kr36 and the pub index.

    Bug fix: the original unpacked both ``get_es()`` calls into the SAME
    ``index``/``doc_type`` variables, so the kr36 client actually wrote
    into the pub index's name/type. Each target now keeps its own
    settings. The bare ``except:`` is also narrowed to ``Exception``.
    """
    res = parse()
    kr36_es, kr36_index, kr36_doc_type = get_es('es_news_kr36')
    pub_es, pub_index, pub_doc_type = get_es('es_news_pub')
    for item in res:
        try:
            # The article title doubles as the document id, which
            # de-duplicates repeated runs over the same feed.
            kr36_es.index(index=kr36_index, doc_type=kr36_doc_type,
                          body=item, id=item['title'])
            # Also store a copy in the shared "pub" index.
            pub_es.index(index=pub_index, doc_type=pub_doc_type,
                         body=item, id=item['title'])
        except Exception:
            # Best effort: report the failing title and keep going.
            print(item['title'])
def facet():
    """Run a terms facet over the ``text`` field of ``legal-index``.

    Matches all documents, collects the 100 most frequent terms,
    pretty-prints the facet section of the response, and returns it.
    """
    es = get_es()
    body = {
        "query": {"match_all": {}},
        "facets": {
            "text": {
                "terms": {"field": "text", "size": 100}
            }
        },
    }
    facet_results = es.search(index='legal-index', body=body)['facets']
    pp.pprint(facet_results)
    return facet_results
def __init__(self, file_name):
    """Remember the target file name and attach an Elasticsearch client."""
    self.file_name = file_name
    self.es = get_es()
#!/usr/bin/env python
import re
import pprint
from es import get_es
import sys
from search.models import Phrase
import json

pp = pprint.PrettyPrinter(indent=4)
es = get_es()


def get_autocomplete(term):
    """Return a JSON array of completion suggestions for *term*.

    Queries the ``suggest`` completion field of the ``autocomplete``
    index and shapes each option as an ``{id, label, value}`` record
    (the jQuery-UI autocomplete wire format). The result list is
    pretty-printed before being serialized.
    """
    suggest_body = {
        "text-suggest": {
            "text": term,
            "completion": {
                "field": "suggest"
            }
        }
    }
    res = es.suggest(index='autocomplete', body=suggest_body)
    result = [
        {
            "id": option['text'].encode('utf-8'),
            "label": option['text'].encode('utf-8'),
            "value": option['text'].encode('utf-8'),
        }
        for option in res['text-suggest'][0]['options']
    ]
    pp.pprint(result)
    return json.dumps(result)
def search(search_term,result,start_result=0):
    """Full-text search over ``legal-index`` with highlighting and a
    did-you-mean phrase suggestion.

    Parameters: ``search_term`` is the raw user query, ``result`` is the
    page size, ``start_result`` the offset. Returns a dict with
    ``result_list``, ``result_count`` and, when available,
    ``suggestion`` / ``suggestion_urlencode``.

    NOTE(review): ``urllib.quote`` / ``urllib.urlencode`` and the
    ``.encode('utf-8')``-then-``.replace`` cleanup are Python 2 idioms —
    on Python 3 the encodes yield bytes and the replaces would fail;
    confirm target interpreter. The loop variable ``re`` shadows the
    ``re`` module inside this function.
    """
    es = get_es()
    # Split out the parts of the query that are not inside double quotes
    # and let the phraser propose additional auto-detected phrases.
    unquoted = dequoter(search_term).split('""')
    auto_phrases = []
    for u in unquoted:
        auto_phrases.extend(phraser(u.rstrip()))
    query_search_term = search_term
    if len(auto_phrases) > 0:
        # Append the detected phrases to broaden the query.
        query_search_term = search_term + ' ' + ' '.join(auto_phrases)
    pp.pprint(query_search_term)
    # Query body: query_string search plus highlighting on `text` and a
    # phrase suggester for "did you mean" corrections.
    q = {
        "fields" : ["title","urlAddress","text"],
        "from" : start_result,
        "size" : result,
        "min_score": 0.1,
        "query": {
            "query_string": {
                "query": query_search_term,
            }
        },
        "highlight": {
            "pre_tags" : ["<b>"],
            "post_tags" : ["</b>"],
            "fields": {
                "text": {
                    "fragment_size" : 150,
                    "number_of_fragments": 4,
                    "no_match_size": 150,
                    # Highlight against the original (unquoted) term,
                    # not the phrase-augmented query.
                    "highlight_query": {
                        "query_string":{
                            "query":search_term.replace('"','')
                        }
                    }
                }
            }
        },
        "suggest" : {
            "text" : search_term,
            "simple_phrase" : {
                "phrase" : {
                    "field" : "text",
                    "size" : 1,
                    "real_word_error_likelihood" : 0.95,
                    "max_errors" : 0.5,
                    "gram_size" : 2,
                    "direct_generator" : [ {
                        "field" : "text",
                        "suggest_mode" : "popular",
                        "min_word_len" : 1,
                        "min_doc_freq" : 0.01,
                        "max_term_freq" : 0.01
                    } ]
                }
            }
        }
    }
    res = es.search(index="legal-index", body=q)
    # NOTE: rebinds the `result` parameter (page size) as the return dict.
    result = {}
    try:
        # Best-effort: attach the top phrase suggestion when present.
        suggested =res['suggest']['simple_phrase'][0]['options'][0]['text'].encode('utf-8')
        result.update({'suggestion': suggested})
        result.update({'suggestion_urlencode': urllib.quote(suggested)})
        pp.pprint(urllib.urlencode(suggested))
    except:
        # NOTE(review): bare except — any missing-suggestion shape is
        # silently ignored.
        pass
    r = res['hits']['hits']
    l = []
    for re in r:
        # Strip repr()-style artifacts (u'...', escaped whitespace) from
        # the stored title; the replace chain is order-dependent.
        d = {"urlAddress" : re['fields']['urlAddress'],
            "title" : str(re['fields']['title'][3:-2].encode('utf-8').replace("\\n",'').replace("u'","").replace("' ","").replace("\\r","").replace("',","").replace("\\t","").replace('u"','').replace('\u2019','')),
            "id" : re['_id'],
            "score" : re['_score'],}
        if d['title'] == '':
            # Fall back to the URL when the cleaned title is empty.
            d['title'] = d['urlAddress']
        try:
            h_list = []
            for h in re['highlight']['text']:
                # Same repr-artifact cleanup for each highlight fragment.
                h = h.encode('utf-8','ignore').replace("\\n",'').replace("u'","").replace("' ","")\
                    .replace("\\r","").replace("',","").replace("\\t","").replace('u"','').replace('\u2019','')
                # Drop leading punctuation left over from fragmenting.
                if h[:2] in ['. ',') ',', ',"' ",")."]:
                    h = h[2:].strip()
                h_list.append(h)
            d.update({"highlight" : h_list})
        except:
            # NOTE(review): bare except — hits without highlights are
            # silently skipped.
            pass
        l.append(d)
    result_count = res['hits']['total']
    if not result_count:
        result_count = 0
        pp.pprint('result count zero')
    result.update({'result_list' : l})
    result.update({'result_count' : result_count})
    return result
def __init__(self, seed_title, quality=False):
    """Store the seed title and quality flag; attach an ES client."""
    self.es = get_es()
    self.quality = quality
    self.seed = seed_title
#!/usr/bin/env python import re import pprint import sys sys.path.append('/home/ec2-user/bblio/build/') sys.path.append('/home/ec2-user/bblio/aws/') import os os.environ['DJANGO_SETTINGS_MODULE'] = 'Build.settings' import time from search.models import Document import es from io import StringIO, BytesIO from lxml import etree es = es.get_es() _xpath = "body/descendant::*[not(self::script|self::link)]" _index = "legal-index-html" _doc_type = "legal-text-html" def get_tree(text): text = re.sub('<strong>','',text) text = re.sub('</strong>','',text) text = re.sub('\n','',text) text = re.sub('\t','',text) text = re.sub(r'\s+',' ',text) parser = etree.HTMLParser() tree = etree.parse(StringIO(text.encode('utf-8').decode('utf-8')), parser) return tree def get_body_html(text):