Example #1
def index_clusters():
    """ Index clusters in elasticsearch
    """
    es = get_es()
    es.indices.put_mapping(
        index=__index_name__, doc_type=__c_doc_type__, body={
            __c_doc_type__: {
                'properties': {
                    'keywords': {
                        "type": "string",
                        "position_increment_gap": 100
                    },
                    'items': {
                        "type": "string",
                        "position_increment_gap": 100
                    }
                }
            }
        })
    clusters = LoadClusters()
    for query in clusters.get_queries()['queries']:
        results = clusters.get_clusters(query)['children']
        for result in results:
            listings = list()
            for child in result['children']:
                listings.append(child['name'])
            doc = {
                'name': '{query}-{cluster}'.format(
                    query=query, cluster=result['name']),
                'keywords': result['keywords'],
                'items': listings
            }
            es.index(index=__index_name__, doc_type=__c_doc_type__, body=doc)
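All of these examples call a project-local `get_es()` helper from an `es` module that is not shown on this page. A minimal sketch of what such a helper might look like, assuming the official elasticsearch-py client (the variant in Example #2, which is called with a config name and returns an `(es, index, doc_type)` tuple, follows a different convention and is not covered here):

# hypothetical sketch only: the real helper and its connection settings
# are not part of these examples
from elasticsearch import Elasticsearch

def get_es():
    return Elasticsearch([{'host': 'localhost', 'port': 9200}])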
Example #2
def get_news(day):
    """ Get the news published on a single day. """
    from es import get_es
    try:
        es, index, doc_type = get_es('es_news_pub')
        body = {
            "query": {
                "term": {
                    "datetime": day
                }
            }
        }
        res = es.search(index=index, doc_type=doc_type, body=body)

        # technology news section
        technology_news = []

        for r in res['hits']['hits']:
            tmp = r['_source']
            print(tmp)
            if tmp.get("category") == "technology_news":
                technology_news.append(tmp)

        package = {
            "technology_news": technology_news
        }

        # jsonify (Flask) and traceback are assumed to be imported at module level
        return jsonify(
            success=True,
            data=package
        )
    except Exception:
        traceback.print_exc()


def main():
    res = parse()
    es, index, doc_type = get_es('es_news_kr36')
    # use separate names for the pub cluster so the kr36 index/doc_type
    # are not overwritten before they are used below
    es_pub, index_pub, doc_type_pub = get_es('es_news_pub')
    for item in res:
        try:
            es.index(index=index,
                     doc_type=doc_type,
                     body=item,
                     id=item['title'])
            # also store the item in the pub cluster
            es_pub.index(index=index_pub,
                         doc_type=doc_type_pub,
                         body=item,
                         id=item['title'])
        except Exception:
            print(item['title'])
Example #4
def facet():
    es = get_es()
    q = {
        "query": {
            "match_all": {}
        },
        "facets": {
            "text": {
                "terms": {
                    "field": "text",
                    "size": 100
                }
            }
        }
    }

    res = es.search(index='legal-index', body=q)['facets']
    pp.pprint(res)
    return res
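Facets were removed in Elasticsearch 2.0 in favour of aggregations, so the query above only works against a pre-2.0 cluster. A rough equivalent on a newer cluster, assuming the same 'legal-index' index and 'text' field:

q = {
    "query": {"match_all": {}},
    "aggs": {
        "text": {
            "terms": {"field": "text", "size": 100}
        }
    }
}
res = es.search(index='legal-index', body=q)['aggregations']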
Example #5
    def __init__(self, file_name):
        self.es = get_es()
        self.file_name = file_name
Example #6
#!/usr/bin/env python
import re
import pprint
from es import get_es
import sys
from search.models import Phrase
import json
pp = pprint.PrettyPrinter(indent=4)
es = get_es()


def get_autocomplete(term):
    s = {
        "text-suggest": {
            "text": term,
            "completion": {
                "field": "suggest"
            }
        }
    }
    res = es.suggest(index='autocomplete', body=s)
    result = []
    # repeat each suggestion text as id, label and value for the UI autocomplete
    for r in res['text-suggest'][0]['options']:
        r_dict = {
            "id": r['text'].encode('utf-8'),
            "label": r['text'].encode('utf-8'),
            "value": r['text'].encode('utf-8')
        }
        result.append(r_dict)
    pp.pprint(result)
    return json.dumps(result)
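The standalone suggest API used above was deprecated in Elasticsearch 5.0 and later removed; on newer clusters the same completion suggestion goes through _search. A sketch, assuming the same 'autocomplete' index and 'suggest' field:

body = {
    "suggest": {
        "text-suggest": {
            "prefix": term,
            "completion": {"field": "suggest"}
        }
    }
}
res = es.search(index='autocomplete', body=body)
options = res['suggest']['text-suggest'][0]['options']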
Example #7
def search(search_term, result, start_result=0):
    # `result` is the number of hits to return; `start_result` is the offset
    es = get_es()
    unquoted = dequoter(search_term).split('""')
    auto_phrases = []
    for u in unquoted:
        auto_phrases.extend(phraser(u.rstrip()))

    query_search_term = search_term
    if len(auto_phrases) > 0:
        query_search_term = search_term + ' ' + ' '.join(auto_phrases)
        pp.pprint(query_search_term)
    
    # phrase identifier
    q = {
        "fields": ["title", "urlAddress", "text"],
        "from": start_result,
        "size": result,
        "min_score": 0.1,
        "query": {
            "query_string": {
                "query": query_search_term
            }
        },
        "highlight": {
            "pre_tags": ["<b>"],
            "post_tags": ["</b>"],
            "fields": {
                "text": {
                    "fragment_size": 150,
                    "number_of_fragments": 4,
                    "no_match_size": 150,
                    "highlight_query": {
                        "query_string": {
                            "query": search_term.replace('"', '')
                        }
                    }
                }
            }
        },
        "suggest": {
            "text": search_term,
            "simple_phrase": {
                "phrase": {
                    "field": "text",
                    "size": 1,
                    "real_word_error_likelihood": 0.95,
                    "max_errors": 0.5,
                    "gram_size": 2,
                    "direct_generator": [{
                        "field": "text",
                        "suggest_mode": "popular",
                        "min_word_len": 1,
                        "min_doc_freq": 0.01,
                        "max_term_freq": 0.01
                    }]
                }
            }
        }
    }

    res = es.search(index="legal-index", body=q)
    result = {}
    try:
        suggested = res['suggest']['simple_phrase'][0]['options'][0]['text'].encode('utf-8')
        result.update({'suggestion': suggested})
        result.update({'suggestion_urlencode': urllib.quote(suggested)})
        pp.pprint(urllib.quote(suggested))
    except Exception:
        # no spelling suggestion was returned for this query
        pass

    r = res['hits']['hits']
    l = []
    for hit in r:
        d = {"urlAddress": hit['fields']['urlAddress'],
             # strip python-2 unicode-repr artifacts out of the stored title
             "title": str(hit['fields']['title'][3:-2].encode('utf-8')
                          .replace("\\n", '').replace("u'", "").replace("' ", "")
                          .replace("\\r", "").replace("',", "").replace("\\t", "")
                          .replace('u"', '').replace('\u2019', '')),
             "id": hit['_id'],
             "score": hit['_score']}
        if d['title'] == '':
            d['title'] = d['urlAddress']
        try:
            h_list = []
            for h in hit['highlight']['text']:
                # hacked-up fix to remove rubbish characters left in the highlights
                # by Python 2 unicode handling and repeated encoding/decoding
                h = h.encode('utf-8', 'ignore').replace("\\n", '').replace("u'", "").replace("' ", "")\
                        .replace("\\r", "").replace("',", "").replace("\\t", "").replace('u"', '').replace('\u2019', '')
                if h[:2] in ['. ', ') ', ', ', "' ", ")."]:
                    h = h[2:].strip()
                h_list.append(h)

            d.update({"highlight": h_list})
        except Exception:
            pass
        l.append(d)

    result_count = res['hits']['total']
    if not result_count:
        result_count = 0
        pp.pprint('result count zero')

    result.update({'result_list' : l})
    result.update({'result_count' : result_count})

    return result
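A typical call, assuming the 'legal-index' index above is already populated ('breach of contract' is just an arbitrary query):

hits = search('breach of contract', 10)
print(hits['result_count'])
for item in hits['result_list']:
    print(item['title'])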
Example #8
    def __init__(self, seed_title, quality=False):
        self.seed = seed_title
        self.quality = quality
        self.es = get_es()
Example #9
#!/usr/bin/env python

import re
import pprint
import sys
sys.path.append('/home/ec2-user/bblio/build/')
sys.path.append('/home/ec2-user/bblio/aws/')
import os
os.environ['DJANGO_SETTINGS_MODULE'] = 'Build.settings'
import time
from search.models import Document
from es import get_es
from io import StringIO, BytesIO
from lxml import etree

es = get_es()
_xpath = "body/descendant::*[not(self::script|self::link)]"
_index = "legal-index-html"
_doc_type = "legal-text-html"

def get_tree(text):
    # drop <strong> tags and collapse whitespace before parsing the HTML
    text = re.sub('<strong>', '', text)
    text = re.sub('</strong>', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\t', '', text)
    text = re.sub(r'\s+', ' ', text)
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(text.encode('utf-8').decode('utf-8')), parser)
    return tree

def get_body_html(text):