class SQLToESImporter(object):
    """Migrate the top-N companies (ranked by worker count) and their
    events from a MySQL Crunchbase database into an Elasticsearch index.

    Typical use: ``SQLToESImporter().reimport()``.
    """

    company_count: int        # how many top companies to migrate
    companies_select: str     # pre-formatted SQL for the company listing
    sql_engine: Engine        # SQLAlchemy engine for the MySQL source
    es_client: Elasticsearch  # destination Elasticsearch client
    es_index: str             # destination index name
    insertions: int           # documents successfully indexed so far (used as doc id)

    def __init__(self,
                 company_count: int = 100,
                 cb_connect: str = SQL_CONNECT,
                 es_connect: List[Dict] = ES_CONNECT,
                 es_index: str = ES_INDEX):
        """Connect to both data stores and fail fast if ES is unreachable.

        :param company_count: number of top companies to select
        :param cb_connect: SQLAlchemy connection string for the MySQL source
        :param es_connect: host dicts handed to the Elasticsearch client
        :param es_index: name of the target Elasticsearch index
        :raises ValueError: if the Elasticsearch cluster does not answer a ping
        """
        self.company_count = company_count
        # Select the top-<limit> companies with the most workers.
        # NOTE(review): SQL is built via str.format(); acceptable only because
        # `limit` is an int supplied by our own caller, not untrusted input.
        self.companies_select = CMPS_SELECT.format(limit=self.company_count)
        # Connect to the MySQL Crunchbase database.
        self.sql_engine = create_engine(cb_connect)
        # Connect to the ES instance.
        self.es_client = Elasticsearch(list(es_connect))
        self.es_index = es_index
        self.insertions = 0
        if not self.es_client.ping():
            raise ValueError("ElasticSearch Ping Failed")

    # Annotated as a string to avoid importing Iterator in this chunk;
    # pull() is a generator, so '-> Dict' (the original annotation) was wrong.
    def pull(self) -> 'Iterator[Dict]':
        """Yield one document per company, each embedding its event list.

        Any SQLAlchemyError from the queries, or KeyError from a row missing
        an expected column, propagates unchanged to the caller (the original
        code's try/except blocks only re-raised, so they were removed).
        """
        with self.sql_engine.connect() as conn:
            for company in conn.execute(self.companies_select):
                # NOTE(review): company_id comes from our own database, but
                # bound parameters would still be safer than str.format().
                events_select = EVENTS_SELECT.format(
                    company_id=company['company_id'])
                company_events = [
                    dict(event_date=event['event_date'],
                         event_code=event['event_code'],
                         event_desc=event['event_desc'],
                         event_url=event['event_url'])
                    for event in conn.execute(events_select)
                ]
                yield dict(
                    company_id=company['company_id'],
                    company_name=company['company_name'],
                    homepage_url=company['homepage_url'],
                    logo_url=company['logo_url'],
                    founded_date=company['founded_date'],
                    country=company['country'],
                    industry=company['industry'],
                    location=company['location'],
                    worker_count=company['worker_count'],
                    events=company_events,
                )

    def push(self, company_document: Dict) -> bool:
        """Index one document, using the running insertion counter as its id.

        :returns: True (and bumps the counter) when ES reports the document
                  as newly created; False otherwise (e.g. an overwrite).
        """
        es_result = self.es_client.index(index=self.es_index,
                                         doc_type='company',
                                         id=self.insertions,
                                         body=company_document)
        # NOTE(review): the boolean 'created' flag is an ES 2.x response
        # field; ES 5+ returns result='created' instead — revisit on upgrade.
        if not es_result['created']:
            return False
        self.insertions += 1
        return True

    def delete_index(self):
        """Drop the target index; missing/invalid index errors are ignored."""
        self.es_client.indices.delete(index=self.es_index, ignore=(400, 404))

    def reimport(self) -> int:
        """Wipe the index and re-run the full import.

        :returns: the number of documents successfully indexed.
        """
        self.insertions = 0
        self.delete_index()
        for company_document in self.pull():
            self.push(company_document)
        return self.insertions
# --- Example 2 (separate scraped snippet) ---
#coding:utf-8
from elasticsearch2 import Elasticsearch
from datetime import datetime

# Connect to a single-node Elasticsearch 2.x cluster.
es = Elasticsearch(hosts="10.10.6.6")

# Index one sample record (Chinese real-estate style document).
es.index(index="keti10_10", doc_type="keti10_10", id=3,
         body={"bdcdyh": "123",
               "lx": '1',
               'postDate': '2017-12-30 12:11:06',
               'qx': '北京',
               'records': 2,
               'uuid': '00123dfad',
               'zl': '北京海淀区'})

# Phrase search on the 'zl' (address) field.
res = es.search(index="keti10_10",
                body={"query": {
                    "match_phrase": {
                        "zl": '北京'
                    }
                }})

# Print the address and post date of every hit.
# BUGFIX: the original used Python 2 print statements, which are a syntax
# error under Python 3 (required by the annotated class above in this file).
for hit in res['hits']['hits']:
    hitmap = hit['_source']
    print("%(zl)s %(postDate)s" % hitmap)