def get_data_from_es(endpoint, index, service, num=20, time=2, query=DEFAULT_QUERY):
    """Get data from elasticsearch using index name."""
    es = Elasticsearch(endpoint, timeout=30)
    query["size"] = num
    query["filter"]["range"]["@timestamp"]["gte"] = "now-" + str(time) + "s"
    query["query"]["match"]["service"] = service
    return es.search(index, body=json.dumps(query), request_timeout=500)
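The default query template is not shown in the snippet above; for the in-place mutations to work it must already contain the size, filter.range.@timestamp and query.match.service keys. A minimal sketch of such a template follows, modeled on the ESStorage.retrieve query further down; the exact DEFAULT_QUERY constant is an assumption, not the original definition.

# Hypothetical DEFAULT_QUERY template (assumed, not from the original module).
# It must already contain every key that get_data_from_es rewrites in place.
DEFAULT_QUERY = {
    "query": {
        "match": {
            "service": ""          # overwritten with the `service` argument
        }
    },
    "filter": {
        "range": {
            "@timestamp": {
                "gte": "now-2s",   # rewritten to "now-<time>s"
                "lte": "now"
            }
        }
    },
    "sort": {
        "@timestamp": {
            "order": "desc"
        }
    },
    "size": 20                     # overwritten with the `num` argument
}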
def client():
    global CLIENT
    if CLIENT is None:
        try:
            CLIENT = Elasticsearch(settings.ELASTIC_URI,
                                   request_timeout=settings.ELASTIC_TIMEOUT,
                                   retry_on_timeout=True,
                                   **settings.ELASTIC_KWARGS)
            logging.getLogger('elasticsearch').setLevel(logging.WARN)
            logging.getLogger('elasticsearch.trace').setLevel(logging.WARN)
            logging.getLogger('urllib3').setLevel(logging.WARN)
            logging.getLogger('requests').setLevel(logging.WARN)
            CLIENT.cluster.health(wait_for_status='yellow')
        except ConnectionError:
            message = (
                'The SEARCH_ENGINE setting is set to "elastic", but there '
                'was a problem starting the elasticsearch interface. Is '
                'elasticsearch running?')
            if settings.SENTRY_DSN:
                try:
                    sentry.log_exception()
                    sentry.log_message(message)
                except AssertionError:  # App has not yet been initialized
                    logger.exception(message)
            else:
                logger.error(message)
            exit(1)
    return CLIENT
def __init__(self, company_count: int = 100, cb_connect: str = SQL_CONNECT,
             es_connect: List[Dict] = ES_CONNECT, es_index: str = ES_INDEX):
    self.company_count = company_count
    # prep company selection: select the top-<limit> companies with the most workers
    self.companies_select = CMPS_SELECT.format(limit=self.company_count)
    # connect to mysql crunchbase database
    self.sql_engine = create_engine(cb_connect)
    # connect to es instance
    self.es_client = Elasticsearch(list(es_connect))
    self.es_index = es_index
    self.insertions = 0
    if not self.es_client.ping():
        raise ValueError("ElasticSearch Ping Failed")
def get_es_client(es_config):
    """Get ES client."""
    if es_config['version'] == 2:
        from elasticsearch2 import Elasticsearch
        return Elasticsearch(host='localhost', port=9200)
    else:
        raise Exception('unsupported ES version: {}'.format(
            es_config['version']))
def process_date(current_date):
    """
    Enriches all articles harvested on the given date.

    :param current_date: start of the one-day window to process
    :return: number of documents processed
    """
    total_docs = 0
    start = current_date
    end = current_date + timedelta(days=1)
    query = {
        "query": {
            "constant_score": {
                "filter": {
                    "bool": {
                        "must": [
                            {"range": {
                                "dateHarvested": {
                                    "gte": start.strftime("%Y-%m-%d"),
                                    "lte": end.strftime("%Y-%m-%d")}}}
                        ]
                    }
                }
            }
        }
    }
    batch_size = 2000
    scroller = elastic.scroll(
        Elasticsearch(hosts=[ES_HOST], timeout=120, max_retries=10,
                      retry_on_timeout=True),
        index=ES_INDEX, body=query, scroll='2m', clear_scroll=False,
        size=batch_size)
    docs = elastic.scroll_docs_mapped(scroller, mapper)
    for doc_batch in batch(docs, batch_size):
        doc_batch = list(doc_batch)
        suzi_input = [
            {'title': d['title'], 'snip': d['snip']} for d in doc_batch
        ]
        events = score_articles(suzi_input)
        updates = []
        for doc, doc_events in zip(doc_batch, events):
            for company_events in doc_events['events']:
                exploded = doc.copy()
                exploded['company_id'] = company_events['company_id']
                exploded['sdr_scores'] = company_events['scores']
                updates.append(exploded)
        if len(updates) > 0:
            mongo_collection.insert(updates)
        total_docs += len(doc_batch)
    logging.info("docs in day %i" % total_docs)
    return total_docs
class ESSearch(object):
    es_client: Elasticsearch
    es_index: str

    def __init__(self, connect=ES_CONNECT, index=ES_INDEX):
        self.es_client = Elasticsearch(connect)
        self.es_index = index

    def search_by_name(self, search: str):
        res = self.es_client.search(
            index=self.es_index, doc_type='company',
            body=dict(query=dict(match_phrase=dict(company_name=search))))
        return [company['_source'] for company in res['hits']['hits']]

    def search_by_location(self, search: str):
        res = self.es_client.search(
            index=self.es_index, doc_type='company',
            body=dict(query=dict(match_phrase=dict(location=search))))
        return [company['_source'] for company in res['hits']['hits']]

    def search_by_id(self, company_id: str):
        res = self.es_client.search(
            index=self.es_index, doc_type='company',
            body=dict(query=dict(match=dict(company_id=company_id))))
        if len(res['hits']['hits']):
            return res['hits']['hits'][0]['_source']
        else:
            return {}

    def search_by_text(self, text: str):
        res = self.es_client.search(
            index=self.es_index, doc_type='company',
            body=dict(query=dict(query_string=dict(query=text))))
        return [company['_source'] for company in res['hits']['hits']]
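A minimal usage sketch for the class above, assuming ES_CONNECT and ES_INDEX point at a reachable cluster that already holds company documents under the ES 2.x-style doc_type used here; the company name and query text below are illustrative only.

# Illustrative only: assumes ES_CONNECT / ES_INDEX are configured and the
# index already contains `company` documents as expected by ESSearch.
if __name__ == '__main__':
    searcher = ESSearch()

    # phrase match on the company_name field
    for company in searcher.search_by_name("Acme Corp"):
        print(company.get('company_name'), company.get('location'))

    # free-text search across fields via query_string
    results = searcher.search_by_text("fintech AND london")
    print("%d hits" % len(results))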
import requests
from elasticsearch2 import Elasticsearch
from flask import render_template
from flask import request

from app import app
from app import docSummary

es = Elasticsearch(
    ['https://73efa8624ce5b1aa7b0636a629e2d9f1.us-west-1.aws.found.io:9243/'],
    http_auth=('admin', 'jfnN6ArBrfnlD6accc0WatAy'),
    scheme="https")
solr = 'http://*****:*****@35.230.16.178/solr/wiki/select'


@app.route('/')
@app.route('/index')
def index():
    return render_template('index.html', name='index')


@app.route('/query/es/', methods=['GET'])
def query_es():
    search_word = request.args.get('q')
    term = {
        "query": {
            "filtered": {
                "query": {
                    "query_string": {
                        "query": "(" + search_word + ") AND (NOT(#redirect)) AND (NOT(#REDIRECT)) AND (NOT(.*jpg))",
                        "fields": [
from datetime import datetime
from dateutil.relativedelta import relativedelta
import logging

from elasticsearch import Elasticsearch

# Set up some logging
logger = logging.getLogger('myapp')
hdlr = logging.FileHandler('./logs/RestConnect-{:%Y.%m.%d}.log'.format(
    datetime.now()))
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.DEBUG)

# Set connection
try:
    es = Elasticsearch(hosts='host', http_auth=('login', 'pass'), port=9200,
                       timeout=600)
    logger.info('Connected: %s', es.info())
except Exception as ex:
    logger.error(ex)


def get_index(index, period, delta):
    if str(period).upper() == 'D':
        date = datetime.now() + relativedelta(days=-int(delta))
        return '{}-{:%Y.%m.%d}'.format(index, date)
    else:
        date = datetime.now() + relativedelta(months=-int(delta))
        return '{}-{:%Y.%m}'.format(index, date)
def _connect(self):
    self.es = Elasticsearch(self.config.storage.ES_ENDPOINT, timeout=60,
                            max_retries=2)
class ESStorage(Storage):
    """Elasticsearch storage backend."""

    NAME = "es"
    _MESSAGE_FIELD_NAME = "_source.message"

    def __init__(self, configuration):
        """Initialize Elasticsearch storage backend."""
        super(ESStorage, self).__init__(configuration)
        self.config.storage = ESConfiguration()
        self._connect()

    def _connect(self):
        self.es = Elasticsearch(self.config.storage.ES_ENDPOINT, timeout=60,
                                max_retries=2)

    def _prep_index_name(self, prefix):
        # appends the correct date to the index prefix
        now = datetime.datetime.now()
        date = now.strftime("%Y.%m.%d")
        index = prefix + date
        return index

    def retrieve(self, time_range: int, number_of_entries: int):
        """Retrieve data from ES."""
        index_in = self._prep_index_name(self.config.storage.ES_INPUT_INDEX)

        query = {
            'query': {
                'match': {
                    'service': 'journal'
                }
            },
            "filter": {
                "range": {
                    "@timestamp": {
                        "gte": "now-2s",
                        "lte": "now"
                    }
                }
            },
            'sort': {
                '@timestamp': {
                    'order': 'desc'
                }
            },
            "size": 20
        }

        _LOGGER.info(
            "Reading in max %d log entries in last %d seconds from %s",
            number_of_entries, time_range, self.config.storage.ES_ENDPOINT)

        query['size'] = number_of_entries
        query['filter']['range']['@timestamp']['gte'] = 'now-%ds' % time_range
        query['query']['match']['service'] = self.config.storage.ES_SERVICE

        es_data = self.es.search(index_in, body=json.dumps(query))
        # only use _source sub-dict
        es_data = [x['_source'] for x in es_data['hits']['hits']]
        es_data_normalized = json_normalize(es_data)

        _LOGGER.info("%d logs loaded in from last %d seconds",
                     len(es_data_normalized), time_range)

        self._preprocess(es_data_normalized)
        # bad solution, this is how Entry objects could come in.
        return es_data_normalized, es_data

    def store_results(self, data):
        """Store results back to ES."""
        index_out = self._prep_index_name(self.config.storage.ES_TARGET_INDEX)

        actions = [{
            "_index": index_out,
            "_type": "log",
            "_source": data[i]
        } for i in range(len(data))]

        helpers.bulk(self.es, actions, chunk_size=int(len(data) / 4) + 1)
# coding:utf-8
from elasticsearch2 import Elasticsearch
from datetime import datetime

es = Elasticsearch(hosts="10.10.6.6")
es.index(index="keti10_10", doc_type="keti10_10", id=3,
         body={"bdcdyh": "123", "lx": '1', 'postDate': '2017-12-30 12:11:06',
               'qx': '北京', 'records': 2, 'uuid': '00123dfad',
               'zl': '北京海淀区'})

# doc = es.get(index="keti10_10", doc_type="keti10_10", id=1)['_source']
# print "doc is %s" % doc

res = es.search(index="keti10_10",
                body={"query": {"match_phrase": {"zl": '北京'}}})
for hit in res['hits']['hits']:
    hitmap = hit['_source']
    print "%(zl)s %(postDate)s" % hitmap
class SQLToESImporter(object):
    company_count: int
    companies_select: str
    sql_engine: Engine
    es_client: Elasticsearch
    es_index: str
    insertions: int

    def __init__(self, company_count: int = 100, cb_connect: str = SQL_CONNECT,
                 es_connect: List[Dict] = ES_CONNECT, es_index: str = ES_INDEX):
        self.company_count = company_count
        # prep company selection: select the top-<limit> companies with the most workers
        self.companies_select = CMPS_SELECT.format(limit=self.company_count)
        # connect to mysql crunchbase database
        self.sql_engine = create_engine(cb_connect)
        # connect to es instance
        self.es_client = Elasticsearch(list(es_connect))
        self.es_index = es_index
        self.insertions = 0
        if not self.es_client.ping():
            raise ValueError("ElasticSearch Ping Failed")

    def pull(self) -> Dict:
        companies_result: ResultProxy
        try:
            with self.sql_engine.connect() as conn:
                companies_result = conn.execute(self.companies_select)
                for i, company in enumerate(companies_result):
                    company_events = []
                    events_select = EVENTS_SELECT.format(
                        company_id=company['company_id'])
                    try:
                        events_result = conn.execute(events_select)
                        for event in events_result:
                            try:
                                company_events.append(
                                    dict(event_date=event['event_date'],
                                         event_code=event['event_code'],
                                         event_desc=event['event_desc'],
                                         event_url=event['event_url']))
                            except KeyError:
                                raise
                    except SQLAlchemyError as sq_e:
                        raise
                    try:
                        company_document = dict(
                            company_id=company['company_id'],
                            company_name=company['company_name'],
                            homepage_url=company['homepage_url'],
                            logo_url=company['logo_url'],
                            founded_date=company['founded_date'],
                            country=company['country'],
                            industry=company['industry'],
                            location=company['location'],
                            worker_count=company['worker_count'],
                            events=company_events,
                        )
                    except KeyError:
                        raise
                    yield company_document
        except SQLAlchemyError as sq_e:
            raise

    def push(self, company_document: Dict) -> bool:
        es_result = self.es_client.index(index=self.es_index,
                                         doc_type='company',
                                         id=self.insertions,
                                         body=company_document)
        if es_result['created']:
            self.insertions += 1
            return True
        else:
            return False

    def delete_index(self):
        self.es_client.indices.delete(index=self.es_index, ignore=(400, 404))

    def reimport(self) -> int:
        self.insertions = 0
        self.delete_index()
        for company_document in self.pull():
            self.push(company_document)
        return self.insertions
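A minimal driver sketch for the importer above, assuming SQL_CONNECT, ES_CONNECT and ES_INDEX are configured as in the class defaults; the company count used here is illustrative only.

# Illustrative only: drops and rebuilds ES_INDEX from the SQL source, assuming
# the connection constants used by SQLToESImporter are configured.
if __name__ == '__main__':
    importer = SQLToESImporter(company_count=50)
    inserted = importer.reimport()
    print('Imported %d company documents into %s' % (inserted, importer.es_index))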
from django.shortcuts import render, render_to_response
from django.http import HttpResponse, HttpResponseRedirect
import requests
import json
import os

from utils import utils
from utils import dbinfo
from database.base import MongoDatabase
from elasticsearch2 import Elasticsearch
from elasticsearch2 import helpers

# ICD_SOURCE = "icd_source.txt"
es = Elasticsearch()
ES_SERVERS = [{'host': 'localhost', 'port': 9200}]
es_client = Elasticsearch(hosts=ES_SERVERS)
mongod = MongoDatabase()

icds = {}  # icds: {zd_lc: [], zd_gb: [], ...}


def icd_page(request):
    return render_to_response("match_icd.html", "")


def icd_code_page(request):
    return render_to_response("match_icd_with_code.html", "")
"category": fields["category"], "sutra_body": fields["sutra_body"], } } ACTIONS.append(action) # batch proc success, _ = bulk(es, ACTIONS, index=index_name, raise_on_error=True) print('Performed %d actions' % success) #read command line args def read_args(): parser = argparse.ArgumentParser(description="Search Elastic Engine") parser.add_argument("-i", dest="input_file", action="store", help="input file", required=True) #parser.add_argument("-o", dest="output_file", action="store", help="output file", required=True) return parser.parse_args() if __name__ == '__main__': args = read_args() es = Elasticsearch(hosts=[settings.host + ':' + settings.port], timeout=5000) print(json.dumps(mapping)) set_mapping(es) # set_data(es, args.input_file)
def main():
    """Main function."""
    # elasticsearch config
    es_cfg = {
        'input_index': 'kb-clean',
        'dest_index': 'kb-clean-geo',
        'host': 'localhost',
        'port': 9200,
        'timeout': 1000,
        'size': 1000,
        'scroll': '2m',
        'doc_type': 'kb_clean',
        'query_locations': {
            "query": {
                "terms": {
                    "types": [
                        "Location", "Facility", "GeopoliticalEntity",
                        "Physical.OrganizationLocationOrigin"
                    ]
                }
            }
        },
        'query': {
            "query": {
                "match_all": {}
            }
        },
        'loc_types': [
            "Location", "Facility", "GeopoliticalEntity",
            "Physical.OrganizationLocationOrigin"
        ],
        'body': '''{"settings":{"index":{"number_of_shards":3,"number_of_replicas":0}},"mappings":{"kb_clean":{"properties":{"categories":{"type":"string","index":"not_analyzed"},"docIds":{"type":"string","index":"not_analyzed"},"edgeLabel":{"type":"string","index":"not_analyzed"},"edgeTarget":{"type":"string","index":"not_analyzed"},"hypotheses":{"type":"string","index":"not_analyzed"},"kbid":{"type":"string","index":"not_analyzed"},"name":{"type":"string","index":"not_analyzed"},"types":{"type":"string","index":"not_analyzed"},"x":{"type":"long"},"y":{"type":"long"},"geoLocation":{"properties":{"geohash":{"type":"string"},"lon":{"type":"double"},"lat":{"type":"double"}}}}}}}'''
    }

    # geonames config
    gn_cfg = {
        'user': '******',
        'url': 'http://api.geonames.org/',
        'endpoint': 'searchJSON'
    }

    # establish elasticsearch connection
    es = Elasticsearch([{
        'host': es_cfg['host'],
        'port': es_cfg['port']
    }], timeout=es_cfg['timeout'])

    # create new destination index
    create_dest_index(es, es_cfg['dest_index'], es_cfg['body'])

    # execute process
    process_input_index(es, es_cfg, gn_cfg)
from __future__ import unicode_literals
from collections import OrderedDict
import re
from fuzzywuzzy import fuzz
from copy import deepcopy
import codecs
import sys
import requests
import json

import utils
from build_icd import build_icd_norm, build_icd_type_norm, build_icd_code_dict
from elasticsearch2 import Elasticsearch

es = Elasticsearch()

reload(sys)
sys.setdefaultencoding('utf-8')

MATCH_COUNT = 10
ACCURACY = 55

''' Preprocessing '''


def get_config(type):
    source_dic = {}
    for line in open("config.txt").readlines():
        t, k, v = line.strip().split(" ")
def __init__(self, connect=ES_CONNECT, index=ES_INDEX):
    self.es_client = Elasticsearch(connect)
    self.es_index = index
"latitude": 31.231706, "longitude": 121.472644 }, "rule_id": "EPM7J8KR6723", "src_port": 60417, "event_content": "", "response": "/accept", "dst_port": 80, "event_level": 0 } } if __name__ == "__main__": logging.info("==================== Start ====================") dst_es = Elasticsearch(hosts=config["dst_es"], sniff_on_start=True, sniff_on_connection_fail=True, timeout=120) bat = [] _id = 1 while True: item = copy.deepcopy(data) item["_id"] = _id bat.append(item) if (_id % 1000) == 0: helpers.bulk(client=dst_es, actions=bat, chunk_size=1000, max_chunk_bytes=209715200) bat = [] if (_id % 30000) == 0:
                 + str(close_to))
    for i in es_host.cat.indices().split('\n'):
        index = i.split()
        if len(i) > 4 and index[1] == "open":
            for j in close_to:
                if (j[0] in index[2]) and index[2] <= j[1]:
                    indices.append(index[2])
    indices.sort()
    return indices


if __name__ == '__main__':
    logging.info("==================== Start ====================")
    es = Elasticsearch(hosts=config["es_cluster"], sniff_on_start=True,
                       sniff_on_connection_fail=True, timeout=120)
    prefix = config["indices_prefix"]
    retain = config["retention_time"]
    dryrun = config["dry-run"]

    logging.info("let's do the job, carry small and live large...")
    logging.info(
        "IMPORTANT: the indices should follow the naming pattern xxxx_yyyyMMdd"
    )

    indices_to_close = get_indices_to_be_closed(es_host=es,
                                                prefix_list=prefix,
                                                retention_days=retain)
    if dryrun:
        logging.info("<Dry-run> The following indices will be closed: " +
                     str(indices_to_close))