def index_documents(self, name, documents, bulk=True):
    # Route boxinfo documents to the boxinfo alias, everything else to the runindex alias.
    attempts = 0
    is_box = False
    if name.startswith("boxinfo"):
        destination_index = self.boxinfo_write
        is_box = True
    else:
        destination_index = self.runindex_write
    while True:
        attempts += 1
        try:
            if bulk:
                self.es.bulk_index(destination_index, name, documents)
            else:
                self.es.index(destination_index, name, documents[0])
            return True
        except ElasticHttpError as ex:
            # retry once on an HTTP error, then skip this document
            if attempts <= 1:
                continue
            self.logger.error('elasticsearch HTTP error. skipping document ' + name)
            if is_box:
                break
            #self.logger.exception(ex)
            return False
        except (socket.gaierror, ConnectionError, Timeout) as ex:
            if attempts > 100 and self.runMode:
                raise ex
            self.logger.error('elasticsearch connection error. retry.')
            if self.stopping:
                return False
            time.sleep(0.1)
            # re-resolve the server URL (DNS load balancing) and reconnect
            ip_url = getURLwithIP(self.es_server_url, self.nsslock)
            self.es = ElasticSearch(ip_url, timeout=20, revival_delay=60)
            if is_box:
                break
    return False
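# Minimal standalone sketch of the bulk-vs-single indexing calls used above.
# ES_URL, 'runindex_test' and 'prc-out' are placeholder names picked for this
# example, not values taken from the surrounding class.
from pyelasticsearch.client import ElasticSearch

ES_URL = 'http://localhost:9200'
es = ElasticSearch(ES_URL, timeout=20)
docs = [{'ls': 1, 'in': 100, 'out': 98}, {'ls': 2, 'in': 120, 'out': 119}]
es.bulk_index('runindex_test', 'prc-out', docs)   # many documents in one request
es.index('runindex_test', 'prc-out', docs[0])     # a single document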
def __init__(self, group_name, topic_name, timeout=60, filename='config.txt'):
    self.logger = logging.getLogger(__name__)
    self.logger.setLevel(logging.INFO)
    handler = logging.FileHandler('../_logs/%s.log' % group_name)
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s %(message)s')
    handler.setFormatter(formatter)
    self.logger.addHandler(handler)
    try:
        f = open(filename, 'r')
        self.hbasehost = f.readline().split(' ')[0]
        self.eshost = f.readline().split(' ')[0]
        self.kafkahost = f.readline().split(' ')[0]
        self.hdfshost = f.readline().split(' ')[0]
        self.logger.info('All Hosts Loaded')
    except Exception as e:
        self.logger.warning('file load error, %s' % filename)
        self.logger.warning(str(e))
        raise  # sys.exit(0)
    self.group_name = group_name
    self.topic_name = topic_name
    self.timeout = timeout
    try:
        self.kafka = KafkaClient(self.kafkahost)
        self.pool = happybase.ConnectionPool(size=6, host=self.hbasehost)
        self.es = ElasticSearch(self.eshost)
    except Exception as e:
        self.logger.warning(str(e))
        raise
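# Hedged sketch of the config.txt layout the constructor above expects: one
# host per line in the order HBase, Elasticsearch, Kafka, HDFS, and only the
# first whitespace-separated token of each line is used. The hostnames here
# are made-up examples.
example_config = (
    "hbase-master.example.com\n"
    "http://es.example.com:9200\n"
    "kafka.example.com:9092\n"
    "hdfs-namenode.example.com\n"
)
with open('config.txt', 'w') as f:
    f.write(example_config)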
def updateIndexMaybe(self, index_name, alias_write, alias_read, settings, mapping):
    connectionAttempts = 0
    retry = False
    while True:
        if self.stopping:
            break
        connectionAttempts += 1
        try:
            if retry or self.ip_url is None:
                self.ip_url = getURLwithIP(self.es_server_url, self.nsslock)
                self.es = ElasticSearch(self.ip_url, timeout=20, revival_delay=60)
            #check if runindex alias exists
            if requests.get(self.es_server_url + '/_alias/' + alias_write).status_code == 200:
                self.logger.info('writing to elastic index ' + alias_write + ' on ' +
                                 self.es_server_url + ' - ' + self.ip_url)
                self.createDocMappingsMaybe(alias_write, mapping)
                break
            else:
                time.sleep(.5)
                if (connectionAttempts % 10) == 0:
                    self.logger.error('unable to access elasticsearch alias ' + alias_write +
                                      ' on ' + self.es_server_url + ' / ' + self.ip_url)
                continue
        except ElasticHttpError as ex:
            #es error, retry
            self.logger.error(ex)
            if self.runMode and connectionAttempts > 100:
                self.logger.error('elastic (BU): exiting after 100 ElasticHttpError reports from ' + self.es_server_url)
                sys.exit(1)
            elif not self.runMode and connectionAttempts > 10:
                self.threadEvent.wait(60)
            else:
                self.threadEvent.wait(1)
            retry = True
            continue
        except (socket.gaierror, ConnectionError, Timeout) as ex:
            #try to reconnect with different IP from DNS load balancing
            if self.runMode and connectionAttempts > 100:
                self.logger.error('elastic (BU): exiting after 100 connection attempts to ' + self.es_server_url)
                sys.exit(1)
            elif not self.runMode and connectionAttempts > 10:
                self.threadEvent.wait(60)
            else:
                self.threadEvent.wait(1)
            retry = True
            continue
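# Small standalone sketch of the alias probe used above: Elasticsearch answers
# GET /_alias/<name> with HTTP 200 when the alias exists and 404 when it does
# not. The server URL and alias name below are placeholders.
import requests

def alias_exists(server_url, alias):
    return requests.get(server_url + '/_alias/' + alias).status_code == 200

if alias_exists('http://localhost:9200', 'runindex_write'):
    pass  # safe to write through the alias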
def __init__(self, index_name, settings_path, host="http://127.0.0.1:9200"):
    self.connection = ElasticSearch(host)
    self.index_name = index_name
    self.settings_path = settings_path
    self.create_index()
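# The body of create_index() is not shown here; a plausible sketch, assuming
# settings_path points at a JSON file holding the index settings/mappings and
# using pyelasticsearch's create_index call:
import json
from pyelasticsearch import ElasticHttpError

def create_index(connection, index_name, settings_path):
    with open(settings_path) as f:
        settings = json.load(f)
    try:
        connection.create_index(index_name, settings=settings)
    except ElasticHttpError:
        pass  # most likely the index already exists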
def __init__(self, es_server_url):
    self.server = ElasticSearch(es_server_url)
    self.datadict = {
        'prc-out': {
            "lookup": Query('prc-out', 'source'),
            "action": {
                'definition': Aggregator('drop'),
                'data': Aggregator({
                    'in': Aggregator('add'),
                    'out': Aggregator('add'),
                    'file': Aggregator('cat')
                }),
                'ls': Aggregator('check'),
                'stream': Aggregator('check'),
                'source': Aggregator('match')
            }
        },
        'prc-in': {
            "lookup": Query('prc-in', 'dest'),
            "action": {
                'definition': Aggregator('drop'),
                'data': Aggregator({
                    'out': Aggregator('add'),
                }),
                'ls': Aggregator('check'),
                'index': Aggregator('cat'),
                'source': Aggregator('check'),
                'dest': Aggregator('check'),
                'process': Aggregator('cat')
            }
        },
        'prc-s-state': {
            "lookup": Query('prc-s-state'),
            "action": {
                'macro': Aggregator('histoadd'),
                'mini': Aggregator('histoadd'),
                'micro': Aggregator('histoadd'),
                'tp': Aggregator('add'),
                'lead': Aggregator('avg'),
                'nfiles': Aggregator('add'),
                'ls': Aggregator('check'),
                'process': Aggregator('cat')
            }
        }
    }
def __init__(self, es_server_url, runstring, indexSuffix, monBufferSize, fastUpdateModulo):
    self.logger = logging.getLogger(self.__class__.__name__)
    self.istateBuffer = []
    self.prcinBuffer = {}
    self.prcoutBuffer = {}
    self.fuoutBuffer = {}
    self.es = ElasticSearch(es_server_url, timeout=20)
    self.hostname = os.uname()[1]
    self.hostip = socket.gethostbyname_ex(self.hostname)[2][0]
    #self.number_of_data_nodes = self.es.health()['number_of_data_nodes']
    self.settings = {"index.routing.allocation.require._ip": self.hostip}
    self.indexCreated = False
    self.indexFailures = 0
    self.monBufferSize = monBufferSize
    self.fastUpdateModulo = fastUpdateModulo
    aliasName = runstring + "_" + indexSuffix
    self.indexName = aliasName  # + "_" + self.hostname
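# Hedged sketch of how the per-host routing setting prepared above can be
# applied: creating an index whose shards are required to live on the node
# with the given IP. The index name and IP are placeholders; the dict is
# passed as the create-index request body via pyelasticsearch's create_index.
from pyelasticsearch.client import ElasticSearch

es = ElasticSearch('http://localhost:9200', timeout=20)
hostip = '10.0.0.1'
es.create_index('run100_hltdmon', settings={
    'settings': {'index.routing.allocation.require._ip': hostip}
})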
def main():
    if len(sys.argv) > 3:
        print "Invalid argument number"
        sys.exit(1)
    if len(sys.argv) < 2:
        print "Please provide an elasticsearch server url (e.g. http://localhost:9200)"
        sys.exit(1)
    deleteOld = False
    if len(sys.argv) > 2:
        if "replace" in sys.argv[2]:
            deleteOld = True

    es_server_url = sys.argv[1]
    ip_url = getURLwithIP(es_server_url)
    es = ElasticSearch(es_server_url)

    #get_template
    #es.send_request('GET', ['_template', name], query_params=query_params)

    #list templates
    res = es.cluster_state(filter_routing_table=True, filter_nodes=True, filter_blocks=True)
    templateList = res['metadata']['templates']

    for template_name in TEMPLATES:
        if template_name not in templateList:
            print "{0} template not present. It will be created.".format(template_name)
            create_template(es, template_name)
        else:
            if not deleteOld:
                print "{0} already exists. Add 'replace' parameter to force update.".format(template_name)
            else:
                print "{0} already exists.".format(template_name)
                delete_template(es, template_name)
                print "Deleted old template and will recreate {0}".format(template_name)
                create_template(es, template_name)
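# create_template()/delete_template() are defined elsewhere in this script; a
# plausible sketch of them, assuming a TEMPLATE_BODIES dict mapping a template
# name to its body (index pattern, settings, mappings) and using the generic
# send_request call shown in the comment above:
TEMPLATE_BODIES = {
    'runindex': {'template': 'runindex_*', 'settings': {'number_of_shards': 1}},
}

def create_template(es, name):
    es.send_request('PUT', ['_template', name], body=TEMPLATE_BODIES[name])

def delete_template(es, name):
    es.send_request('DELETE', ['_template', name])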
def write_es_geo(self, es_host='http://localhost:9200/', index_name="geos", doc_type='user_geos'):
    # try to connect with ES and delete the index
    es = ElasticSearch(es_host)
    ## uncomment the following code to prompt check
    # print "Will delete all the doc in the [index:type] from ElasticSearch:"
    # print index_name, ":", doc_type
    # confirm = raw_input("Sure?(y/n)")
    # if confirm != "y":
    #     sys.exit(0)
    try:
        create_es()
    except Exception as e:
        print "Error", e
    else:
        print index_name, ":", doc_type, " deleted!"

    # initializing the documents
    documents = []
    for record in self.userGeos:
        doc = {
            'uid': int(record[0]),
            'location': {
                'lat': record[1],
                'lon': record[2]
            }
        }
        documents.append(doc)

    print "Bulk indexing", len(documents), "documents.."
    es.bulk_index(index_name, doc_type, documents, id_field='uid')
    es.refresh(index_name)

    # test usage
    print "results from ES,"
    query = {"from": 0, "size": 2000, 'query': {"match_all": {}}}
    res = es.search(query, index=index_name)
    print len(res['hits']['hits']), "documents found"
    print "sample result"
    print res['hits']['hits'][0]
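# For the location field above to answer geo queries, the doc_type needs a
# geo_point mapping before the bulk index. A hedged sketch; the index and type
# names match the defaults above, but the mapping body is an assumption about
# what create_es() sets up, not taken from the source.
from pyelasticsearch.client import ElasticSearch

es = ElasticSearch('http://localhost:9200/')
es.create_index('geos')
es.put_mapping('geos', 'user_geos', {
    'user_geos': {
        'properties': {
            'uid': {'type': 'long'},
            'location': {'type': 'geo_point'}
        }
    }
})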
#!/bin/env python
import os, sys, time, datetime
import threading
from pyelasticsearch.client import ElasticSearch
import json
from ordereddict import OrderedDict

#es_hosts=['http://fuval-c2a11-02:9200','http://fuval-c2a11-03:9200','http://fuval-c2a11-15:9200']
#es_tribe_hosts=['http://fuval-c2a11-28:9200']
es_hosts = ['http://dvcu-ccsl6-01:9200']
es_tribe_hosts = ['http://dvtu-ccsl6-01:9200']

main_es = ElasticSearch(es_hosts[0])
tribe_es = ElasticSearch(es_tribe_hosts[0])
main_index = 'runindex'
setup = 'daq2val'


class query_maker(threading.Thread):
    def __init__(self, run):
        threading.Thread.__init__(self)
        self.running = True
        self.hostname = os.uname()[1]
        self.ip = {}
        self.runno = run
        self.known_streams = {}
        app_query = {
            "query": {
                "top_children": {
                    "score": "sum",
#!/usr/bin/python
from flask import Flask, request, json
import flask
import happybase
from pyelasticsearch.client import ElasticSearch
import hashlib

hbasehost = 'c0tl.com'

from struct import *

app = Flask(__name__)
es = ElasticSearch('http://*****:*****')  # elasticsearch URL, credentials redacted in the source


@app.route('/')
def home():
    return """<html>
    <h2>Welcome to the colt API!</h2>
</html>"""
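# Hedged sketch of a further route in the same style, showing how the es
# client above could back an endpoint; the route path and the index/doc_type
# names ('photo_geos', 'photos') are placeholders for illustration only.
@app.route('/photos/<int:pid>')
def get_photo(pid):
    query = {'query': {'term': {'pid': pid}}}
    res = es.search(query, index='photo_geos', doc_type='photos')
    return flask.jsonify(res['hits'])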
from pyelasticsearch.client import ElasticSearch
import pyelasticsearch
import sys
import os

SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.join(SCRIPT_DIR, '..'))
from config import settings

connection = ElasticSearch(settings.ES_HOST)


class QueryBuilder:
    def __init__(self, query, size):
        self.size = size
        self.should_query = []
        self.searchQuery = query
        self.edgeword, self.keyword = self.processText()

    def processText(self):
        text_split = self.searchQuery.split(" ")
        if len(text_split) == 1:
            return self.searchQuery, None
        edgeword = text_split.pop()
        keyword = " ".join(text_split)
        return edgeword, keyword
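# Usage sketch of the edgeword/keyword split above: the last token of the
# search text is treated as the (possibly partial) word being typed, the rest
# as the completed keyword phrase.
qb = QueryBuilder("new york pizz", size=10)
assert qb.edgeword == "pizz"
assert qb.keyword == "new york"
qb_single = QueryBuilder("pizza", size=10)
assert qb_single.edgeword == "pizza" and qb_single.keyword is None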
    command = sys.argv[1]
    server_url = sys.argv[2]
    index_name = sys.argv[3]
else:
    print "Parameters: command[create,alias,mapping] server url, index.alias name (target index)"
    print " COMMANDS:"
    print "   create:  create index"
    print "   alias:   create index *_read and *_write aliases (optional parameter: target index)"
    print "   mapping: create missing document mappings for the index"
    sys.exit(1)

if not server_url.startswith('http://'):
    server_url = 'http://' + server_url

#connection
es = ElasticSearch(server_url)

#pick mapping
if index_name.startswith('runindex'):
    my_settings = mappings.central_es_settings
    my_mapping = mappings.central_runindex_mapping
if index_name.startswith('boxinfo'):
    my_settings = mappings.central_es_settings
    my_mapping = mappings.central_boxinfo_mapping
if index_name.startswith('hltdlogs'):
    my_settings = mappings.central_es_settings_hltlogs
    my_mapping = mappings.central_hltdlogs_mapping

#alias convention
alias_write = index_name + "_write"
alias_read = index_name + "_read"
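# Hedged sketch of what the 'alias' command could do with the names prepared
# above, using the generic send_request call; the _aliases actions body is the
# standard Elasticsearch form, and 'target_index' stands for whatever index the
# aliases should point at.
def create_aliases(es, target_index, alias_write, alias_read):
    actions = {"actions": [
        {"add": {"index": target_index, "alias": alias_write}},
        {"add": {"index": target_index, "alias": alias_read}},
    ]}
    es.send_request('POST', ['_aliases'], body=actions)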
    plist = x[1][1]
    pdict = {}
    for i in range(len(plist)):
        pdict[i] = json.loads(plist[i][1])
    with POOL.connection() as connection:
        tagview = connection.table('top_tags')
        rowkey = "%016i" % int(x[0]) + hashlib.md5(str(x[1][0])).digest()
        tagview.put(rowkey, {
            "p:tag": str(x[1][0]),
            "p:dump": json.dumps(pdict)
        })


#sample input
# (u"102", ((5, 5), '{"photo": {"timeposted": 1422939564, "description": "pdes", "tags": "ptag1,ptag3", "URL": "purl", "title": "ptitle", "pid": "102", "location": {"latitude": "plat", "longitude": "plon"}}}'))
ES = ElasticSearch("http://localhost:9200")


def saveESDocuments(x):
    print "indexing into elasticsearch.., pid,", x[0]
    parsedrawdata = json.loads(x[1][1])
    document = {
        "pid": int(x[0]),
        "likes": x[1][0][0],
        "views": x[1][0][1],
        "location": {
            "lat": parsedrawdata["photo"]["location"]["latitude"],
            "lon": parsedrawdata["photo"]["location"]["longitude"]
        }
    }
    ES.index('photo_geos', 'photos', document, id=document['pid'])
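# Usage sketch: feeding saveESDocuments a tuple shaped like the sample input in
# the comment above (latitude/longitude replaced with numeric values so the geo
# document is well-formed) indexes one photo document keyed by its pid.
sample = (u"102", ((5, 5), '{"photo": {"pid": "102", "location": {"latitude": 40.7, "longitude": -74.0}}}'))
saveESDocuments(sample)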