Example #1
    def index_documents(self, name, documents, bulk=True):
        attempts = 0
        # Box-info documents go to their own write alias; everything else
        # goes to the run index write alias.
        is_box = name.startswith("boxinfo")
        if is_box:
            destination_index = self.boxinfo_write
        else:
            destination_index = self.runindex_write
        while True:
            attempts += 1
            try:
                if bulk:
                    self.es.bulk_index(destination_index, name, documents)
                else:
                    self.es.index(destination_index, name, documents[0])
                return True
            except ElasticHttpError as ex:
                # Retry an HTTP error once before giving up on the document.
                if attempts <= 1:
                    continue
                self.logger.error(
                    'elasticsearch HTTP error. skipping document ' + name)
                if is_box:
                    break
                return False
            except (socket.gaierror, ConnectionError, Timeout) as ex:
                if attempts > 100 and self.runMode:
                    raise ex
                self.logger.error('elasticsearch connection error. retry.')
                if self.stopping:
                    return False
                time.sleep(0.1)
                # Reconnect via a fresh IP from DNS load balancing.
                ip_url = getURLwithIP(self.es_server_url, self.nsslock)
                self.es = ElasticSearch(ip_url, timeout=20, revival_delay=60)
                if is_box:
                    break
        return False
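A minimal usage sketch for the retry wrapper above; the `indexer` instance, the document-type name, and the documents are placeholders, not from the original source:

    # Hypothetical caller: push a small batch through index_documents().
    docs = [{"runNumber": 123456, "state": "open"},
            {"runNumber": 123456, "state": "closed"}]
    if not indexer.index_documents("runinfo", docs, bulk=True):
        # False means the batch was skipped after the retry budget ran out.
        indexer.logger.warning("runinfo batch was not indexed")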
Example #2
    def __init__(self, group_name, topic_name, timeout=60, filename='config.txt'):
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)
        handler = logging.FileHandler('../_logs/%s.log' % group_name)
        handler.setLevel(logging.INFO)
        formatter = logging.Formatter('%(asctime)s %(message)s')
        handler.setFormatter(formatter)
        self.logger.addHandler(handler)

        # One host per config line; the address is the first space-separated field.
        try:
            with open(filename, 'r') as f:
                self.hbasehost = f.readline().split(' ')[0]
                self.eshost = f.readline().split(' ')[0]
                self.kafkahost = f.readline().split(' ')[0]
                self.hdfshost = f.readline().split(' ')[0]
            self.logger.info('All Hosts Loaded')
        except Exception as e:
            self.logger.warning('file load error, %s' % filename)
            self.logger.warning(str(e))
            raise

        self.group_name = group_name
        self.topic_name = topic_name
        self.timeout = timeout

        try:
            self.kafka = KafkaClient(self.kafkahost)
            self.pool = happybase.ConnectionPool(size=6, host=self.hbasehost)
            self.es = ElasticSearch(self.eshost)
        except Exception as e:
            self.logger.warning(str(e))
            raise
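The constructor above takes the first space-separated token of each line in `config.txt` as a host address, one service per line (HBase, Elasticsearch, Kafka, HDFS, in that order). A sketch of writing a matching file; the host names are placeholders:

    # Generate a config.txt in the layout the constructor parses.
    lines = [
        'hbase.example.com hbase host\n',
        'http://es.example.com:9200 elasticsearch host\n',
        'kafka.example.com:9092 kafka host\n',
        'hdfs.example.com hdfs host\n',
    ]
    with open('config.txt', 'w') as f:
        f.writelines(lines)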
Example #3
    def updateIndexMaybe(self, index_name, alias_write, alias_read, settings,
                         mapping):
        connectionAttempts = 0
        retry = False
        while True:
            if self.stopping:
                break
            connectionAttempts += 1
            try:
                if retry or self.ip_url is None:
                    # Resolve a concrete IP (DNS load balancing) and reconnect.
                    self.ip_url = getURLwithIP(self.es_server_url,
                                               self.nsslock)
                    self.es = ElasticSearch(self.ip_url,
                                            timeout=20,
                                            revival_delay=60)

                # Check whether the runindex write alias exists.
                if requests.get(self.es_server_url + '/_alias/' +
                                alias_write).status_code == 200:
                    self.logger.info('writing to elastic index ' +
                                     alias_write + ' on ' +
                                     self.es_server_url + ' - ' + self.ip_url)
                    self.createDocMappingsMaybe(alias_write, mapping)
                    break
                else:
                    time.sleep(.5)
                    if (connectionAttempts % 10) == 0:
                        self.logger.error(
                            'unable to access elasticsearch alias ' +
                            alias_write + ' on ' + self.es_server_url + ' / ' +
                            self.ip_url)
                    continue
            except ElasticHttpError as ex:
                # Elasticsearch-side error: back off and retry.
                self.logger.error(ex)
                if self.runMode and connectionAttempts > 100:
                    self.logger.error(
                        'elastic (BU): exiting after 100 ElasticHttpError reports from '
                        + self.es_server_url)
                    sys.exit(1)
                elif not self.runMode and connectionAttempts > 10:
                    self.threadEvent.wait(60)
                else:
                    self.threadEvent.wait(1)
                retry = True
                continue

            except (socket.gaierror, ConnectionError, Timeout) as ex:
                # Try to reconnect with a different IP from DNS load balancing.
                if self.runMode and connectionAttempts > 100:
                    self.logger.error(
                        'elastic (BU): exiting after 100 connection attempts to '
                        + self.es_server_url)
                    sys.exit(1)
                elif not self.runMode and connectionAttempts > 10:
                    self.threadEvent.wait(60)
                else:
                    self.threadEvent.wait(1)
                retry = True
                continue
Example #4
    def __init__(self,
                 index_name,
                 settings_path,
                 host="http://127.0.0.1:9200"):

        self.connection = ElasticSearch(host)
        self.index_name = index_name
        self.settings_path = settings_path

        self.create_index()
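The `create_index` method invoked above is not part of this excerpt. A minimal sketch of what it could look like, assuming `settings_path` names a JSON settings file and using pyelasticsearch's `create_index` and `IndexAlreadyExistsError` (the already-exists handling is an assumption):

    import json
    from pyelasticsearch.exceptions import IndexAlreadyExistsError

    def create_index(self):
        # Load index settings from the JSON file given at construction time.
        with open(self.settings_path) as f:
            settings = json.load(f)
        try:
            self.connection.create_index(self.index_name, settings=settings)
        except IndexAlreadyExistsError:
            # An existing index is fine; creation is treated as idempotent.
            pass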
Example #5
    def __init__(self, es_server_url):
        self.server = ElasticSearch(es_server_url)
        # Per-document-type lookup queries and per-field aggregation rules.
        self.datadict = {
            'prc-out': {
                "lookup": Query('prc-out', 'source'),
                "action": {
                    'definition': Aggregator('drop'),
                    'data': Aggregator({
                        'in': Aggregator('add'),
                        'out': Aggregator('add'),
                        'file': Aggregator('cat')
                    }),
                    'ls': Aggregator('check'),
                    'stream': Aggregator('check'),
                    'source': Aggregator('match')
                }
            },
            'prc-in': {
                "lookup": Query('prc-in', 'dest'),
                "action": {
                    'definition': Aggregator('drop'),
                    'data': Aggregator({
                        'out': Aggregator('add'),
                    }),
                    'ls': Aggregator('check'),
                    'index': Aggregator('cat'),
                    'source': Aggregator('check'),
                    'dest': Aggregator('check'),
                    'process': Aggregator('cat')
                }
            },
            'prc-s-state': {
                "lookup": Query('prc-s-state'),
                "action": {
                    'macro': Aggregator('histoadd'),
                    'mini': Aggregator('histoadd'),
                    'micro': Aggregator('histoadd'),
                    'tp': Aggregator('add'),
                    'lead': Aggregator('avg'),
                    'nfiles': Aggregator('add'),
                    'ls': Aggregator('check'),
                    'process': Aggregator('cat')
                }
            }
        }
Example #6
    def __init__(self, es_server_url, runstring, indexSuffix, monBufferSize,
                 fastUpdateModulo):
        self.logger = logging.getLogger(self.__class__.__name__)
        self.istateBuffer = []
        self.prcinBuffer = {}
        self.prcoutBuffer = {}
        self.fuoutBuffer = {}
        self.es = ElasticSearch(es_server_url, timeout=20)
        self.hostname = os.uname()[1]
        self.hostip = socket.gethostbyname_ex(self.hostname)[2][0]
        #self.number_of_data_nodes = self.es.health()['number_of_data_nodes']
        # Pin index shards to this host's IP via shard allocation filtering.
        self.settings = {"index.routing.allocation.require._ip": self.hostip}
        self.indexCreated = False
        self.indexFailures = 0
        self.monBufferSize = monBufferSize
        self.fastUpdateModulo = fastUpdateModulo
        aliasName = runstring + "_" + indexSuffix
        self.indexName = aliasName  # + "_" + self.hostname
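A sketch of how the allocation setting built above might be applied when the per-host index is created; the `createIndexMaybe` name, the settings envelope, and the `ElasticHttpError` handling are assumptions, not part of the original class:

    def createIndexMaybe(self):
        # Hypothetical helper: create the index once, pinning its shards
        # to this host via the setting prepared in __init__.
        if self.indexCreated:
            return
        try:
            self.es.create_index(self.indexName,
                                 settings={"settings": self.settings})
        except ElasticHttpError:
            # Most commonly "index already exists"; count other failures.
            self.indexFailures += 1
        else:
            self.indexCreated = True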
Example #7
def main():
    if len(sys.argv) > 3:
        print("Invalid argument number")
        sys.exit(1)
    if len(sys.argv) < 2:
        print("Please provide an elasticsearch server url (e.g. http://localhost:9200)")
        sys.exit(1)

    deleteOld = False
    if len(sys.argv) > 2:
        if "replace" in sys.argv[2]:
            deleteOld = True

    es_server_url = sys.argv[1]
    ip_url = getURLwithIP(es_server_url)
    es = ElasticSearch(es_server_url)

    #get_template
    #es.send_request('GET', ['_template', name], query_params=query_params)

    #list templates known to the cluster
    res = es.cluster_state(filter_routing_table=True,
                           filter_nodes=True,
                           filter_blocks=True)
    templateList = res['metadata']['templates']

    for template_name in TEMPLATES:
        if template_name not in templateList:
            print("{0} template not present. It will be created.".format(
                template_name))
            create_template(es, template_name)
        else:
            if not deleteOld:
                print("{0} already exists. Add 'replace' parameter to force update.".format(
                    template_name))
            else:
                print("{0} already exists.".format(template_name))
                delete_template(es, template_name)
                print("Deleted old template and will recreate {0}".format(
                    template_name))
                create_template(es, template_name)
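For reference, how the script above would be invoked; the script filename is a placeholder:

    # Create any templates that are missing:
    #     python es_templates.py http://localhost:9200
    # Delete and recreate templates that already exist:
    #     python es_templates.py http://localhost:9200 replace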
Example #8
    def write_es_geo(self,
                     es_host='http://localhost:9200/',
                     index_name="geos",
                     doc_type='user_geos'):
        # Connect to ES (use the es_host argument rather than a hardcoded URL).
        es = ElasticSearch(es_host)

        ## uncomment the following code to prompt for confirmation
        # print("Will delete all the docs in the [index:type] from ElasticSearch:")
        # print(index_name, ":", doc_type)
        # confirm = input("Sure?(y/n)")
        # if confirm != "y":
        #     sys.exit(0)

        # Recreate the index before loading.
        try:
            create_es()
        except Exception as e:
            print("Error", e)
        else:
            print(index_name, ":", doc_type, " deleted!")

        # Build the documents to index.
        documents = []
        for record in self.userGeos:
            doc = {
                'uid': int(record[0]),
                'location': {
                    'lat': record[1],
                    'lon': record[2]
                }
            }
            documents.append(doc)
        print("Bulk indexing", len(documents), "documents..")
        es.bulk_index(index_name, doc_type, documents, id_field='uid')
        es.refresh(index_name)

        # Sanity check: read the documents back.
        print("results from ES,")
        query = {"from": 0, "size": 2000, 'query': {"match_all": {}}}
        res = es.search(query, index=index_name)
        print(len(res['hits']['hits']), "documents found")
        print("sample result")
        print(res['hits']['hits'][0])
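Geo queries against the `location` field only work if it is mapped as a `geo_point` before the bulk load above. A minimal sketch using pyelasticsearch's `put_mapping`; the mapping body is an assumption for the pre-5.x Elasticsearch this client targets:

    # Hypothetical mapping so 'location' is indexed as a geo_point.
    geo_mapping = {
        doc_type: {
            "properties": {
                "uid": {"type": "integer"},
                "location": {"type": "geo_point"}
            }
        }
    }
    es.put_mapping(index_name, doc_type, geo_mapping)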
Example #9
#!/usr/bin/env python
import os, sys, time, datetime
import threading
from pyelasticsearch.client import ElasticSearch
import json
from ordereddict import OrderedDict

#es_hosts=['http://fuval-c2a11-02:9200','http://fuval-c2a11-03:9200','http://fuval-c2a11-15:9200']
#es_tribe_hosts=['http://fuval-c2a11-28:9200']

es_hosts = ['http://dvcu-ccsl6-01:9200']
es_tribe_hosts = ['http://dvtu-ccsl6-01:9200']

main_es = ElasticSearch(es_hosts[0])
tribe_es = ElasticSearch(es_tribe_hosts[0])
main_index = 'runindex'
setup = 'daq2val'


class query_maker(threading.Thread):
    def __init__(self, run):
        threading.Thread.__init__(self)
        self.running = True
        self.hostname = os.uname()[1]
        self.ip = {}
        self.runno = run
        self.known_streams = {}
        app_query = {
            "query": {
                "top_children": {
                    "score": "sum",
Example #10
#!/usr/bin/python
from flask import Flask, request, json
import flask
import happybase
from pyelasticsearch.client import ElasticSearch
import hashlib
hbasehost = 'c0tl.com'
from struct import *
app = Flask(__name__)
es = ElasticSearch('http://*****:*****')


@app.route('/')
def home():
    return """<html>
  <h2>Welcome to the colt API!</h2>
</html>"""
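A sketch of a search endpoint added alongside the home route, using the module-level `es` client; the route path, index name, and query shape are assumptions for illustration:

    @app.route('/search/<keyword>')
    def search(keyword):
        # Hypothetical match query against a 'photos' index.
        body = {"query": {"match": {"title": keyword}}}
        res = es.search(body, index='photos')
        return flask.jsonify(hits=res['hits']['hits'])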
Example #11
from pyelasticsearch.client import ElasticSearch
import pyelasticsearch
import sys
import os

SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.join(SCRIPT_DIR, '..'))

from config import settings

connection = ElasticSearch(settings.ES_HOST)


class QueryBuilder:
    def __init__(self, query, size):

        self.size = size
        self.should_query = []
        self.searchQuery = query
        self.edgeword, self.keyword = self.processText()

    def processText(self):

        text_split = self.searchQuery.split(" ")
        if len(text_split) == 1:
            return self.searchQuery, None

        edgeword = text_split.pop()
        keyword = " ".join(text_split)
        return edgeword, keyword
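For reference, how `processText` splits a query: the last whitespace-separated token becomes the edge (prefix) word and the rest becomes the keyword phrase. A quick sketch with arbitrary inputs:

    qb = QueryBuilder("new york piz", size=10)
    assert qb.edgeword == "piz" and qb.keyword == "new york"

    # A single-word query yields no keyword part.
    qb = QueryBuilder("pizza", size=10)
    assert qb.edgeword == "pizza" and qb.keyword is None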
Example #12
if len(sys.argv) >= 4:
    command = sys.argv[1]
    server_url = sys.argv[2]
    index_name = sys.argv[3]
else:
    print("Parameters: command[create,alias,mapping] server_url index/alias_name (optional: target index)")
    print("  COMMANDS:")
    print("    create: create index")
    print("    alias: create index *_read and *_write aliases (optional parameter: target index)")
    print("    mapping: create missing document mappings for the index")
    sys.exit(1)

if not server_url.startswith('http://'):
    server_url = 'http://' + server_url

#connection
es = ElasticSearch(server_url)

#pick settings and mapping based on the index name prefix
if index_name.startswith('runindex'):
    my_settings = mappings.central_es_settings
    my_mapping = mappings.central_runindex_mapping
elif index_name.startswith('boxinfo'):
    my_settings = mappings.central_es_settings
    my_mapping = mappings.central_boxinfo_mapping
elif index_name.startswith('hltdlogs'):
    my_settings = mappings.central_es_settings_hltlogs
    my_mapping = mappings.central_hltdlogs_mapping

#alias convention
alias_write = index_name + "_write"
alias_read = index_name + "_read"
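The aliases computed above would typically be attached with an `_aliases` actions request; a minimal sketch using pyelasticsearch's `update_aliases` (assumed here to belong to the `alias` command path, with `index_name` naming an existing index):

    # Hypothetical alias attachment for the computed names.
    es.update_aliases({
        "actions": [
            {"add": {"index": index_name, "alias": alias_write}},
            {"add": {"index": index_name, "alias": alias_read}},
        ]
    })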
Example #13
    plist = x[1][1]
    pdict = {}
    for i in range(len(plist)):
        pdict[i] = json.loads(plist[i][1])
    with POOL.connection() as connection:
        tagview = connection.table('top_tags')
        # Row key: zero-padded numeric key plus an md5 digest of the tag
        # (both encoded to bytes so this also works on Python 3).
        rowkey = ("%016i" % int(x[0])).encode() + hashlib.md5(
            str(x[1][0]).encode()).digest()
        tagview.put(rowkey, {
            "p:tag": str(x[1][0]),
            "p:dump": json.dumps(pdict)
        })


#sample input
# (u"102", ((5, 5), "{"photo": {"timeposted": 1422939564, "description": "pdes", "tags": "ptag1,ptag3", "URL": "purl", "title": "ptitle", "pid": "102", "location": {"latitude": "plat", "longitude": "plon"}}}"))
ES = ElasticSearch("http://localhost:9200")


def saveESDocuments(x):
    print "writing to hbase.., pid,", x[0]
    parsedrawdata = json.loads(x[1][1])
    document = {
        "pid": int(x[0]),
        "likes": x[1][0][0],
        "views": x[1][0][1],
        "location": {
            "lat": parsedrawdata["photo"]["location"]["latitude"],
            "lon": parsedrawdata["photo"]["location"]["longitude"]
        }
    }
    ES.index('photo_geos', 'photos', document, id=document['pid'])
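Once the documents carry a `geo_point`-mapped `location` (see Example #8), they can be searched by distance. A sketch against the index used above; the coordinates, distance, and the pre-2.x `filtered` query shape are assumptions:

    # Hypothetical geo-distance search over the indexed photos.
    geo_query = {
        "query": {
            "filtered": {
                "query": {"match_all": {}},
                "filter": {
                    "geo_distance": {
                        "distance": "10km",
                        "location": {"lat": 40.7, "lon": -74.0}
                    }
                }
            }
        }
    }
    res = ES.search(geo_query, index='photo_geos')
    print(len(res['hits']['hits']), "photos within 10km")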