class IndexData:
    """Create an Elasticsearch index and bulk-load tab-separated (cin, name) records."""

    def __init__(self,
                 index_name,
                 settings_path,
                 host="http://127.0.0.1:9200"):
        # Connect and (re)create the target index immediately.
        self.connection = ElasticSearch(host)
        self.index_name = index_name
        self.settings_path = settings_path

        self.create_index()

    def get_settings(self):
        """Load index settings/mappings from the JSON file at self.settings_path."""
        # open() in a context manager replaces the Py2-only file() builtin,
        # which also leaked the file handle.
        with open(self.settings_path) as config_file:
            return json.load(config_file)

    def create_index(self):
        """Create the index; if that fails (e.g. it already exists), drop and recreate it."""
        settings = self.get_settings()
        try:
            self.connection.create_index(self.index_name, settings)
        except pyelasticsearch.exceptions.ElasticHttpError:
            # Index already present (or similar HTTP error): recreate from scratch.
            self.connection.delete_index(self.index_name)
            self.connection.create_index(self.index_name, settings)

    def index_data(self, data_path, index_type):
        """Bulk-index tab-separated "cin<TAB>name" lines from data_path.

        Raises:
            ValueError: if index_type is None.
        """
        if index_type is None:
            # Raising a plain string is invalid in Python 3; use a real exception.
            raise ValueError("Please enter valid index type")
        objects = []
        with open(data_path) as f:
            for line in f:
                word_split = line.split("\t")
                cin = word_split[0]
                name = word_split[1].strip()
                objects.append({'cin': cin, 'name': name})
                # Flush in batches to bound memory usage.
                if len(objects) > 1000:
                    self.connection.bulk_index(self.index_name,
                                               index_type,
                                               objects,
                                               id_field='cin')
                    objects = []
            # Flush the final partial batch, if any (a bulk request with an
            # empty document list would be rejected by the server).
            if objects:
                self.connection.bulk_index(self.index_name,
                                           index_type,
                                           objects,
                                           id_field='cin')
class IndexData:
    """Build an Elasticsearch index and bulk-insert (cin, name) pairs from a TSV file."""

    def __init__(self, index_name, settings_path, host="http://127.0.0.1:9200"):
        # The index is created eagerly so the instance is ready to index data.
        self.connection = ElasticSearch(host)
        self.index_name = index_name
        self.settings_path = settings_path

        self.create_index()

    def get_settings(self):
        """Return the settings dict parsed from the JSON file at self.settings_path."""
        # Replaces the Py2-only file() builtin and closes the handle deterministically.
        with open(self.settings_path) as config_file:
            return json.load(config_file)

    def create_index(self):
        """Create the index, dropping and recreating it on an HTTP error (already exists)."""
        settings = self.get_settings()
        try:
            self.connection.create_index(self.index_name, settings)
        except pyelasticsearch.exceptions.ElasticHttpError:
            self.connection.delete_index(self.index_name)
            self.connection.create_index(self.index_name, settings)

    def index_data(self, data_path, index_type):
        """Bulk-index tab-separated lines ("cin<TAB>name") from data_path.

        Raises:
            ValueError: if index_type is None.
        """
        if index_type is None:
            # A string after `raise` is a TypeError in Python 3; raise properly.
            raise ValueError("Please enter valid index type")
        objects = []
        with open(data_path) as f:
            for line in f:
                word_split = line.split("\t")
                cin = word_split[0]
                name = word_split[1].strip()
                objects.append({'cin': cin, 'name': name})
                # Send documents in chunks so memory stays bounded.
                if len(objects) > 1000:
                    self.connection.bulk_index(self.index_name, index_type, objects, id_field='cin')
                    objects = []
            # Guard the trailing flush: bulk-indexing an empty list would fail.
            if objects:
                self.connection.bulk_index(self.index_name, index_type, objects, id_field='cin')
# Example #3 (original separator: "Пример #3")
# 0
class Indexer(object):
    """Writes event documents into an Elasticsearch index.

    NOTE(review): relies on ElasticSearch, ESJSONEncoder and
    ElasticHttpNotFoundError being imported elsewhere in this file.
    """

    def __init__(self, url='http://localhost:9200/', index='events'):
        self._es = ElasticSearch(url)
        # Custom JSON encoder so non-native types serialize — presumably
        # datetimes for the 'timestamp' field; confirm against ESJSONEncoder.
        self._es.json_encoder = ESJSONEncoder
        self._index = index


    def cleanup(self):
        """Drop and recreate the index, then install the 'post' mapping."""
        try:
            self._es.delete_index(self._index)
        except ElasticHttpNotFoundError:
            # Index did not exist yet — nothing to delete.
            pass
        # Dynamic mapping disabled: only explicitly mapped fields are indexed.
        self._es.create_index(self._index, settings={
            'index': {
                'mapper': {
                    'dynamic': False
                }
            }
        })

        # Variant 1: strings stored as single un-analyzed terms (exact match).
        not_analyzed_mapping = {
            'properties': {
                'timestamp': {'type': 'date', 'format': 'dateOptionalTime'},
                'source': {'type': 'string', 'index': 'not_analyzed'},
                'venue': {'type': 'string', 'index': 'not_analyzed'},
                'poster': {'type': 'string', 'index': 'not_analyzed'},
                'delta': {'type': 'integer'}
            }
        }

        # Variant 2: keyword analyzer (whole value as one token) per string field.
        analyzed_mapping = {
            'properties': {
                'timestamp': {'type': 'date', 'format': 'dateOptionalTime'},
                'source': {'type': 'string', 'analyzer': 'keyword'},
                'venue': {'type': 'string', 'analyzer': 'keyword'},
                'poster': {'type': 'string', 'analyzer': 'keyword'},
                'delta': {'type': 'integer'}
            }
        }

        # Variant 3: whitespace-tokenized venue/poster, keyword source.
        hybrid_mapping = {
            'properties': {
                'timestamp': {'type': 'date', 'format': 'dateOptionalTime'},
                'source': {'type': 'string', 'analyzer': 'keyword'},
                'venue': {'type': 'string', 'analyzer': 'whitespace'},
                'poster': {'type': 'string', 'analyzer': 'whitespace'},
                'delta': {'type': 'integer'}
            }
        }


        # Only the not_analyzed variant is actually applied; the other two are
        # kept as ready-made alternatives to switch to.
        mapping = not_analyzed_mapping

        self._es.put_mapping(self._index, 'post', {'post': mapping})

    def add(self, event):
        """Index one event document, keyed by '<source>-<id>'.

        NOTE(review): event['type'] must be present — .get('type').lower()
        raises AttributeError on a missing key.
        """
        # venue/poster are prefixed with the source to avoid cross-source
        # name collisions in aggregations.
        data = {
            'timestamp': event['timestamp'],
            'source': event['_id']['source'],
            'venue': '{}-{}'.format(event['_id']['source'], event['venue']),
            'poster': '{}-{}'.format(event['_id']['source'], event['poster']),
            'delta': event.get('delta', 1)
        }

        self._es.index(
            self._index,
            event.get('type').lower(),
            data,
            id='{source}-{id}'.format(**event['_id'])
        )
# Example #4 (original separator: "Пример #4")
# 0
class elasticBand():
    """Collects and indexes hltd per-run monitoring documents in Elasticsearch.

    On construction the per-run index "<runstring>_<indexSuffix>" is created
    with a custom path_hierarchy analyzer (splitting terms on '_') that is
    used by the routing fields of the prc-in/prc-out/fu-out mappings.
    """

    def __init__(self,es_server_url,runstring,indexSuffix,monBufferSize,fastUpdateModulo):
        self.logger = logging.getLogger(self.__class__.__name__)
        self.istateBuffer = []
        self.prcinBuffer = {}   # {"lsX": doclist}
        self.prcoutBuffer = {}
        self.fuoutBuffer = {}
        self.es = ElasticSearch(es_server_url)
        # Index-level settings: analyzer tokenizing values on '_' path segments.
        self.settings = {
            "analysis":{
                "analyzer": {
                    "prefix-test-analyzer": {
                        "type": "custom",
                        "tokenizer": "prefix-test-tokenizer"
                    }
                },
                "tokenizer": {
                    "prefix-test-tokenizer": {
                        "type": "path_hierarchy",
                        "delimiter": "_"
                    }
                }
            },
            "index":{
                'number_of_shards' : 16,
                'number_of_replicas' : 1
            }
        }

        # Per-document-type mappings for this run's index.
        self.run_mapping = {
            'prc-i-state' : {
                'properties' : {
                    'macro'     : {'type' : 'integer'},
                    'mini'      : {'type' : 'integer'},
                    'micro'     : {'type' : 'integer'},
                    'tp'        : {'type' : 'double' },
                    'lead'      : {'type' : 'double' },
                    'nfiles'    : {'type' : 'integer'},
                    'fm_date'   : {'type' : 'date'   }
                },
                '_timestamp' : {
                    'enabled'   : True,
                    'store'     : "yes",
                    "path"      : "fm_date"
                },
                '_ttl'       : { 'enabled' : True,
                                 'default' :  '5m'
                }
            },
            'prc-s-state' : {
                'properties' : {
                    'macro'  : {'type' : 'integer'},
                    'mini'   : {'type' : 'integer'},
                    'micro'  : {'type' : 'integer'},
                    'tp'     : {'type' : 'double'},
                    'lead'   : {'type' : 'double'},
                    'nfiles' : {'type' : 'integer'},
                    'ls'     : {'type' : 'integer'},
                    'process': {'type' : 'string'}
                },
            },
            'fu-s-state' : {
                'properties' : {
                    'macro'  : {'type' : 'integer'},
                    'mini'   : {'type' : 'integer'},
                    'micro'  : {'type' : 'integer'},
                    'tp'     : {'type' : 'double'},
                    'lead'   : {'type' : 'double'},
                    'nfiles' : {'type' : 'integer'},
                    'ls'     : {'type' : 'integer'},
                    'machine': {'type' : 'string'}
                }
            },
            'prc-out': {
                '_routing' :{
                    'required' : True,
                    'path'     : 'source'
                },
                'properties' : {
                    #'definition': {'type': 'string'},
                    'data' : { 'properties' : {
                            'in' : { 'type' : 'integer'},
                            'out': { 'type' : 'integer'},
                            'file': { 'type' : 'string','index' : 'not_analyzed'}
                            }
                    },
                    'ls' : {
                        'type' : 'integer',
                        'store': "yes"
                    },
                    'stream' : {'type' : 'string','index' : 'not_analyzed'},
                    'source' : {
                        'type' : 'string',
                        'index_analyzer': 'prefix-test-analyzer',
                        'search_analyzer': "keyword",
                        'store' : "yes",
                        'index' : "analyzed"
                    }
                },
                '_timestamp' : {
                    'enabled' : True,
                    'store'   : "yes"
                }
            },
            'prc-in': {
                '_routing' :{
                    'required' : True,
                    'path'     : 'dest'
                },
                'properties' : {
                    #'definition': {'type': 'string',"index" : "not_analyzed"},
                    'data' : { 'properties' : {
                            'out'    : { 'type' : 'integer'}
                            }
                    },
                    'ls'     : {
                        'type' : 'integer',
                        'store': 'yes'
                    },
                    'index'  : { 'type' : 'integer' },
                    'source' : { 'type' : 'string'  },
                    'dest' : {
                        'type' : 'string',
                        'index_analyzer': 'prefix-test-analyzer',
                        'search_analyzer': "keyword",
                        'store' : "yes",
                        'index' : "analyzed",
                        },
                    'process' : { 'type' : 'integer' }
                },
                '_timestamp' : {
                    'enabled' : True,
                    'store'   : "yes"
                }
            },
            'fu-out': {
                '_routing' :{
                    'required' : True,
                    'path'     : 'source'
                },
                'properties' : {
                    #'definition': {'type': 'string',"index" : "not_analyzed"},
                    'data' : { 'properties' : {
                            'in' : { 'type' : 'integer'},
                            'out': { 'type' : 'integer'},
                            'errorEvents' : {'type' : 'integer'},
                            'returnCodeMask': {'type':'string',"index" : "not_analyzed"},
                            'fileSize' : {'type':'long'},
                            'files': {
                                'properties' : {
                                    'name' : { 'type' : 'string',"index" : "not_analyzed"}
                                    }
                                }
                             }
                    },
                    'ls' : { 'type' : 'integer' },
                    'stream' : {'type' : 'string','index' : 'not_analyzed'},
                    'source' : {
                        'type' : 'string',
                        'index_analyzer': 'prefix-test-analyzer',
                        'search_analyzer': "keyword"
                    }
                },
                '_timestamp' : {
                    'enabled' : True,
                    'store'   : "yes"
                }
            },
            'fu-complete' : {
                'properties' : {
                    'host'     : {'type' : 'string'},
                    'fm_date'   : {'type' : 'date' }
                },
                '_timestamp' : {
                    'enabled'   : True,
                    'store'     : "yes",
                    "path"      : "fm_date"
                },
            },
            'bu-out': {
                'properties' : {
                    #'definition': {'type': 'string',"index" : "not_analyzed"},
                    'out': { 'type' : 'integer'},
                    'ls' : { 'type' : 'integer' },
                    'source' : {'type' : 'string'}#,"index" : "not_analyzed"}
                }
            },
            'cmsswlog' : {
                '_timestamp' : {
                    'enabled'   : True,
                    'store'     : "yes"
                },
                '_ttl'       : { 'enabled' : True,
                              'default' :  '30d'}
                ,
                'properties' : {
                    'host'      : {'type' : 'string'},
                    'pid'       : {'type' : 'integer'},
                    'type'      : {'type' : 'string',"index" : "not_analyzed"},
                    'severity'  : {'type' : 'string',"index" : "not_analyzed"},
                    'severityVal'  : {'type' : 'integer'},
                    'category'  : {'type' : 'string'},

                    'fwkState'     : {'type' : 'string',"index" : "not_analyzed"},
                    'module'     : {'type' : 'string',"index" : "not_analyzed"},
                    'moduleInstance'     : {'type' : 'string',"index" : "not_analyzed"},
                    'moduleCall'     : {'type' : 'string',"index" : "not_analyzed"},
                    'lumi'     : {'type' : 'integer'},
                    'eventInPrc'     : {'type' : 'long'},

                    'message'   : {'type' : 'string'},#,"index" : "not_analyzed"},
                    'lexicalId' : {'type' : 'string',"index" : "not_analyzed"},
                    'msgtime' : {'type' : 'date','format':'dd-MMM-YYYY HH:mm:ss'},
                    'msgtimezone' : {'type' : 'string'}
                    #'context'   : {'type' : 'string'}
                 }
            }
        }
        self.run = runstring
        self.monBufferSize = monBufferSize
        self.fastUpdateModulo = fastUpdateModulo
        self.indexName = runstring + "_"+indexSuffix
        try:
            self.es.create_index(self.indexName, settings={ 'settings': self.settings, 'mappings': self.run_mapping })
        except ElasticHttpError as ex:
            # This is normally fine, as the index gets created somewhere across the cluster.
            pass

    def imbue_jsn(self,infile):
        """Parse infile.filepath as JSON; return (document, 0), or (None, -1) on a parse error."""
        with open(infile.filepath,'r') as fp:
            try:
                document = json.load(fp)
            # JSONDecodeError subclasses ValueError on both py2 (simplejson)
            # and py3; this replaces the Py2-only "except E,ex" syntax, and
            # the undefined bare `logger` is corrected to self.logger.
            except ValueError as ex:
                self.logger.exception(ex)
                return None,-1
            return document,0
# Example #5 (original separator: "Пример #5")
# 0
class elasticBandBU:
    """BU-side Elasticsearch writer for the hltd central run index.

    Creates/attaches to the run index on construction (with retries against
    connection errors), then offers elasticize_* helpers that convert
    monitoring files into documents and bulk-index them.
    """

    def __init__(self,es_server_url,runnumber,startTime,runMode=True):
        self.logger = logging.getLogger(self.__class__.__name__)
        self.es_server_url=es_server_url
        self.index_name=conf.elastic_runindex_name
        self.runnumber = str(runnumber)
        self.startTime = startTime
        self.host = os.uname()[1]
        self.stopping=False
        self.threadEvent = threading.Event()
        self.runMode=runMode
        # Index settings: path_hierarchy analyzer splitting on spaces.
        self.settings = {
            "analysis":{
                "analyzer": {
                    "prefix-test-analyzer": {
                        "type": "custom",
                        "tokenizer": "prefix-test-tokenizer"
                    }
                },
                "tokenizer": {
                    "prefix-test-tokenizer": {
                        "type": "path_hierarchy",
                        "delimiter": " "
                    }
                }
             },
            "index":{
                'number_of_shards' : 10,
                'number_of_replicas' : 3
            },
        }

        # Per-document-type mappings; several types are parented to 'run'.
        self.run_mapping = {
            'run' : {
#                '_routing' :{
#                    'required' : True,
#                    'path'     : 'runNumber'
#                },
                '_id' : {
                    'path' : 'runNumber'
                },
                'properties' : {
                    'runNumber':{
                        'type':'integer'
                        },
                    'startTimeRC':{
                        'type':'date'
                            },
                    'stopTimeRC':{
                        'type':'date'
                            },
                    'startTime':{
                        'type':'date'
                            },
                    'endTime':{
                        'type':'date'
                            },
                    'completedTime' : {
                        'type':'date'
                            }
                },
                '_timestamp' : {
                    'enabled' : True,
                    'store'   : 'yes'
                    }
            },
            'microstatelegend' : {

                '_id' : {
                    'path' : 'id'
                },
                '_parent':{'type':'run'},
                'properties' : {
                    'names':{
                        'type':'string'
                        },
                    'id':{
                        'type':'string'
                        }
                    }
            },
            'pathlegend' : {

                '_id' : {
                    'path' : 'id'
                },
                '_parent':{'type':'run'},
                'properties' : {
                    'names':{
                        'type':'string'
                        },
                    'id':{
                        'type':'string'
                        }

                    }
                },
            'boxinfo' : {
                '_id'        :{'path':'id'},#TODO:remove
                'properties' : {
                    'fm_date'       :{'type':'date'},
                    'id'            :{'type':'string'},
                    'broken'        :{'type':'integer'},
                    'used'          :{'type':'integer'},
                    'idles'         :{'type':'integer'},
                    'quarantined'   :{'type':'integer'},
                    'usedDataDir'   :{'type':'integer'},
                    'totalDataDir'  :{'type':'integer'},
                    'usedRamdisk'   :{'type':'integer'},
                    'totalRamdisk'  :{'type':'integer'},
                    'usedOutput'    :{'type':'integer'},
                    'totalOutput'   :{'type':'integer'},
                    'activeRuns'    :{'type':'string'}
                    },
                '_timestamp' : {
                    'enabled'   : True,
                    'store'     : "yes",
                    "path"      : "fm_date"
                    },
                '_ttl'       : { 'enabled' : True,
                              'default' :  '30d'
                    }
                },

            'boxinfo_last' : {
                '_id'        :{'path':'id'},
                'properties' : {
                    'fm_date'       :{'type':'date'},
                    'id'            :{'type':'string'},
                    'broken'        :{'type':'integer'},
                    'used'          :{'type':'integer'},
                    'idles'         :{'type':'integer'},
                    'quarantined'   :{'type':'integer'},
                    'usedDataDir'   :{'type':'integer'},
                    'totalDataDir'  :{'type':'integer'},
                    'usedRamdisk'   :{'type':'integer'},
                    'totalRamdisk'  :{'type':'integer'},
                    'usedOutput'    :{'type':'integer'},
                    'totalOutput'   :{'type':'integer'},
                    'activeRuns'    :{'type':'string'}
                    },
                '_timestamp' : {
                    'enabled'   : True,
                    'store'     : "yes",
                    "path"      : "fm_date"
                    }
                },

            'eols' : {
                '_id'        :{'path':'id'},
                '_parent'    :{'type':'run'},
                'properties' : {
                    'fm_date'       :{'type':'date'},
                    'id'            :{'type':'string'},
                    'ls'            :{'type':'integer'},
                    'NEvents'       :{'type':'integer'},
                    'NFiles'        :{'type':'integer'},
                    'TotalEvents'   :{'type':'integer'}
                    },
                '_timestamp' : {
                    'enabled'   : True,
                    'store'     : "yes",
                    "path"      : "fm_date"
                    },
                },
            'minimerge' : {
                '_id'        :{'path':'id'},
                '_parent'    :{'type':'run'},
                'properties' : {
                    'fm_date'       :{'type':'date'},
                    'id'            :{'type':'string'}, #run+appliance+stream+ls
                    'appliance'     :{'type':'string'},
                    'stream'        :{'type':'string','index' : 'not_analyzed'},
                    'ls'            :{'type':'integer'},
                    'processed'     :{'type':'integer'},
                    'accepted'      :{'type':'integer'},
                    'errorEvents'   :{'type':'integer'},
                    'size'          :{'type':'integer'},
                    }
                }
            }


        # Retry until the index exists; back off harder outside run mode.
        connectionAttempts=0
        while True:
            if self.stopping:break
            connectionAttempts+=1
            try:
                self.logger.info('writing to elastic index '+self.index_name)
                ip_url=getURLwithIP(es_server_url)
                self.es = ElasticSearch(es_server_url)
                self.es.create_index(self.index_name, settings={ 'settings': self.settings, 'mappings': self.run_mapping })
                break
            except ElasticHttpError as ex:
                #this is normally fine as the index gets created somewhere across the cluster
                if "IndexAlreadyExistsException" in str(ex):
                    self.logger.info(ex)
                    break
                else:
                    self.logger.error(ex)
                    if runMode and connectionAttempts>100:
                        self.logger.error('elastic (BU): exiting after 100 ElasticHttpError reports from '+ es_server_url)
                        sys.exit(1)
                    elif runMode==False and connectionAttempts>10:
                        self.threadEvent.wait(60)
                    else:
                        self.threadEvent.wait(1)
                    continue

            except (ConnectionError,Timeout) as ex:
                #try to reconnect with different IP from DNS load balancing
                if runMode and connectionAttempts>100:
                   self.logger.error('elastic (BU): exiting after 100 connection attempts to '+ es_server_url)
                   sys.exit(1)
                elif runMode==False and connectionAttempts>10:
                   self.threadEvent.wait(60)
                else:
                   self.threadEvent.wait(1)
                continue

        #write run number document
        if runMode == True:
            document = {}
            document['runNumber'] = self.runnumber
            document['startTime'] = startTime
            documents = [document]
            self.index_documents('run',documents)

    def resetURL(self,url):
        """Reconnect the Elasticsearch client to a new URL.

        Fix: the original signature was resetURL(url) without self, so the
        method received the instance as `url` and self was undefined.
        """
        self.es = None
        self.es = ElasticSearch(url)

    def read_line(self,fullpath):
        """Return the first line of the file at fullpath (including newline)."""
        with open(fullpath,'r') as fp:
            return fp.readline()

    def elasticize_modulelegend(self,fullpath):
        """Index the microstate legend file as a 'microstatelegend' child of the run."""
        self.logger.info(os.path.basename(fullpath))
        stub = self.read_line(fullpath)
        document = {}
        document['_parent']= self.runnumber
        document['id']= "microstatelegend_"+self.runnumber
        document['names']= self.read_line(fullpath)
        documents = [document]
        return self.index_documents('microstatelegend',documents)


    def elasticize_pathlegend(self,fullpath):
        """Index the path legend file as a 'pathlegend' child of the run."""
        self.logger.info(os.path.basename(fullpath))
        stub = self.read_line(fullpath)
        document = {}
        document['_parent']= self.runnumber
        document['id']= "pathlegend_"+self.runnumber
        document['names']= self.read_line(fullpath)
        documents = [document]
        return self.index_documents('pathlegend',documents)

    def elasticize_runend_time(self,endtime):
        """Update the 'run' document with the run end time."""
        self.logger.info(str(endtime)+" going into buffer")
        document = {}
        document['runNumber'] = self.runnumber
        document['startTime'] = self.startTime
        document['endTime'] = endtime
        documents = [document]
        self.index_documents('run',documents)

    def elasticize_box(self,infile):
        """Index one boxinfo snapshot; silently skips malformed input."""
        basename = infile.basename
        self.logger.debug(basename)
        try:
            document = infile.data
            #TODO:let dynamic ID
            document['id']= basename + '_' + document['fm_date'].split('.')[0] #TODO:remove
            documents = [document]
        except Exception:
            # Deliberate best-effort: malformed box info is dropped.
            # (Narrowed from a bare except so SystemExit etc. still propagate.)
            return
        self.index_documents('boxinfo',documents)

    def elasticize_eols(self,infile):
        """Index an end-of-lumisection document as an 'eols' child of the run."""
        basename = infile.basename
        self.logger.info(basename)
        data = infile.data['data']
        data.append(infile.mtime)
        data.append(infile.ls[2:])

        values = [int(f) if f.isdigit() else str(f) for f in data]
        keys = ["NEvents","NFiles","TotalEvents","fm_date","ls"]
        document = dict(zip(keys, values))

        document['id'] = infile.name+"_"+os.uname()[1]
        document['_parent']= self.runnumber
        documents = [document]
        self.index_documents('eols',documents)

    def elasticize_minimerge(self,infile):
        """Index a mini-merge summary document as a 'minimerge' child of the run."""
        basename = infile.basename
        self.logger.info(basename)
        data = infile.data['data']
        data.append(infile.mtime)
        data.append(infile.ls[2:])
        stream=infile.stream
        # Strip the "stream" prefix from the stream name if present.
        if stream.startswith("stream"): stream = stream[6:]
        data.append(stream)
        values = [int(f) if str(f).isdigit() else str(f) for f in data]
        keys = ["processed","accepted","errorEvents","fname","size","eolField1","eolField2","fm_date","ls","stream"]
        document = dict(zip(keys, values))
        document['id'] = infile.name
        document['_parent']= self.runnumber
        documents = [document]
        self.index_documents('minimerge',documents)

    def index_documents(self,name,documents):
        """Bulk-index documents of type `name`, retrying on transient errors.

        Returns True on success, False when the documents are dropped.
        """
        attempts=0
        while True:
            attempts+=1
            try:
                self.es.bulk_index(self.index_name,name,documents)
                return True
            except ElasticHttpError as ex:
                # Retry exactly once on an HTTP error, then give up on this batch.
                if attempts<=1:continue
                self.logger.error('elasticsearch HTTP error. skipping document '+name)
                return False
            except (ConnectionError,Timeout) as ex:
                if attempts>100 and self.runMode:
                    # Bare raise preserves the original traceback.
                    raise
                self.logger.error('elasticsearch connection error. retry.')
                if self.stopping:return False
                time.sleep(0.1)
                # Re-resolve via DNS load balancing and reconnect.
                ip_url=getURLwithIP(self.es_server_url)
                self.es = ElasticSearch(ip_url)
# Example #6 (original separator: "Пример #6")
# 0
    my_mapping = mappings.central_runindex_mapping
# Pick the settings/mapping pair matching the index name prefix.
if index_name.startswith('boxinfo'):
    # Fix: the original had a trailing comma after central_es_settings,
    # which silently made my_settings a one-element tuple instead of the
    # settings dict, producing an invalid create_index payload.
    my_settings = mappings.central_es_settings
    my_mapping = mappings.central_boxinfo_mapping
if index_name.startswith('hltdlogs'):
    my_settings = mappings.central_es_settings_hltlogs
    my_mapping = mappings.central_hltdlogs_mapping

#alias convention
alias_write = index_name + "_write"
alias_read = index_name + "_read"

if command == 'create':
    es.create_index(index_name,
                    settings={
                        'settings': my_settings,
                        'mappings': my_mapping
                    })

if command == 'alias':

    try:
        target_index = sys.argv[4]
    except:
        target_index = index_name

    #check if alias exists
    status1 = requests.get(server_url + '/_alias/' + alias_write).status_code
    status2 = requests.get(server_url + '/_alias/' + alias_read).status_code
    aliases_settings = {"actions": []}
    if status1 != 200:
# Example #7 (original separator: "Пример #7")
# 0
# Select the settings/mapping pair by index-name prefix.
if index_name.startswith('runindex'):
  my_settings = mappings.central_es_settings_runindex
  my_mapping = mappings.central_runindex_mapping
if index_name.startswith('boxinfo'):
  # Fix: a stray trailing comma here turned my_settings into a 1-tuple,
  # so create_index received a tuple instead of the settings dict.
  my_settings = mappings.central_es_settings_boxinfo
  my_mapping = mappings.central_boxinfo_mapping
if index_name.startswith('hltdlogs'):
  my_settings = mappings.central_es_settings_hltlogs
  my_mapping = mappings.central_hltdlogs_mapping

#alias convention
alias_write=index_name+"_write"
alias_read=index_name+"_read"

if command=='create':
  es.create_index(index_name, settings={ 'settings': my_settings, 'mappings': my_mapping })

if command=='alias':

  try:
    target_index = sys.argv[4]
  except:
    target_index = index_name

  #check if alias exists
  alias_settings={}
  status1 = requests.get(server_url+'/_alias/'+alias_write).status_code
  status2 = requests.get(server_url+'/_alias/'+alias_read).status_code
  aliases_settings = { "actions": []}
  if status1!=200:
    alias_settings["actions"].append({"add": {"index": target_index, "alias": alias_write}})