def create_es_index(index_name):
    es = Elasticsearch()
    client = IndicesClient(es)
    # take this opportunity to create training index if it doesn't exist
    if not client.exists('appcompat-training'):
        client.create(index='appcompat-training', body=CONFIG)
    if client.exists(index_name):
        raise Exception('Index already exists: {}'.format(index_name))
    client.create(index=index_name, body=CONFIG)
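A minimal usage sketch for create_es_index above, assuming the module-level CONFIG body it references; the index name below is illustrative only, not from the original:

try:
    # create_es_index() raises if the target index already exists
    create_es_index('appcompat-host-snapshot')  # hypothetical index name
except Exception as err:
    print('Skipping creation: {}'.format(err))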
def _create_index(self):
    es_index = IndicesClient(self._es)
    if es_index.exists(self._store_index):
        logging.info('Index ' + self._store_index + ' already exists. Skipping index creation.')
        return None

    es_mapping = {
        "mappings": {
            'last_runtime': {
                'properties': {
                    'plugin_name': {'index': 'not_analyzed', 'type': 'string'},
                    'rule_name': {'index': 'not_analyzed', 'type': 'string'},
                    'plugin_sid': {'index': 'not_analyzed', 'type': 'long'},
                    '@timestamp': {'format': 'dateOptionalTime||epoch_millis', 'type': 'date'}
                }
            }
        }
    }
    self._es.indices.create(self._store_index, body=es_mapping)

    time.sleep(1)
class TestSingleDocSigTerms(TestCase):

    def setUp(self):
        super(TestSingleDocSigTerms, self).setUp()
        self.es = Elasticsearch(hosts=['localhost:%d' % es_runner.es_state.port])
        self.ic = IndicesClient(self.es)
        self.index = 'single_doc_sigterms_test'
        self.doc_type = 'test-doc'
        self.field = 'text'
        if self.ic.exists(self.index):
            self.ic.delete(self.index)
        self.ic.create(self.index)
        self.es.create(self.index, self.doc_type, {self.field: 'foo ba knark foo knirk knark foo'}, id='doc_1')

    def test_tf_for_doc_id(self):
        sigterms = SingleDocSigTerms(self.es, self.index, self.doc_type, self.field, None)
        resp = dict(sigterms.tf_for_doc_id('doc_1'))
        self.assertEquals(4, len(resp))
        self.assertEquals(3, resp['foo'])
        self.assertEquals(2, resp['knark'])
        self.assertEquals(1, resp['ba'])
        self.assertEquals(1, resp['knirk'])
class IndexBase:

    def __init__(self, **kwargs):
        self.index = kwargs.pop('index')
        self.client = client_es
        self.client_index = IndicesClient(self.client)
        if kwargs.get('settings'):
            self.settings = kwargs.pop('settings')
        else:
            self.settings = DEFAULT_SETTINGS
        if self.exist_index():
            self.delete_index()
            self.create_index()
        else:
            self.create_index()

    def exist_index(self):
        return self.client_index.exists(index=self.index)

    def delete_index(self):
        return self.client_index.delete(index=self.index, ignore=[400, 404])

    def create_index(self):
        return self.client_index.create(index=self.index, body=self.settings)
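A hedged usage sketch for IndexBase, assuming client_es and DEFAULT_SETTINGS are defined in the surrounding module as the constructor implies; the index name and settings body are illustrative only:

# Drops and recreates 'articles' with the module defaults...
idx = IndexBase(index='articles')
# ...or with an explicit settings body passed through to IndicesClient.create().
idx = IndexBase(index='articles',
                settings={'settings': {'number_of_shards': 1, 'number_of_replicas': 0}})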
def handle(self, *args, **options):
    es = Elasticsearch(hosts=[{'host': 'localhost', 'port': 9200}])
    fop = open('spider/management/commands/' + str(argv[2]), 'r')
    inds = IndicesClient(es)
    mapping = {
        "mappings": {
            "product_type": {
                "properties": {
                    "code": {"type": "string"},
                    "name": {"type": "string"},
                    "img": {"type": "string"},
                    "url": {"type": "string"},
                    "price_reg": {"type": "float"},
                    "price_discount": {"type": "float"}
                }
            }
        }
    }
    if not inds.exists(index='gearbest_index'):
        inds.create(index='gearbest_index', body=mapping)
        print 'gearbest_index created'
    for jsonline in fop:
        jobj = loads(jsonline)
        del jobj["_type"]
        es.index(index="gearbest_index", doc_type='product_type', body=jobj, id=jobj['code'])
        disc = 0
        reg = 0
        if len(jobj['price_discount']) > 0:
            disc = float(jobj['price_discount'][0])
        if len(jobj['price_reg']) > 0:
            reg = float(jobj['price_reg'][0])
        #insert="INSERT into 'price_gb' ('price','price_disc','code','date') values ("+str(reg)+", "+str(disc)+", '"+str(jobj['code'])+"', '"+str(datetime.today())+"')"
        #cursor = connection.cursor()
        #cursor.execute(insert)
        add_price = Price_gb(price=reg, price_disc=disc, code=str(jobj['code']), date=datetime.date.today())
        add_price.save()
        print 'code=' + str(jobj['code'])
def import_examples_into_es(examples: list):
    index_name = config.index_name
    type_name = config.type_name
    buck_size = config.buck_size
    es = Elasticsearch(config.es_url)
    es_index = IndicesClient(es)
    if es_index.exists(index=index_name):
        es_index.delete(index=index_name)
    # create the index
    with open(config.es_index_json) as f:
        mappings = json.load(f)
    res = es.indices.create(index=index_name, body=mappings)
    # bulk-import the data into ES
    for i in range(len(examples)):
        examples[i] = {
            "_index": index_name,
            "_type": type_name,
            "_id": examples[i]["ntc_id"],
            "_source": examples[i]
        }
    for i in tqdm(range(ceil(len(examples) / buck_size)), desc="Import into ES"):
        bulk(es, actions=examples[i * buck_size: min((i + 1) * buck_size, len(examples))])
def create_index_conf():
    indices_client = IndicesClient(models.client)
    index_name = 'conf'
    doc_type = index_name
    if indices_client.exists(index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index=index_name)
def create_index_survey():
    indices_client = IndicesClient(models.client)
    index_name = models.SurveyMap._meta.es_index_name
    if indices_client.exists(index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index=index_name)
    #put_settings(models.ScentemotionMap)
    # add qstfld fields
    es_mapping = models.SurveyMap._meta.es_mapping
    for qst, mapping in survey.qst2fld.items():
        fields = mapping[0]
        field_type = mapping[1]
        if field_type == 'nested_qst_ans':
            for field in fields:
                if field not in es_mapping['properties']:
                    es_mapping['properties'][field] = {}
                    es_mapping['properties'][field]['type'] = 'nested'
                    es_mapping['properties'][field]['properties'] = {}
                    es_mapping['properties'][field]['properties']['question'] = {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}
                    es_mapping['properties'][field]['properties']['answer'] = {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}
                    #'type' : 'nested',
                    #'properties' : {
                    #    'question' : {'type' : 'text', 'fields' : {'keyword' : {'type' : 'keyword', 'ignore_above' : 256}}},
                    #    'answer' : {'type' : 'text', 'fields' : {'keyword' : {'type' : 'keyword', 'ignore_above' : 256}}},
                    #    }
                    #},
    indices_client.put_mapping(
        doc_type=models.SurveyMap._meta.es_type_name,
        #body=models.SurveyMap._meta.es_mapping,
        body=es_mapping,
        index=index_name
    )
def _reset_mapping(self, mapping_path):
    esi = IndicesClient(es.get_es_handle())
    index = settings.ES_INDEX
    if not esi.exists(index):
        raise CommandError("Non existing index : %s" % index)
    self.stdout.write(str(esi.delete(index=index)))
def create_index_excel(excel_filename):
    indices_client = IndicesClient(models.client)
    index_name = 'excel'
    if len(excel_filename):
        doc_type = os.path.splitext(excel_filename)[0]
        index_name = 'excel_' + doc_type
    if indices_client.exists(index_name):
        indices_client.delete(index=index_name)
def recreate_index(self, index_name, index_mapping):
    indices_client = IndicesClient(client=ES_CLIENT)
    if indices_client.exists(index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index=index_name)
    indices_client.put_mapping(doc_type='page', index=index_name, body=index_mapping)
def create_index():
    indices_client = IndicesClient(client=settings.ES)
    index_name = Apartments._meta.es_index_name
    if not indices_client.exists(index_name):
        indices_client.create(index=index_name)
        indices_client.put_mapping(doc_type=Apartments._meta.es_type_name, body=Apartments._meta.es_mapping, index=index_name)
def create_index_mi():
    indices_client = IndicesClient(models.client)
    index_name = models.PostMap._meta.es_index_name
    if indices_client.exists(index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index=index_name)
    indices_client.put_mapping(body=models.PostMap._meta.es_mapping, index=index_name)
def _create_main_index_if_not_exists(self):
    """Create the main Elasticsearch index if it does not already exist."""
    ic = IndicesClient(self.es)
    if not ic.exists(MAIN_INDEX_NAME):
        ic.create(MAIN_INDEX_NAME)
def create_index_if_not_exists(self):
    """Check if the index exists; if it does not, create the index and its types and store their mappings."""
    ic = IndicesClient(self.es)
    response = ic.exists(index=[self.index_name])
    if not response:
        es_mappings = ElasticSearchController.get_index_mapper_dict()
        index_response = ic.create(index=self.index_name, body={"mappings": es_mappings})
def remover_indice(nome_indice):
    """Remove the index from Elasticsearch.

    An Elasticsearch index is analogous to a table in an RDBMS.
    """
    es = conectar_em_elastic_search()
    client_indice = IndicesClient(es)
    if client_indice.exists(index=[nome_indice]):
        client_indice.delete(nome_indice)
def recreate_index(self):
    indices_client = IndicesClient(client=settings.ES_CLIENT)
    index_name = Student._meta.es_index_name
    if indices_client.exists(index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index=index_name)
    indices_client.put_mapping(doc_type=Student._meta.es_type_name, body=Student._meta.es_mapping, index=index_name)
def create_index_pi():
    # indices_client = IndicesClient(client=settings.ES_HOSTS)
    indices_client = IndicesClient(models.client)
    index_name = models.Review._meta.es_index_name
    if indices_client.exists(index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index=index_name)
    indices_client.put_mapping(body=models.Review._meta.es_mapping, index=index_name)
def create_index_bestmatch():
    indices_client = IndicesClient(models.client)
    index_name = models.bestmatchMap._meta.es_index_name
    if indices_client.exists(index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index=index_name)
    #put_settings(models.bestmatchMap)
    indices_client.put_mapping(body=models.bestmatchMap._meta.es_mapping, index=index_name)
def create_index_si_sites():
    indices_client = IndicesClient(models.client)
    index_name = models.PageMap._meta.es_index_name
    if indices_client.exists(index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index=index_name)
    indices_client.put_mapping(doc_type=models.PageMap._meta.es_type_name, body=models.PageMap._meta.es_mapping, index=index_name)
def create_index_dhk():
    indices_client = IndicesClient(models.client)
    index_name = 'recipes'
    if indices_client.exists(index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index=index_name)
    indices_client.put_mapping(
        # ES 7.0 does not support types anymore
        doc_type=index_name,
        body={'properties': wb_excel.recipes},
        index=index_name)
def initialize(self, idx):
    es_index, es_doctype = self.indexinfo(idx)
    self.logger.info("Initializing %s" % es_index)
    idx_client = IndicesClient(self.es)
    if idx_client.exists(es_index):
        idx_client.delete(es_index)
    idx_client.create(es_index)
    if idx == 'event':
        idx_client.put_mapping(doc_type=es_doctype, index=[es_index], body=event_mapping())
    self.logger.info("%s ready." % es_index)
def create_index_mi_feedly():
    indices_client = IndicesClient(models.client)
    index_name = models.FeedlyMap._meta.es_index_name
    if indices_client.exists(index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index=index_name)
    #put_settings(models.FeedlyMap)
    indices_client.put_mapping(doc_type=models.FeedlyMap._meta.es_type_name, body=models.FeedlyMap._meta.es_mapping, index=index_name)
def create_index_survey():
    indices_client = IndicesClient(models.client)
    index_name = models.SurveyMap._meta.es_index_name
    if indices_client.exists(index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index=index_name)
    #put_settings(models.ScentemotionMap)
    indices_client.put_mapping(doc_type=models.SurveyMap._meta.es_type_name, body=models.SurveyMap._meta.es_mapping, index=index_name)
def recreate_index(self):
    indices_client = IndicesClient(client=settings.ES_CLIENT)
    index_name = es_index_name
    if indices_client.exists(index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index=index_name, body=es_ind_settings)
    for model_name in es_models:
        indices_client.put_mapping(
            doc_type=model_es_indices[model_name]['type'],
            body=es_mappings[model_name],
            index=es_index_name)
def __createIndex(self):
    es = Elasticsearch([{'host': self.elasticsearch_host, 'port': self.elasticsearch_port}])
    ic = IndicesClient(es)
    if ic.exists(index='wow'):
        print("deleting old index")
        self.deleteIndex()
    ic.create(index='wow')
    # blah = glob.glob(os.path.join(self.map_directory, '*'))
    for currentFile in glob.glob(os.path.join(self.map_directory, '*')):
        print("MAP FILE: " + currentFile)
        self.__mapFile(currentFile)
def _remove_index_if_exists():
    es = elasticsearch.Elasticsearch()
    from elasticsearch.client import IndicesClient
    es_index = IndicesClient(es)
    if es_index.exists(STORAGE_INDEX_NAME):
        logger.info("Elasticsearch index '{0}' already exists and "
                    "will be deleted".format(STORAGE_INDEX_NAME))
        try:
            es_index.delete(STORAGE_INDEX_NAME)
            logger.info('Verifying Elasticsearch index was deleted...')
            deadline = time.time() + 45
            while es_index.exists(STORAGE_INDEX_NAME):
                if time.time() > deadline:
                    raise RuntimeError(
                        'Elasticsearch index was not deleted after '
                        '30 seconds')
                time.sleep(0.5)
        except BaseException as e:
            logger.warn('Ignoring caught exception on Elasticsearch delete'
                        ' index - {0}: {1}'.format(e.__class__, e.message))
def status(self):
    idx_client = IndicesClient(self.es)
    for idx in ['raw-article', 'enhanced-article']:
        es_index = self.indexinfo(idx)[0]
        if idx_client.exists(es_index):
            self.logger.info("%s contains %s documents." % (idx, self.es.count(index=es_index)['count']))
            if idx == 'article':
                query = {"query": {"term": {"status": 1}}}
                self.logger.info(
                    "%s articles have been processed." % self.es.count(index=es_index, body=query)['count'])
        else:
            self.logger.info("%s does not exist" % es_index)
def recreate_index(self):
    indices_client = IndicesClient(client=settings.ES_CLIENT)
    index_name = self.es_index_name
    if indices_client.exists(index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index=index_name, body=self.es_ind_settings)
    ## create mapping for one model only for now
    model_name = 'place'
    indices_client.put_mapping(
        doc_type=model_es_indices[model_name]['type'],
        body=es_mappings[model_name],
        index=index_name)
def create_parcel_mapping():
    idx_client = IndicesClient(es)
    if not idx_client.exists(index=parcel_index):
        idx_client.create(index=parcel_index)
        with open('osc\util\mappings\parcel.json') as mapping_file:
            mapping = json.load(mapping_file)
            idx_client.put_mapping(doc_type=parcel_mapping, index=[parcel_index], body=mapping)
def recreateIndex(self):
    """Recreate the index in Elasticsearch."""
    print("deleting the previous index and creating the new one...")
    indices_client = IndicesClient(client=settings.ES_CLIENT)
    index_name = Product._meta.es_index_name
    type_type = Product._meta.es_type_name
    if indices_client.exists(index=index_name):
        indices_client.delete(index=index_name)
    indices_client.create(index_name)
    indices_client.put_mapping(doc_type=Product._meta.es_type_name, body=Product._meta.es_mapping, index=index_name)
def remove_log_indices():
    es = elasticsearch.Elasticsearch()
    from elasticsearch.client import IndicesClient
    es_index = IndicesClient(es)
    log_index_pattern = '{0}*'.format(LOG_INDICES_PREFIX)
    if es_index.exists(log_index_pattern):
        logger.info("Elasticsearch indices '{0}' already exist and "
                    "will be deleted".format(log_index_pattern))
        try:
            es_index.delete(log_index_pattern)
            logger.info('Verifying Elasticsearch index was deleted...')
            deadline = time.time() + 45
            while es_index.exists(log_index_pattern):
                if time.time() > deadline:
                    raise RuntimeError(
                        'Elasticsearch index was not deleted after '
                        '30 seconds')
                time.sleep(0.5)
        except BaseException as e:
            logger.warn('Ignoring caught exception on Elasticsearch delete'
                        ' index - {0}: {1}'.format(e.__class__, e.message))
def get_index(self, index=INDEX):
    """
    Get the index status.

    :param index: name of the index to check
    :return: True or False
    """
    try:
        indexcli = IndicesClient(self.es)
        index_status = indexcli.exists(index=index)
        # LOG.info('Get index status successful, index status is {}'.format(index_status))
        return index_status
    except Exception as e:
        LOG.error('Get index status failed, cause {}'.format(e))
def createIndex(es, indexName, settingsFile, mappingFile, delete=False):
    iclient = IndicesClient(es)
    # If specified, delete any existing index with the same name
    if delete and iclient.exists(indexName):
        iclient.delete(indexName)
    # else only create it if it does not exist
    if not iclient.exists(indexName):
        # Load the settings and mapping
        f = open(settingsFile)
        settings = json.load(f)
        f.close()
        f = open(mappingFile)
        mapping = json.load(f)
        f.close()
        # Create the index with the settings and mapping
        iclient.create(indexName, {
            'settings': settings,
            'mappings': mapping
        })
class RedisEsSetupMixin(object):

    def setUp(self):
        self.settings = TEST_SETTINGS_OBJECT
        self.es = get_es(self.settings)
        self.esi = IndicesClient(self.es)
        self.index = self.settings.get("ES_INDEX")

        # create the index first
        if self.esi.exists(self.index):
            self.esi.delete(index=self.index)
        self.esi.create(index=self.index)

        mapping_path = os.path.join(SCRAPY_ROOT, "resources/mappings.json")
        mapping_str = open(mapping_path, "r").read()
        mappings = json.loads(mapping_str)
        for k, v in mappings.iteritems():
            res = self.esi.put_mapping(self.index, k, {k: mappings[k]})
            #print res

        self.redis_conn = get_redis(self.settings)

    def tearDown(self):
        if self.esi.exists(self.index):
            self.esi.delete(index=self.index)
            print "ES INDEX DELETED"
        # remove redis stuff
        self.redis_conn.flushdb()
        print "REDIS DB DELETED"
def initialize(self, conf, context):
    host = conf.get('zeit.recommend.elasticsearch.host', 'localhost')
    port = conf.get('zeit.recommend.elasticsearch.port', 9200)
    self.es = Elasticsearch(hosts=[{'host': host, 'port': port}])
    self.match = re.compile('seite-[0-9]|komplettansicht').match
    self.index = '%s-%s' % date.today().isocalendar()[:2]
    ic = IndicesClient(self.es)
    try:
        if not ic.exists(self.index):
            ic.create(self.index)
    except ConnectionError, e:
        log('[UserIndexBolt] ConnectionError, index unreachable: %s' % e)
        return
def _create_weight_index(es, index):
    """
    Creates the index with the right mapping if it doesn't exist.

    :param es:
    :type es: elasticsearch.Elasticsearch
    :param index:
    :type index: str|unicode
    """
    ic = IndicesClient(es)
    if ic.exists(index):
        logging.info('Index %s already exists ...' % index)
    else:
        ic.create(index=index, body=ES_TERMWEIGHTING_INDEX_SETTINGS)
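A hedged usage sketch for _create_weight_index, assuming a node on localhost:9200 and the ES_TERMWEIGHTING_INDEX_SETTINGS body referenced above; the index name is illustrative only:

from elasticsearch import Elasticsearch

es = Elasticsearch(hosts=['localhost:9200'])  # assumed local node
_create_weight_index(es, 'termweights')       # logs and skips if the index already exists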
def import_ontology(ontology: lib.obo.Ontology, index_name: str):
    es = elasticsearch.Elasticsearch()
    ies = IndicesClient(es)
    actions = [dict(
        _index=index_name,
        _type=index_name,
        _source=dict(
            id=item.id,
            names=item.names()
        )
    ) for item in ontology.items()]
    if ies.exists(index_name):
        ies.delete(index_name)
    ies.create(index_name)
    return bulk(es, actions=actions)
def _init_mapping(self, mapping_path):
    esi = IndicesClient(es.get_es_handle())
    index = settings.ES_INDEX

    # first create index if it does not exist
    if not esi.exists(index):
        self.stdout.write("Creating index for db : %s" % index)
        esi.create(index=index)
        self.stdout.write("Index Created for : %s" % index)

    if not mapping_path or not os.path.exists(mapping_path):
        raise CommandError("not existing mapping path")

    mapping_str = open(mapping_path, "r").read()
    mappings = json.loads(mapping_str)

    for k, v in mappings.iteritems():
        res = esi.put_mapping(index, k, {k: mappings[k]})
        self.stdout.write(str(res))
def setup(forced): properties = {} properties["fail_symptom"] = {"type" : "string", "index": "not_analyzed"} properties["ats_log"] = {"type" : "string"} properties["file_path"] = {"type" : "string", "analyzer": "path-analyzer"} add_unique_mapping(properties, "Test Start Time", {"VALUE" : {"type" : "date", "format": "yyyy/MM/dd HH:mm:ssZ||yyyy/MM/ddZ"}}) add_unique_mapping(properties, "Test end Time", {"VALUE" : {"type" : "date", "format": "yyyy/MM/dd HH:mm:ssZ||yyyy/MM/ddZ"}}) es = Elasticsearch([{'host': 'localhost', 'port': 9200}], max_retries=10, retry_on_timeout=True) idx_client = IndicesClient(es) if (idx_client.exists(index=PROJECT)): if (forced): idx_client.delete(index=PROJECT) else : print "Index already exists!" return runin_csv_status = {"runin_csv_status" : {"path_match": "RunInLog.*.STATUS", "mapping": {"index": "not_analyzed"}}} runin_csv_value = {"runin_csv_value" : {"path_match": "RunInLog.*.VALUE", "mapping": {"index": "not_analyzed", "fields" : {"double" : {"type" : "double"}}}}} runin_csv_u_limit = {"runin_csv_u_limit" : {"path_match": "RunInLog.*.U_LIMIT", "mapping": {"index": "not_analyzed", "fields" : {"double" : {"type" : "double"}}}}} runin_csv_l_limit = {"runin_csv_l_limit" : {"path_match": "RunInLog.*.L_LIMIT", "mapping": {"index": "not_analyzed", "fields" : {"double" : {"type" : "double"}}}}} runin_csv_test_time = {"runin_csv_test_time" : {"path_match": "RunInLog.*.TEST_TIME", "mapping": {"index": "not_analyzed", "fields" : {"double" : {"type" : "double"}}}}} csv_status = {"csv_status" : {"path_match": "*.STATUS", "mapping": {"index": "not_analyzed"}}} csv_value = {"csv_value" : {"path_match": "*.VALUE", "mapping": {"index": "not_analyzed", "fields" : {"double" : {"type" : "double"}}}}} csv_u_limit = {"csv_u_limit" : {"path_match": "*.U_LIMIT", "mapping": {"index": "not_analyzed", "fields" : {"double" : {"type" : "double"}}}}} csv_l_limit = {"csv_l_limit" : {"path_match": "*.L_LIMIT", "mapping": {"index": "not_analyzed", "fields" : {"double" : {"type" : "double"}}}}} csv_test_time = {"csv_test_time" : {"path_match": "*.TEST_TIME", "mapping": {"index": "not_analyzed", "fields" : {"double" : {"type" : "double"}}}}} dynamic_templates = [runin_csv_status, runin_csv_value, runin_csv_u_limit, runin_csv_l_limit, runin_csv_test_time, csv_status, csv_value, csv_u_limit, csv_l_limit, csv_test_time] analysis = {} analysis["analyzer"] = {} analysis["tokenizer"] = {} analysis["analyzer"]["path-analyzer"] = {"type": "custom", "tokenizer": "path-tokenizer"} analysis["tokenizer"]["path-tokenizer"] = {"type": "path_hierarchy"} mappings = {"dynamic_templates" : dynamic_templates, "properties" : properties} data = {"settings" : {"index.mapping.ignore_malformed": True, "number_of_replicas": 1, "analysis": analysis}, "mappings" : {STAGE: mappings}} print json.dumps(data) idx_client.create(index=PROJECT, body=data)
class ElasticSearchEngine(object): ''' ElasticSearch Engine. ''' # make it compatible with services LOAD_PRIORITY = 30 def __init__(self, index, host=None, port=None): '''Only one host for now.''' if not es_installed: raise ValueError('elasticsearch not installed') assert(index.isalpha()) self.init_state(index, host, port) def init_state(self, index, host, port): self._queue = [] self.index = index self.host = host self.port = port if host is None: self.es = Elasticsearch() else: self.es = Elasticsearch(hosts=[{'host': host, 'port': port}]) self.idx_manager = IndicesClient(self.es) self.mapper = ESQueryMapper() # be persistence friendly def __getstate__(self): return (self.index, self.host, self.port) def __setstate__(self, state): self.init_state(*state) def _index(self, document, update=False): # for efficiency, nothing is executed yet, # we prepare and queue the operation cursor = IndexCursor(self.index) document.save(cursor, update) cursor.enqueue(self._queue) def add_document(self, document): ''' Add a document to the data store, in index (a.k.a. collection), under the document type. ''' self._index(document) def delete_document(self, schema, docid): ''' Remove document from index and storage. ''' op = { '_op_type': 'delete', '_index': self.index, '_type': schema.type_name, '_id': docid } self._queue.append(op) def update_document(self, document): '''Update document (partial update from delta document)''' self._index(document, True) def commit(self, sync=False): ''' If ``sync``, index synchronously, else let Elasticsearch manage its index. ''' helpers.bulk(self.es, self._queue) if sync: self.idx_manager.refresh(self.index) self._queue = [] def cancel(self): ''' Forget operation scheduled since last commit''' self._queue = [] def search(self, query, size=20): ''' Search the database. ''' index_cursor = IndexCursor(self.index, self.es.search) return query.search(index_cursor, self.mapper, size) def delete_collection(self): if self.idx_manager.exists(self.index): self.idx_manager.delete(index=self.index) def create_collection(self, schemas): ''' Init the collections the first time. Just use once! Or you'll have to reindex all your documents. `schemas` is a list of Document classes or Schema instances. ''' idx_manager = self.idx_manager if idx_manager.exists(self.index): idx_manager.delete(index=self.index) mapper = ESSchemaMapper(idx_manager) for schema in schemas: schema.map(mapper) mapper.create(self.index)
class IndicesManager(object): def __init__(self, options=None): self.options = options or {} self.es = get_elasticsearch(self.options) self.esc = IndicesClient(self.es) self.conf_dir = sys.path[0] def __create__(self, name, config=None, type=None): result = None try: if not config: file_name = "{}/config/{}_index.json".format( self.conf_dir, type) with open(file_name) as fp: config = fp.read() # create the index with version number result = self.esc.create(index=name, body=config) except es_exceptions.TransportError: print("unable to connect to Elasticsearch") return result def create(self, doc_type): alias_name = 'frisc_{}'.format(doc_type) index_name = '{}_v1'.format(alias_name) try: if self.esc.exists_alias(alias_name): print('Index {} already existst, updating'.format(alias_name)) self.update(doc_type) return self.__create__(index_name, type=doc_type) # set an alias to the index self.esc.put_alias(index=index_name, name=alias_name) except es_exceptions.TransportError: print("unable to connect to Elasticsearch") def update(self, doc_type): alias_name = 'frisc_{}'.format(doc_type) index_name = '{}_v1'.format(alias_name), try: if not self.esc.exists_alias(alias_name): self.create(doc_type) return version_number = 0 old_index_name = '' old_indexes = self.esc.get_alias(name=alias_name) for index in old_indexes.keys(): match = re.search('^({})_v(\d+)$'.format(alias_name), index) if match: version = int(match.group(2)) if version > version_number: version_number = version old_index_name = match.group(0) version_number += 1 index_name = '{}_v{}'.format(alias_name, version_number) if self.esc.exists(index_name): # raise soemthing raise self.__create__(index_name, type=doc_type) reindex(self.es, old_index_name, index_name) self.esc.update_aliases( body={'actions': [ {'remove': {'alias': alias_name, 'index': old_index_name}}, {'add': {'alias': alias_name, 'index': index_name}} ]} ) except es_exceptions.TransportError: print("unable to connect to Elasticsearch")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--host', help='Elasticsearch host')
    parser.add_argument('--port', type=int, help='Elasticsearch port')
    parser.add_argument('--url-prefix', help='Elasticsearch URL prefix')
    parser.add_argument('--no-auth', action='store_const', const=True, help='Suppress prompt for basic auth')
    parser.add_argument('--ssl', action='store_true', default=None, help='Use TLS')
    parser.add_argument('--no-ssl', dest='ssl', action='store_false', help='Do not use TLS')
    parser.add_argument('--verify-certs', action='store_true', default=None, help='Verify TLS certificates')
    parser.add_argument('--no-verify-certs', dest='verify_certs', action='store_false', help='Do not verify TLS certificates')
    parser.add_argument('--index', help='Index name to create')
    parser.add_argument('--old-index', help='Old index name to copy')
    parser.add_argument('--send_get_body_as', default='GET', help='Method for querying Elasticsearch - POST, GET or source')
    parser.add_argument('--boto-profile', default=None, help='Boto profile to use for signing requests')
    parser.add_argument('--aws-region', default=None, help='AWS Region to use for signing requests')
    args = parser.parse_args()

    if os.path.isfile('../config.yaml'):
        filename = '../config.yaml'
    elif os.path.isfile('config.yaml'):
        filename = 'config.yaml'
    else:
        filename = ''

    if filename:
        with open(filename) as config_file:
            data = yaml.load(config_file)
        host = args.host if args.host else data.get('es_host')
        port = args.port if args.port else data.get('es_port')
        username = data.get('es_username')
        password = data.get('es_password')
        url_prefix = args.url_prefix if args.url_prefix is not None else data.get('es_url_prefix', '')
        use_ssl = args.ssl if args.ssl is not None else data.get('use_ssl')
        verify_certs = args.verify_certs if args.verify_certs is not None else data.get('verify_certs') is not False
        aws_region = data.get('aws_region', None)
        send_get_body_as = data.get('send_get_body_as', 'GET')
    else:
        username = None
        password = None
        aws_region = args.aws_region
        host = args.host if args.host else raw_input('Enter elasticsearch host: ')
        port = args.port if args.port else int(raw_input('Enter elasticsearch port: '))
        use_ssl = (args.ssl if args.ssl is not None
                   else raw_input('Use SSL? t/f: ').lower() in ('t', 'true'))
        if use_ssl:
            verify_certs = (args.verify_certs if args.verify_certs is not None
                            else raw_input('Verify TLS certificates? t/f: ').lower() not in ('f', 'false'))
        else:
            verify_certs = True
        if args.no_auth is None:
            username = raw_input('Enter optional basic-auth username (or leave blank): ')
            password = getpass.getpass('Enter optional basic-auth password (or leave blank): ')
        url_prefix = (args.url_prefix if args.url_prefix is not None
                      else raw_input('Enter optional Elasticsearch URL prefix (prepends a string to the URL of every request): '))
        send_get_body_as = args.send_get_body_as

    auth = Auth()
    http_auth = auth(host=host, username=username, password=password, aws_region=aws_region, boto_profile=args.boto_profile)
    es = Elasticsearch(
        host=host,
        port=port,
        use_ssl=use_ssl,
        verify_certs=verify_certs,
        connection_class=RequestsHttpConnection,
        http_auth=http_auth,
        url_prefix=url_prefix,
        send_get_body_as=send_get_body_as)

    silence_mapping = {'silence': {'properties': {'rule_name': {'index': 'not_analyzed', 'type': 'string'},
                                                  'until': {'type': 'date', 'format': 'dateOptionalTime'},
                                                  '@timestamp': {'format': 'dateOptionalTime', 'type': 'date'}}}}
    ess_mapping = {'elastalert_status': {'properties': {'rule_name': {'index': 'not_analyzed', 'type': 'string'},
                                                        '@timestamp': {'format': 'dateOptionalTime', 'type': 'date'}}}}
    es_mapping = {'elastalert': {'properties': {'rule_name': {'index': 'not_analyzed', 'type': 'string'},
                                                '@timestamp': {'format': 'dateOptionalTime', 'type': 'date'},
                                                'alert_time': {'format': 'dateOptionalTime', 'type': 'date'},
                                                'match_body': {'enabled': False, 'type': 'object'},
                                                'aggregate_id': {'index': 'not_analyzed', 'type': 'string'}}}}
    past_mapping = {'past_elastalert': {'properties': {'rule_name': {'index': 'not_analyzed', 'type': 'string'},
                                                       'match_body': {'enabled': False, 'type': 'object'},
                                                       '@timestamp': {'format': 'dateOptionalTime', 'type': 'date'},
                                                       'aggregate_id': {'index': 'not_analyzed', 'type': 'string'}}}}
    error_mapping = {'elastalert_error': {'properties': {'data': {'type': 'object', 'enabled': False},
                                                         '@timestamp': {'format': 'dateOptionalTime', 'type': 'date'}}}}

    index = args.index if args.index is not None else raw_input('New index name? (Default elastalert_status) ')
    if not index:
        index = 'elastalert_status'
    old_index = (args.old_index if args.old_index is not None
                 else raw_input('Name of existing index to copy? (Default None) '))

    es_index = IndicesClient(es)
    if es_index.exists(index):
        print('Index ' + index + ' already exists. Skipping index creation.')
        return None

    es.indices.create(index)
    # To avoid a race condition. TODO: replace this with a real check
    time.sleep(2)
    es.indices.put_mapping(index=index, doc_type='elastalert', body=es_mapping)
    es.indices.put_mapping(index=index, doc_type='elastalert_status', body=ess_mapping)
    es.indices.put_mapping(index=index, doc_type='silence', body=silence_mapping)
    es.indices.put_mapping(index=index, doc_type='elastalert_error', body=error_mapping)
    es.indices.put_mapping(index=index, doc_type='past_elastalert', body=past_mapping)
    print('New index %s created' % index)

    if old_index:
        print("Copying all data from old index '{0}' to new index '{1}'".format(old_index, index))
        # Use the defaults for chunk_size, scroll, scan_kwargs, and bulk_kwargs
        elasticsearch.helpers.reindex(es, old_index, index)
    print('Done!')
def main(index_num): n_out = int(10e6) n_batch = int(4e3) n_batches = n_out // n_batch index = 'image_hashes_%02d' % index_num client = Elasticsearch('localhost:9200') index_client = IndicesClient(client) if index_client.exists(index): print('Not deleting %s!' % index); return; sys.exit(1) index_client.delete(index) es_short = { 'type': 'short', } field_name = lambda i: '%x' % i fields = {field_name(i): es_short for i in range(n_samples)} fields['raw'] = { 'type': 'string', 'store': True, 'index': 'not_analyzed', 'doc_values': True } index_client.create(index=index, body={ 'settings': { 'number_of_shards': 4, 'number_of_replicas': 0 }, 'mappings': { 'images': { '_source': {'enabled': False}, 'properties': fields } } }) sampler, pow2 = get_sampler(n_samples, b_p_sample) start_time = time.time() for i_batch in range(1, n_batches+1): data = np.random.randn(n_batch, dim_in) hash = (data.dot(proj) > 0).astype(np.uint64) hash_int = hash.dot(2**np.arange(dim_out).astype(np.uint64)) #print('\n'.join(repr(i.astype(np.uint8)) for i in hash)); return sampled = np.vstack( hash.dot(sampler[:,:,j]).dot(pow2) for j in range(n_samples) ).astype(np.int16).T.tolist() #print(repr(sampled)); print(repr([len(sampled), len(sampled[0])])); return docs = [] for i in range(n_batch): doc = { field_name(j): sampled[i][j] for j in range(n_samples) } doc['raw'] = '{0:064b}'.format(hash_int[i]) doc_id = random.getrandbits(63) docs.append('{"index":{"_index": "%s", "_type": "images", "_id": "%d"}})' % (index, doc_id)) docs.append(json.dumps(doc)) #print(json.dumps(json.loads(docs[1]), indent=4)); return try: response = client.bulk(body='\n'.join(docs)) except: # Even when an exception is thrown typically documents were stored in ES sleep_seconds = 10 print('\rHTTP timed out, sleeping %d seconds...' % sleep_seconds) time.sleep(sleep_seconds) print('\rChunk %5d/%d, %5.2f%%' % (i_batch, n_batches, i_batch*100.0/n_batches), end='') index_time = time.time() print('\nCalling optimize, indexing took %.1f s...' % (index_time - start_time)) sys.stdout.flush() index_client.optimize(index=index, max_num_segments=3, request_timeout=1e6) print('Optimization done in %.1f s' % (time.time() - index_time))
class ElasticSearchEngine(object): ''' ElasticSearch Engine. ''' def __init__(self, index, host=None, port=None): '''Only one host for now.''' assert(index.isalpha()) self.init_state(index, host, port) def init_state(self, index, host, port): self._queue = [] self.index = index self.host = host self.port = port if host is None: self.es = Elasticsearch() else: self.es = Elasticsearch(hosts=[{'host': host, 'port': port}]) self.idx_manager = IndicesClient(self.es) self.mapper = ESMapper() # be persistence friendly def __getstate__(self): return (self.index, self.host, self.port) def __setstate__(self, state): self.init_state(*state) def _index(self, document, update=False): # for efficiency, nothing is executed yet, # we prepare and queue the operation doc = 'doc' if update else '_source' op = { '_index': self.index, '_type': document.__class__.__name__, '_op_type': 'update' if update else 'create', '_id': document._id, doc: {k: getattr(document, k) for k in document.fields if getattr(document, k) is not None} } self._queue.append(op) def add_document(self, document): ''' Add a document to the data store, in index (a.k.a. collection), under the document type. ''' self._index(document) def delete_document(self, doctype, docid): ''' Remove document from index and storage. ''' op = { '_op_type': 'delete', '_index': self.index, '_type': doctype.__name__, '_id': docid } self._queue.append(op) def update_document(self, document): '''Update document (partial update from delta document)''' self._index(document, True) def commit(self, sync=False): ''' If ``sync``, index synchronously, else let Elasticsearch manage its index. ''' helpers.bulk(self.es, self._queue) if sync: self.idx_manager.refresh(self.index) self._queue = [] def cancel(self): ''' Forget operation scheduled since last commit''' self._queue = [] def search(self, query, size=20): ''' Search the database. ''' dsl = query(self.mapper) hits = self.es.search(index=self.index, doc_type=query.queried_doc.__name__, body={'query': dsl}, size=size) res = [ (h['_score'], query.queried_doc.delta(h['_id'], **h['_source'])) for h in hits['hits']['hits'] ] return res def delete_collection(self): if self.idx_manager.exists(self.index): self.idx_manager.delete(index=self.index) def create_collection(self, schema): ''' Init the collections the first time. Just use once! Or you'll have to reindex all your documents. Schema is a list of Document classes. ''' idx_manager = self.idx_manager if idx_manager.exists(self.index): idx_manager.delete(index=self.index) mappings = {} for doctype in schema: properties = {'_full': {"type": "string", "index_analyzer": "autocomplete", "search_analyzer": "standard"}} excludes = [] for name, ftype in doctype.fields.iteritems(): properties[name] = ESProperty(ftype) if not ftype.stored: excludes.append(name) mappings[doctype.__name__] = {'properties': properties, '_source': {"excludes": excludes}} settings = { "number_of_shards": 1, "analysis": { "filter": { "autocomplete_filter": { "type": "edge_ngram", "min_gram": 1, "max_gram": 20 } }, "analyzer": { "autocomplete": { "type": "custom", "tokenizer": "standard", "filter": [ "lowercase", "autocomplete_filter" ] } } } } body = {"mappings": mappings, "settings": settings} idx_manager.create(index=self.index, body=body)
def aggs_error_count(topic_name, group_name, app_name, ip, time_scope=1):
    index_list = []
    # Build the list of index names covered by the search window and keep only those that exist.
    indicesClient = IndicesClient(app.es)
    for count in range((int(time_scope)/24)+1):
        index_name = 'kafka_msg_log_' + time.strftime('%Y.%m.%d', time.localtime(time.time() - int(count)*24*60*60))
        if indicesClient.exists(index_name):
            index_list.append(index_name)
    if index_list.__len__() == 0:
        error_stat_result = {
            "xAxis": [],
            "send_error_list": [],
            "business_error_list": [],
            "success": "true",
            "group_name": group_name,
            "topic_name": topic_name,
            "app_name": app_name,
            "ip": ip
        }
        return json.dumps(error_stat_result, encoding='utf8', ensure_ascii=False, indent=2)

    start_time = "now-" + str(time_scope) + "h/h"
    range_dict = {"range": {"timestamp": {"gte": start_time, "lte": "now/h"}}}
    must_list = _assemble_must_terms(topic_name, group_name, app_name, ip)
    must_list.append(range_dict)

    res = app.es.search(
        index=index_list,
        body={
            "from": 0,
            "size": 10000,
            "query": {
                "bool": {
                    "must_not": {"missing": {"field": "etype"}},
                    "must": must_list
                }
            },
            "fields": "etype",
            "aggregations": {
                "aggs": {
                    "date_histogram": {
                        "field": "timestamp",
                        "interval": "10m",
                        "format": "yyyy-MM-dd HH:mm",
                        "time_zone": "+08:00",
                        "min_doc_count": 0
                    },
                    "aggregations": {
                        "etype": {
                            "terms": {"field": "etype", "min_doc_count": 0, "size": 10000},
                            "aggregations": {
                                "etype_count": {"value_count": {"field": "etype"}}
                            }
                        }
                    }
                }
            }
        }
    )

    xAxis = set([])
    error_stat_dict = {}
    for obj in res['aggregations']['aggs']['buckets']:
        date_time = obj['key_as_string']
        xAxis.add(date_time)  # add to the x-axis value list
        etype_count_aggs = obj['etype']['buckets']
        for etype_count_obj in etype_count_aggs:
            etype_count = etype_count_obj['etype_count']['value']
            etype = etype_count_obj['key']
            if date_time not in error_stat_dict:
                error_stat_dict[date_time] = [{"etype": etype, "count": etype_count}]
            else:
                temp_list = error_stat_dict[date_time]
                temp_list.append({"etype": etype, "count": etype_count})
                error_stat_dict[date_time] = temp_list
            # error_stat_result.append(error_stat_dict)
            logger.debug('etype:[' + str(etype) + '] datetime: [' + date_time + '] count: [' + str(etype_count) + ']')
    xAxis = sorted(xAxis)

    # send-error series
    send_error_list = []
    # business-error series
    business_error_list = []
    for x_date_time in xAxis:
        if x_date_time in error_stat_dict:
            temp_etype_dict_list = error_stat_dict[x_date_time]
            if temp_etype_dict_list:
                for etype_dict in temp_etype_dict_list:
                    if etype_dict['etype'] == 1:
                        send_error_list.append(etype_dict['count'])
                    else:
                        business_error_list.append(etype_dict['count'])
        else:
            send_error_list.append('0')
            business_error_list.append('0')

    error_stat_result = {
        "xAxis": xAxis,
        "send_error_list": send_error_list,
        "business_error_list": business_error_list,
        "success": "true",
        "group_name": group_name,
        "topic_name": topic_name,
        "app_name": app_name,
        "ip": ip
    }
    return json.dumps(error_stat_result, encoding='utf8', ensure_ascii=False, indent=2)
                'type': 'string',
                'store': 'yes',
                'index': 'not_analyzed'
            },
            'title': {'type': 'string'},
            'body': {'type': 'string'},
            'teaser': {'type': 'string'},
            'timestamp': {'type': 'date'}
        },
        '_id': {'path': 'path'}
    }
}

ic = IndicesClient(es)
if not ic.exists(index):
    ic.create(index)
if not ic.exists_type(index=index, doc_type='item'):
    ic.put_mapping(
        index=index,
        ignore_conflicts=True,
        doc_type='item',
        body=body
    )

while 1:
    try:
        main()
    except KeyboardInterrupt:
        raise SystemExit(0)
def main(argv): es_server_addr = 'localhost' input_location = os.path.abspath(".") input_subfolder = None output_location = None fail_location = os.path.abspath(DEFAULT_FAIL_PATH) doctype = ES_DOC_TYPE is_looping = False try: opts, args = getopt.getopt(argv, "i:s:o:f:t:c:l", ["input_location=", "input_subfolder=", "output_location=", "fail_location=", "doc_type=", "config=" "loop"]) except getopt.GetoptError: usage() sys.exit(2) if len(args) < 1: usage() sys.exit(2) elif len(args) > 1: es_server_addr = args[1] project = args[0] for opt, arg in opts: if opt in ("-i", "--input_location"): input_location = os.path.abspath(arg) elif opt in ("-s", "--input_subfolder"): input_subfolder = arg elif opt in ("-o", "--output_location"): output_location = os.path.abspath(arg) elif opt in ("-f", "--fail_location"): fail_location = os.path.abspath(arg) elif opt in ("-t", "--doc_type"): doctype = arg elif opt in ("-c", "--config"): (input_location, input_subfolder, output_location, fail_location, doctype) = parse_config(arg, input_location, input_subfolder, output_location, fail_location, doctype) elif opt in ("-l", "--loop"): is_looping = True logger = logging.getLogger('mla_logger') logger.setLevel(logging.INFO) fh = logging.FileHandler(LOG_PREFIX + "_" + project + "_" + time.strftime("%Y%m%d-%H%M%S") + LOG_POSTFIX) fh.setLevel(logging.DEBUG) ch = logging.StreamHandler() ch.setLevel(logging.DEBUG) formatter = logging.Formatter('[%(asctime)s] [%(levelname)8s] %(message)s') fh.setFormatter(formatter) logger.addHandler(fh) logger.addHandler(ch) logger.info("%s run with es_server_addr=%s, project=%s, input_location=%s, input_subfolder=%s, output_location=%s, fail_location=%s, doctype=%s, is_looping=%s" % (LOG_PREFIX, es_server_addr, project, input_location, input_subfolder, output_location, fail_location, doctype, is_looping)) print "Press Enter key to continue" a = raw_input() es = Elasticsearch([{'host': es_server_addr, 'port': 9200}], max_retries=10, retry_on_timeout=True) idx_client = IndicesClient(es) if (False == idx_client.exists(index=project)): logger.info("Index %s not exist, press enter key to create schema" % project) a = raw_input() mla_setup.schema_setup(es, project, False, logger) print "Press enter key to continue" a = raw_input() try: mla_import_loop(es, project, doctype, input_location, input_subfolder, output_location, fail_location, is_looping) except: msg = traceback.format_exc() print msg notify_mail(msg)
def create_index_mappings(es_client, ea_index, recreate=False, old_ea_index=None):
    esversion = es_client.info()["version"]["number"]
    print("Elastic Version: " + esversion)

    es_index_mappings = read_es_index_mappings() if is_atleastsix(esversion) else read_es_index_mappings(5)

    es_index = IndicesClient(es_client)
    if not recreate:
        if es_index.exists(ea_index):
            print('Index ' + ea_index + ' already exists. Skipping index creation.')
            return None

    # (Re-)Create indices.
    if is_atleastsix(esversion):
        index_names = (
            ea_index,
            ea_index + '_status',
            ea_index + '_silence',
            ea_index + '_error',
            ea_index + '_past',
        )
    else:
        index_names = (
            ea_index,
        )
    for index_name in index_names:
        if es_index.exists(index_name):
            print('Deleting index ' + index_name + '.')
            try:
                es_index.delete(index_name)
            except NotFoundError:
                # Why does this ever occur?? It shouldn't. But it does.
                pass
        es_index.create(index_name)

    # To avoid a race condition. TODO: replace this with a real check
    time.sleep(2)

    if is_atleastseven(esversion):
        # TODO remove doc_type completely when elasticsearch client allows doc_type=None
        # doc_type is a deprecated feature and will be completely removed in Elasticsearch 8
        es_client.indices.put_mapping(index=ea_index, doc_type='_doc',
                                      body=es_index_mappings['elastalert'], include_type_name=True)
        es_client.indices.put_mapping(index=ea_index + '_status', doc_type='_doc',
                                      body=es_index_mappings['elastalert_status'], include_type_name=True)
        es_client.indices.put_mapping(index=ea_index + '_silence', doc_type='_doc',
                                      body=es_index_mappings['silence'], include_type_name=True)
        es_client.indices.put_mapping(index=ea_index + '_error', doc_type='_doc',
                                      body=es_index_mappings['elastalert_error'], include_type_name=True)
        es_client.indices.put_mapping(index=ea_index + '_past', doc_type='_doc',
                                      body=es_index_mappings['past_elastalert'], include_type_name=True)
    elif is_atleastsixtwo(esversion):
        es_client.indices.put_mapping(index=ea_index, doc_type='_doc',
                                      body=es_index_mappings['elastalert'])
        es_client.indices.put_mapping(index=ea_index + '_status', doc_type='_doc',
                                      body=es_index_mappings['elastalert_status'])
        es_client.indices.put_mapping(index=ea_index + '_silence', doc_type='_doc',
                                      body=es_index_mappings['silence'])
        es_client.indices.put_mapping(index=ea_index + '_error', doc_type='_doc',
                                      body=es_index_mappings['elastalert_error'])
        es_client.indices.put_mapping(index=ea_index + '_past', doc_type='_doc',
                                      body=es_index_mappings['past_elastalert'])
    elif is_atleastsix(esversion):
        es_client.indices.put_mapping(index=ea_index, doc_type='elastalert',
                                      body=es_index_mappings['elastalert'])
        es_client.indices.put_mapping(index=ea_index + '_status', doc_type='elastalert_status',
                                      body=es_index_mappings['elastalert_status'])
        es_client.indices.put_mapping(index=ea_index + '_silence', doc_type='silence',
                                      body=es_index_mappings['silence'])
        es_client.indices.put_mapping(index=ea_index + '_error', doc_type='elastalert_error',
                                      body=es_index_mappings['elastalert_error'])
        es_client.indices.put_mapping(index=ea_index + '_past', doc_type='past_elastalert',
                                      body=es_index_mappings['past_elastalert'])
    else:
        es_client.indices.put_mapping(index=ea_index, doc_type='elastalert',
                                      body=es_index_mappings['elastalert'])
        es_client.indices.put_mapping(index=ea_index, doc_type='elastalert_status',
                                      body=es_index_mappings['elastalert_status'])
        es_client.indices.put_mapping(index=ea_index, doc_type='silence',
                                      body=es_index_mappings['silence'])
        es_client.indices.put_mapping(index=ea_index, doc_type='elastalert_error',
                                      body=es_index_mappings['elastalert_error'])
        es_client.indices.put_mapping(index=ea_index, doc_type='past_elastalert',
                                      body=es_index_mappings['past_elastalert'])

    print('New index %s created' % ea_index)

    if old_ea_index:
        print("Copying all data from old index '{0}' to new index '{1}'".format(old_ea_index, ea_index))
        # Use the defaults for chunk_size, scroll, scan_kwargs, and bulk_kwargs
        elasticsearch.helpers.reindex(es_client, old_ea_index, ea_index)

    print('Done!')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--src-host", action="store", default="127.0.0.1", type=unicode,
                        help="Source host [default: %(default)s]")
    parser.add_argument("--src-port", action="store", default=9200,
                        help="Source port [default: %(default)s]")
    parser.add_argument("--src-index", action="store", default="", type=unicode,
                        help="Source index")
    parser.add_argument("--src-batch-size", action="store", type=int, default=5000,
                        help="Source query batchsize [default: %(default)s]")
    parser.add_argument("--src-scroll-interval", action="store", type=unicode, default="60m",
                        help="Interval for source scroll query [default: %(default)s]")
    parser.add_argument("--dest-host", action="store", default="127.0.0.1", type=unicode,
                        help="Destination host [default: %(default)s]")
    parser.add_argument("--dest-port", action="store", default=9200,
                        help="Destination port [default: %(default)s]")
    parser.add_argument("--dest-index", action="store", default="", type=unicode,
                        help="Destination index")
    parser.add_argument("--dest-batch-size", action="store", type=int, default=5000,
                        help="Destination batchsize [default: %(default)s]")
    parser.add_argument("--dest-alias", action="store",
                        help="Destination index alias (to be set after we have finished populating)")
    parser.add_argument("--dest-concurrency", action="store", type=int, default=4,
                        help="Destination batchsize [default: %(default)s]")
    parser.add_argument("--dest-delete-index", action="store_true",
                        help="Delete destination index before starting")
    parser.add_argument("--query", action="store", type=unicode, default="",
                        help="Query to use [if None is specified, a match_all will be used]")
    args = parser.parse_args()

    if args.src_index is None or len(args.src_index) == 0:
        raise Exception("--src-index must be specified!")
    if args.dest_index is None or len(args.dest_index) == 0:
        raise Exception("--dest-index must be specified!")

    dt_start = datetime.now()

    # copy mapping
    src_es_instance = get_elasticsearch(args.src_host, args.src_port)
    dest_es_instance = get_elasticsearch(args.dest_host, args.dest_port)

    # check if src_index exists
    src_es_ic = IndicesClient(src_es_instance)
    if not src_es_ic.exists(args.src_index):
        raise Exception("--src-index %s does not exist!" % args.src_index)

    # check if dest_index exists
    dest_es_ic = IndicesClient(dest_es_instance)
    if dest_es_ic.exists(args.dest_index):
        if args.dest_delete_index:
            dest_es_ic.delete(index=args.dest_index)
        else:
            raise Exception("--dest-index %s already exists! Use --dest-delete-index if you want to drop it" % args.dest_index)

    log.info("Copying mapping...")
    # copy mapping over to dest
    src_index_information = src_es_ic.get(index=args.src_index)
    dest_es_ic.create(index=args.dest_index, body=src_index_information.get(args.src_index, {}))
    # set num_of_replicas to 0
    dest_es_ic.put_settings(index=args.dest_index, body={"settings": {"index": {"number_of_replicas": 0}}})

    # perform multiprocessing
    log.info("Copying data...")
    MAGIC_STRING = "%s:%s" % (str(uuid4()), str(uuid4()))
    DEST_QUEUE = Queue()
    DEST_COUNTER = Value('i', 0)
    src_process = Process(target=src_worker, args=(args, DEST_QUEUE, MAGIC_STRING))
    src_process.start()
    dest_processes = [Process(target=dest_worker, args=(args, DEST_QUEUE, MAGIC_STRING, DEST_COUNTER))
                      for i in xrange(args.dest_concurrency)]
    for i in dest_processes:
        i.start()
    src_process.join()
    for i in dest_processes:
        i.join()
    log.info("[dest_worker] Total processed %s" % DEST_COUNTER.value)

    if args.dest_alias is not None and len(args.dest_alias) > 0:
        # we remove all existing mappings to this alias, then add it to the current dest_index
        for idx_name, aliases_mapping in dest_es_ic.get_aliases().iteritems():
            if args.dest_alias in aliases_mapping.get("aliases", {}):
                dest_es_ic.delete_alias(index=idx_name, name=args.dest_alias)
        dest_es_ic.put_alias(index=args.dest_index, name=args.dest_alias)

    dest_es_ic.refresh(args.dest_index)

    dt_end = datetime.now()
    log.info("Time elapsed: %s" % (dt_end - dt_start, ))
def handle(self, *args, **options): Student.objects.all().delete() University.objects.all().delete() Course.objects.all().delete() start = time.time() # database part # make some Universities university_names = ( 'MIT', 'MGU', 'CalTech', 'KPI', 'DPI', 'PSTU' ) universities = [] for name in university_names: uni = mommy.make(University, name=name) universities.append(uni) # make some courses template_options = ['CS%s0%s', 'MATH%s0%s', 'CHEM%s0%s', 'PHYS%s0%s'] courses = [] for num in range(1, 4): for course_num in range(1, 4): for template in template_options: name = template % (course_num, num) course = mommy.make(Course, name=name) courses.append(course) students = [] for _ in xrange(options.get('count')[0]): stud = mommy.prepare( Student, university=random.choice(universities), first_name=names.get_first_name(), last_name=names.get_last_name(), age=random.randint(17, 25) ) students.append(stud) Student.objects.bulk_create(students) ThroughModel = Student.courses.through stud_courses = [] for student_id in Student.objects.values_list('pk', flat=True): courses_already_linked = [] for _ in range(random.randint(1, 10)): index = random.randint(0, len(courses) - 1) if index not in courses_already_linked: courses_already_linked.append(index) else: continue stud_courses.append( ThroughModel( student_id=student_id, course_id=courses[index].pk ) ) ThroughModel.objects.bulk_create(stud_courses) # recreate index indices_client = IndicesClient(client=settings.ES_CLIENT) if indices_client.exists('django'): indices_client.delete(index='django') indices_client.create(index='django') indices_client.put_mapping( doc_type='student', body=Student._meta.es_mapping, index='django' ) # update part put_all_to_index(Student) finish = time.time() - start print '%s items %s seconds' % (options.get('count')[0], finish)
class ESExporter: def __init__(self, sm_config): self.es = Elasticsearch(hosts=[{"host": sm_config['elasticsearch']['host']}]) self.ind_client = IndicesClient(self.es) def _index(self, annotations): to_index = [] for r in annotations: d = dict(zip(COLUMNS, r)) d['comp_names'] = u'|'.join(d['comp_names']).replace(u'"', u'') d['comp_ids'] = u'|'.join(d['comp_ids']) d['mz'] = '{:010.4f}'.format(d['mz']) if d['mz'] else '' to_index.append({ '_index': 'sm', '_type': 'annotation', '_id': '{}_{}_{}_{}'.format(d['ds_name'], d['db_name'], d['sf'], d['adduct']), '_source': d }) bulk(self.es, actions=to_index, timeout='60s') def _delete(self, annotations): to_delete = [] for r in annotations: d = dict(zip(COLUMNS, r)) to_delete.append({ '_op_type': 'delete', '_index': 'sm', '_type': 'annotation', '_id': '{}_{}_{}_{}'.format(d['ds_name'], d['db_name'], d['sf'], d['adduct']), }) try: bulk(self.es, to_delete) except BulkIndexError as e: logger.warn('{} - {}'.format(e.args[0], e.args[1][1])) def index_ds(self, db, ds_name, db_name): annotations = db.select(RESULTS_TABLE_SQL, ds_name, db_name) logger.info('Deleting documents from the index: {}-{}'.format(ds_name, db_name)) self._delete(annotations) logger.info('Indexing documents: {}-{}'.format(ds_name, db_name)) self._index(annotations) def create_index(self, name='sm'): body = { 'settings': { "index": { 'max_result_window': 2147483647, "analysis": { "analyzer": { "analyzer_keyword": { "tokenizer": "keyword", "filter": "lowercase" } } } } }, 'mappings': { "annotation": { "properties": { "db_name": {"type": "string", "index": "not_analyzed"}, "ds_name": {"type": "string", "index": "not_analyzed"}, "sf": {"type": "string", "index": "not_analyzed"}, "comp_names": { "type": "string", "analyzer": "analyzer_keyword", }, "comp_ids": {"type": "string", "index": "not_analyzed"}, "chaos": {"type": "float", "index": "not_analyzed"}, "image_corr": {"type": "float", "index": "not_analyzed"}, "pattern_match": {"type": "float", "index": "not_analyzed"}, "msm": {"type": "float", "index": "not_analyzed"}, "adduct": {"type": "string", "index": "not_analyzed"}, "fdr": {"type": "float", "index": "not_analyzed"}, "mz": {"type": "string", "index": "not_analyzed"} } } } } if not self.ind_client.exists(name): out = self.ind_client.create(index=name, body=body) logger.info('Index {} created\n{}'.format(name, out)) else: logger.info('Index {} already exists'.format(name)) def delete_index(self, name='sm'): out = self.ind_client.delete(name) logger.info('Index {} deleted\n{}'.format(name, out))
def main(argv):
    index = 'user_topics'

    client = Elasticsearch('localhost:9200')
    index_client = IndicesClient(client)

    if index_client.exists(index):
        index_client.delete(index)
    index_client.create(index=index, body={
        'settings': {
            'number_of_shards': 4,
            'number_of_replicas': 0
        },
        'mappings': {
            'user': {
                'properties': {
                    #'id': {
                    #    'type': 'long',
                    #    'doc_values': True
                    #},
                    'topics': {
                        'type': 'integer',
                        'doc_values': True
                    },
                    'n_topics': {
                        'type': 'integer',
                        'doc_values': True
                    }
                }
            }
        }
    })

    n_users = int(argv[1])
    n_topics = int(argv[2]) * 0.15
    n_topics_per_user = int(argv[3]) * 4.2

    docs_per_chunk = int(2e4)
    n_chunks = int(ceil(n_users / docs_per_chunk))
    start_time = time.time()
    for i_chunk in range(1, n_chunks + 1):
        docs = []
        for i in range(docs_per_chunk):
            n_user_topics = rand(n_topics_per_user)[0]
            topics = list(set(rand(n_topics, n_user_topics)))
            doc_id = str(random.getrandbits(63))
            docs.append('{"index":{"_index": "%s", "_type": "user", "_id": "%s"}}'
                        % (index, doc_id))
            docs.append(json.dumps({
                #'id': doc_id,
                'topics': topics,
                'n_topics': len(topics)
            }))
        #print(json.dumps(json.loads(docs[1]), indent=4)); return
        try:
            response = client.bulk(body='\n'.join(docs))
        except:
            # Even when an exception is thrown, typically the documents were stored in ES
            sleep_seconds = 10
            print('\rHTTP timed out, sleeping %d seconds...' % sleep_seconds)
            time.sleep(sleep_seconds)
        print('\rChunk %5d/%d, %5.2f%%' % (i_chunk, n_chunks, i_chunk * 100.0 / n_chunks), end='')

    index_time = time.time()
    print('\nCalling optimize, indexing took %.1f s...' % (index_time - start_time))
    sys.stdout.flush()
    index_client.optimize(index=index, max_num_segments=3, request_timeout=1e6)
    print('Optimization done in %.1f s' % (time.time() - index_time))
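The index above enables doc_values on the topics and n_topics fields, which is aimed at aggregation workloads. A minimal sketch of the kind of query that design serves is shown here; the aggregation name and size are illustrative, not part of the original benchmark.

# Hedged query sketch: a terms aggregation over the doc_values-backed
# 'topics' field of the 'user_topics' index created above.
response = client.search(index='user_topics', body={
    'size': 0,
    'aggs': {
        'top_topics': {
            'terms': {'field': 'topics', 'size': 20}
        }
    }
})
print(response['aggregations']['top_topics']['buckets'])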
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--host', help='Elasticsearch host')
    parser.add_argument('--port', type=int, help='Elasticsearch port')
    parser.add_argument('--url-prefix', help='Elasticsearch URL prefix')
    parser.add_argument('--no-auth', action='store_const', const=True, help='Suppress prompt for basic auth')
    parser.add_argument('--ssl', action='store_true', default=None, help='Use SSL')
    parser.add_argument('--no-ssl', dest='ssl', action='store_false', help='Do not use SSL')
    parser.add_argument('--index', help='Index name to create')
    parser.add_argument('--old-index', help='Old index name to copy')
    parser.add_argument('--boto-profile', default=None, help='Boto profile to use for signing requests')
    parser.add_argument('--aws-region', default=None, help='AWS Region to use for signing requests')
    args = parser.parse_args()

    if os.path.isfile('../config.yaml'):
        filename = '../config.yaml'
    elif os.path.isfile('config.yaml'):
        filename = 'config.yaml'
    else:
        filename = ''

    if filename:
        with open(filename) as config_file:
            data = yaml.load(config_file)
        host = args.host if args.host else data.get('es_host')
        port = args.port if args.port else data.get('es_port')
        username = data.get('es_username')
        password = data.get('es_password')
        url_prefix = args.url_prefix if args.url_prefix is not None else data.get('es_url_prefix', '')
        use_ssl = args.ssl if args.ssl is not None else data.get('use_ssl')
        aws_region = data.get('aws_region', None)
    else:
        username = None
        password = None
        aws_region = args.aws_region
        host = args.host if args.host else raw_input('Enter elasticsearch host: ')
        port = args.port if args.port else int(raw_input('Enter elasticsearch port: '))
        use_ssl = (args.ssl if args.ssl is not None
                   else raw_input('Use SSL? t/f: ').lower() in ('t', 'true'))
        if args.no_auth is None:
            username = raw_input('Enter optional basic-auth username: ')
            password = getpass.getpass('Enter optional basic-auth password: ')
        url_prefix = (args.url_prefix if args.url_prefix is not None
                      else raw_input('Enter optional Elasticsearch URL prefix: '))

    auth = Auth()
    http_auth = auth(host=host,
                     username=username,
                     password=password,
                     aws_region=aws_region,
                     boto_profile=args.boto_profile)
    es = Elasticsearch(
        host=host,
        port=port,
        use_ssl=use_ssl,
        connection_class=RequestsHttpConnection,
        http_auth=http_auth,
        url_prefix=url_prefix)

    silence_mapping = {'silence': {'properties': {'rule_name': {'index': 'not_analyzed', 'type': 'string'},
                                                  'until': {'type': 'date', 'format': 'dateOptionalTime'},
                                                  '@timestamp': {'format': 'dateOptionalTime', 'type': 'date'}}}}
    ess_mapping = {'elastalert_status': {'properties': {'rule_name': {'index': 'not_analyzed', 'type': 'string'},
                                                        '@timestamp': {'format': 'dateOptionalTime', 'type': 'date'}}}}
    es_mapping = {'elastalert': {'properties': {'rule_name': {'index': 'not_analyzed', 'type': 'string'},
                                                '@timestamp': {'format': 'dateOptionalTime', 'type': 'date'},
                                                'alert_time': {'format': 'dateOptionalTime', 'type': 'date'},
                                                'match_body': {'enabled': False, 'type': 'object'},
                                                'aggregate_id': {'index': 'not_analyzed', 'type': 'string'}}}}
    past_mapping = {'past_elastalert': {'properties': {'rule_name': {'index': 'not_analyzed', 'type': 'string'},
                                                       'match_body': {'enabled': False, 'type': 'object'},
                                                       '@timestamp': {'format': 'dateOptionalTime', 'type': 'date'},
                                                       'aggregate_id': {'index': 'not_analyzed', 'type': 'string'}}}}
    error_mapping = {'elastalert_error': {'properties': {'data': {'type': 'object', 'enabled': False},
                                                         '@timestamp': {'format': 'dateOptionalTime', 'type': 'date'}}}}

    index = args.index if args.index is not None else raw_input('New index name? (Default elastalert_status) ')
    if not index:
        index = 'elastalert_status'
    old_index = (args.old_index if args.old_index is not None
                 else raw_input('Name of existing index to copy? (Default None) '))

    res = None
    if old_index:
        print('Downloading existing data...')
        res = es.search(index=old_index, body={}, size=500000)
        print('Got %s documents' % (len(res['hits']['hits'])))

    es_index = IndicesClient(es)
    if es_index.exists(index):
        print('Index ' + index + ' already exists. Skipping index creation.')
        return None

    es.indices.create(index)
    # To avoid a race condition. TODO: replace this with a real check
    time.sleep(2)
    es.indices.put_mapping(index=index, doc_type='elastalert', body=es_mapping)
    es.indices.put_mapping(index=index, doc_type='elastalert_status', body=ess_mapping)
    es.indices.put_mapping(index=index, doc_type='silence', body=silence_mapping)
    es.indices.put_mapping(index=index, doc_type='elastalert_error', body=error_mapping)
    es.indices.put_mapping(index=index, doc_type='past_elastalert', body=past_mapping)
    print('New index %s created' % index)

    if res:
        bulk = ''.join(['%s\n%s\n' % (json.dumps({'create': {'_type': doc['_type'], '_index': index}}),
                                      json.dumps(doc['_source'])) for doc in res['hits']['hits']])
        print('Uploading data...')
        es.bulk(body=bulk, index=index)

    print('Done!')
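After copying documents from an old index, the upload can be sanity-checked with a refresh and a count. This short sketch reuses the es client and index variable from the function above; it is an assumed follow-up step, not part of the original script.

# Hedged verification sketch: refresh the new index and report how many
# documents it now holds after the bulk upload.
es.indices.refresh(index=index)
print('New index now holds %d documents' % es.count(index=index)['count'])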