def reset_and_fill_all_indices():
    all_orgs = [
        'explaain',
        'Matthew_Rusk_62352643',
        'yc',
        'Andrew_Davies_29862274',
        'financialtimes',
    ]
    all_index_names = [org.lower() + '__cards' for org in all_orgs] + [
        org.lower() + '__files' for org in all_orgs
    ] + ['organisations', 'sources', 'users']
    all_card_indices = [db.Cards(org) for org in all_orgs]
    all_file_indices = [db.Files(org) for org in all_orgs]
    # all_indices = all_card_indices + all_file_indices + [db.Organisations(), db.Sources(), db.Users()]
    all_indices = all_file_indices + [
        db.Organisations(), db.Sources(), db.Users()
    ]

    template_type = 'cards'
    template = templates.get_template(template_type)
    pp.pprint(
        es_client.IndicesClient(es).put_template(name=template_type, body=template))
    # for org in all_orgs:
    #     index_name = org.lower() + '__cards'
    #     print(index_name)
    #     print(json.dumps(es_client.IndicesClient(es).get_mapping(index=index_name, doc_type='card'), indent=2))
    #     es_client.IndicesClient(es).close(index=index_name)
    #     try:
    #         es_client.IndicesClient(es).put_mapping(index=index_name, doc_type='card', body=cards_template['mappings']['card'])
    #         es_client.IndicesClient(es).put_settings(index=index_name, body=cards_template['settings'])
    #     except Exception as e:
    #         print(e)
    #     es_client.IndicesClient(es).open(index=index_name)

    template_type = 'files'
    template = templates.get_template(template_type)
    pp.pprint(
        es_client.IndicesClient(es).put_template(name=template_type, body=template))
    # for org in all_orgs:
    #     index_name = org.lower() + '__files'
    #     print(index_name)
    #     es_client.IndicesClient(es).put_mapping(index=index_name, doc_type='file', body=files_template['mappings']['file'])

    # for index_name in all_index_names:
    #     print(index_name)
    #     if es_client.IndicesClient(es).exists(index=index_name):
    #         es_client.IndicesClient(es).delete(index=index_name)
    #     es_client.IndicesClient(es).create(index=index_name)

    for index in all_indices:
        copy_docs_from_algolia(index=index)
def create(self):
    # create an IndicesClient instance
    indexES = client.IndicesClient(self.es)
    if self.es.indices.exists(index=self.indexNameES):
        # logger.info('index %s already exists', self.indexNameES)
        # the index already exists, but that does not mean the type exists
        if self.es.indices.exists_type(index=self.indexNameES,
                                       doc_type=[self.typeNameES]):
            # logger.info('type %s already exists', self.typeNameES)
            # the type already exists, nothing to do
            pass
        else:
            # the type does not exist, create it with the mapping to apply
            # logger.info('type %s does not exist, creating it', self.typeNameES)
            indexES.put_mapping(doc_type=self.typeNameES,
                                body=self.docMapping,
                                update_all_types=True)
    else:
        # neither the index nor the type exists (a type cannot exist without an index),
        # so create both
        # logger.info('index %s and type %s do not exist, creating them', self.indexNameES, self.typeNameES)
        indexES.create(index=self.indexNameES)
        # apply the mapping, which only applies to this index/type
        indexES.put_mapping(doc_type=self.typeNameES,
                            body=self.docMapping,
                            update_all_types=True)
def calc_jm(self, query, queryNo, lamb, sumDocLength):
    jm_scores = {}
    fjm = open("Results/jm_output.txt", 'a')
    queryArray = []
    ic = client.IndicesClient(self.es)
    analyzedResult = ic.analyze(index="ap_dataset", analyzer="my_english", body=query)
    tokenLength = len(analyzedResult['tokens'])
    for i in range(tokenLength):
        queryArray.append(str(analyzedResult['tokens'][i]['token']))
    queryBody = {
        "query": {
            "function_score": {
                "query": {"match": {"text": query}},
                "functions": [{
                    "script_score": {
                        "script": "getJM",
                        "lang": "groovy",
                        "params": {"query": queryArray,
                                   "field": "text",
                                   "lamb": lamb,
                                   "sumdoclength": sumDocLength}
                    }
                }],
                "boost_mode": "replace"
            }
        },
        "fields": ["stream_id"]
    }
    jmResult = self.es.search(index="ap_dataset", doc_type="document",
                              size=self.search_size, analyzer="my_english",
                              body=queryBody)
    resultSize = len(jmResult['hits']['hits'])
    rank = 1
    for i in range(resultSize):
        docId = str(jmResult['hits']['hits'][i]['_id'])
        score = jmResult['hits']['hits'][i]['_score']
        if score != 0:
            fjm.write(queryNo + " Q0 " + docId + " " + str(rank) + " " +
                      str(score) + " Exp\n")
            jm_scores[docId] = score
            rank = rank + 1
    fjm.close()
    return jm_scores
def calc_okapi_bm(self, query, queryNo, avgDocLength, nDocs):
    okapi_bm_scores = {}
    fokapiBM = open("Results/okapiBM_output.txt", 'a')
    queryArray = []
    ic = client.IndicesClient(self.es)
    analyzedResult = ic.analyze(index="ap_dataset", analyzer="my_english", body=query)
    tokenLength = len(analyzedResult['tokens'])
    for i in range(tokenLength):
        queryArray.append(str(analyzedResult['tokens'][i]['token']))
    queryBody = {
        "query": {
            "function_score": {
                "query": {"match": {"text": query}},
                "functions": [{
                    "script_score": {
                        "script": "getOkapiBM",
                        "lang": "groovy",
                        "params": {"query": queryArray,
                                   "field": "text",
                                   "avgLength": avgDocLength,
                                   "ndocs": nDocs}
                    }
                }],
                "boost_mode": "replace"
            }
        },
        "fields": ["stream_id"]
    }
    okapiBMResult = self.es.search(index="ap_dataset", doc_type="document",
                                   size=self.search_size, analyzer="my_english",
                                   body=queryBody)
    resultSize = len(okapiBMResult['hits']['hits'])
    rank = 1
    for i in range(resultSize):
        docId = str(okapiBMResult['hits']['hits'][i]['_id'])
        score = okapiBMResult['hits']['hits'][i]['_score']
        if score != 0:
            fokapiBM.write(queryNo + " Q0 " + docId + " " + str(rank) + " " +
                           str(score) + " Exp\n")
            okapi_bm_scores[docId] = score
            rank += 1
    fokapiBM.close()
    return okapi_bm_scores
def __init__(self, cfg):
    es = Elasticsearch([{'host': cfg["host"], 'port': cfg["port"]}])
    esClient = client.IndicesClient(es)
    self._es = es
    self._esc = esClient
    self._index = cfg["index"]
def get_size(self, search_service: str = 'elasticsearch'):
    stats = es_client.IndicesClient(es).stats(
        index=self.get_index_name('elasticsearch'))
    size = stats.get('_all', {}).get('primaries', {}).get('docs', {}).get('count', None)
    return size
def __init__(self, delete_index=True):
    self.es = Elasticsearch()
    if delete_index:
        self.es.indices.delete(index='cnn_news', ignore=[400, 404])
    es_analyzer = client.IndicesClient(self.es)
    es_analyzer.create(index='cnn_news', body=ANALYZER_PARAMS)
def get_index_properties(self):
    """This is now interpreted as get_settings() for Algolia and get_mapping()
    for ElasticSearch. It returns both, as { 'algolia': {}, 'elasticsearch': {} }
    """
    algolia_settings = None
    elasticsearch_mapping = None
    if UsingAlgolia:
        try:
            algolia_settings = self.index.get_settings()
        except Exception as e:
            print('Algolia: Couldn\'t get settings from index "' +
                  self.get_index_name('algolia') + '". ', e)
            sentry.captureException()
    try:
        elasticsearch_mapping = es_client.IndicesClient(es).get_mapping(
            index=self.get_index_name('elasticsearch'), doc_type=self.doc_type)
    except Exception as e:
        print('ElasticSearch: Couldn\'t get settings from index "' +
              self.get_index_name('elasticsearch') + '". ', e)
        sentry.captureException()
    properties = {
        'algolia': algolia_settings,
        'elasticsearch': elasticsearch_mapping
    }
    return properties
def create_index():
    """Create the index."""
    es = Elasticsearch()
    ic = client.IndicesClient(es)
    # Delete the index if it already exists
    try:
        ic.delete(index="earthquake")
    except:
        pass
    # Create the index
    ic.create(
        index="earthquake",
        body={
            "mappings": {
                "properties": {
                    "time": {"type": "date"},        # origin time of the earthquake
                    "level": {"type": "float"},      # magnitude
                    "geo": {"type": "geo_point"},    # geographic location
                    "deep": {"type": "float"},       # depth
                    "location": {"type": "text"},    # place name
                    "source": {"type": "keyword"}    # data source
                }
            }
        }
    )
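# A minimal usage sketch for the "earthquake" mapping above, assuming a local
# Elasticsearch node; the document values and the document id are illustrative only.
from elasticsearch import Elasticsearch

def index_sample_earthquake():
    es = Elasticsearch()
    doc = {
        "time": "2021-05-21T21:48:34",        # date field
        "level": 6.4,                          # magnitude (float)
        "geo": {"lat": 25.67, "lon": 99.87},   # geo_point as a lat/lon object
        "deep": 8.0,                           # depth
        "location": "Yangbi, Yunnan",          # free-text place name
        "source": "demo"                       # keyword data source
    }
    es.index(index="earthquake", id=1, body=doc)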
def __init__(self, host=None, port=None, index=None, index_suffix=None):
    self.host = (host or
                 getattr(conf, 'elasticsearch_host', None) or
                 'localhost')
    self.port = (port or
                 getattr(conf, 'elasticsearch_port', None) or
                 9200)
    self.index = (index or
                  getattr(conf, 'elasticsearch_index', None) or
                  'repoxplorer')
    if index_suffix:
        self.index += "-%s" % index_suffix
    if (getattr(conf, 'elasticsearch_user', None) and
            getattr(conf, 'elasticsearch_password', None)):
        self.http_auth = "%s:%s" % (
            getattr(conf, 'elasticsearch_user', None),
            getattr(conf, 'elasticsearch_password', None))
        # NOTE(dpawlik) Opendistro is using self signed certs,
        # so verify_certs is set to False.
        self.es = client.Elasticsearch([{
            "host": self.host,
            "port": self.port,
            "http_auth": self.http_auth,
            "use_ssl": True,
            "verify_certs": False,
            "ssl_show_warn": True
        }], timeout=60)
    else:
        self.es = client.Elasticsearch([{
            "host": self.host,
            "port": self.port
        }], timeout=60)
    self.ic = client.IndicesClient(self.es)
    if not self.ic.exists(index=self.index):
        self.ic.create(index=self.index)
        # Give some time to have the index fully created
        time.sleep(1)
def create_index():
    es = Elasticsearch()
    ic = client.IndicesClient(es)
    # Check whether the index already exists
    if not ic.exists(index="poetry"):
        # Create the index
        doc = {
            "mappings": {
                "properties": {
                    "title": {"type": "keyword"},
                    "epigraph": {"type": "keyword"},
                    "dynasty": {"type": "keyword"},
                    "author": {"type": "keyword"},
                    "content": {"type": "text"}
                }
            }
        }
        ic.create(index='poetry', body=doc)
def calc_frequencies(self, query, docno):
    ic = client.IndicesClient(self.es)
    sum_tf = 0
    sum_df = 0
    sum_ttf = 0
    query_array = []
    analyzed_result = ic.analyze(index="ap_dataset", analyzer="my_english", body=query)
    token_length = len(analyzed_result['tokens'])
    for i in range(token_length):
        query_array.append(str(analyzed_result['tokens'][i]['token']))
    res = self.es.termvector(index="ap_dataset", doc_type="document", id=docno,
                             term_statistics=True)
    term_dict = res['term_vectors']['text']['terms']
    for term in query_array:
        if term in term_dict.keys():
            sum_tf += res['term_vectors']['text']['terms'][term]['term_freq']
            sum_df += res['term_vectors']['text']['terms'][term]['doc_freq']
            sum_ttf += res['term_vectors']['text']['terms'][term]['ttf']
    return sum_tf, sum_df, sum_ttf
def create_mapping(self):
    """Creates a mapping specific to the current index AND doc_type.
    No need to check for existence of either based on previous functions."""
    indice = client.IndicesClient(self.es)
    indice.put_mapping(index=self.es_main_index,
                       doc_type=self.es_main_type,
                       body=self.mapping)
def create_index(self):
    """Check whether the index exists; if not, create it."""
    indice = client.IndicesClient(self.es)
    if not indice.exists(self.es_main_index):
        indice.create(index=self.es_main_index)
    return True
def __init__(self, config):
    if config['is_es_cloud']:
        self.es = Elasticsearch(cloud_id=config['es_cloud_host'],
                                http_auth=(config['username'], config['password']))
    else:
        self.es = Elasticsearch(host=config['es_host'], port=config['es_port'])
    self.indicesClient = client.IndicesClient(self.es)
def __set_es(self):
    es = Elasticsearch(hosts=[{'host': self.host, 'port': self.theport}],
                       http_auth=self.awsauth,
                       connection_class=RequestsHttpConnection)
    if self.__check_es_status(es):
        return es, client.IndicesClient(es)
    else:
        raise Exception("Elasticsearch is not reachable")
def list_indices(self, search_service: str = 'elasticsearch'):
    if search_service == 'algolia':
        return self.client.list_indexes()
    else:
        indices = es_client.IndicesClient(es).get(index='_all')
        # @NOTE: Currently only produces property 'name' for each index
        if not indices:
            return None
        return {
            'items': [{'name': key} for key in indices if key[0] != '.']
        }
def index(es, ref_gen, table_name, testing=False, file_len=0):
    '''
    Insert values from ref_gen in the Elasticsearch index

    INPUT:
        - ref_gen: a pandas DataFrame generator
          (ref_gen=pd.read_csv(file_path, chunksize=XXX))
        - table_name: name of the Elasticsearch index to insert to
        - (testing): whether or not to refresh index at each insertion
        - (file_len): original file len to display estimated time
    '''
    ic = client.IndicesClient(es)

    # For efficiency, reset refresh interval
    # see https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-update-settings.html
    if not testing:
        low_refresh = {"index": {"refresh_interval": "-1"}}
        ic.put_settings(low_refresh, table_name)

    # Bulk insert
    logging.info('Started indexing')
    i = 0
    t_start = time.time()
    for ref_tab in ref_gen:
        ref_tab = pre_process_tab(ref_tab)
        body = ''
        for key, doc in ref_tab.where(ref_tab.notnull(), None).to_dict('index').items():
            # TODO: make function that limits bulk size
            index_order = json.dumps({
                "index": {
                    "_index": table_name,
                    "_type": 'structure',
                    "_id": str(key)
                }
            })
            body += index_order + '\n'
            body += json.dumps(doc) + '\n'
        es.bulk(body)
        i += len(ref_tab)

        # Display progress
        t_cur = time.time()
        eta = (file_len - i) * (t_cur - t_start) / i
        logging.info('Indexed {0} rows / ETA: {1} s'.format(i, eta))

    # Back to default refresh
    if not testing:
        default_refresh = {"index": {"refresh_interval": "1s"}}
        ic.put_settings(default_refresh, table_name)
    es.indices.refresh(index=table_name)
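# A minimal usage sketch for the bulk index() helper above, assuming a local
# Elasticsearch node; the CSV path, chunk size, index name, and file length
# are hypothetical values for illustration.
import pandas as pd
from elasticsearch import Elasticsearch

def index_reference_file():
    es = Elasticsearch()
    # Stream the reference file in chunks so only one chunk is held in memory
    ref_gen = pd.read_csv('data/reference.csv', chunksize=10000)
    index(es, ref_gen, table_name='reference_index', file_len=1000000)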
def _indexDocument(self, text):
    host = self.elasticsearchDomain

    if text:
        service = 'es'
        ss = boto3.Session()
        credentials = ss.get_credentials()
        region = ss.region_name
        awsauth = AWS4Auth(credentials.access_key,
                           credentials.secret_key,
                           region,
                           service,
                           session_token=credentials.token)

        es = Elasticsearch(
            hosts=[{'host': host, 'port': 443}],
            http_auth=awsauth,
            use_ssl=True,
            verify_certs=True,
            connection_class=RequestsHttpConnection)

        es_index_client = client.IndicesClient(es)

        document = {
            "documentId": "{}".format(self.documentId),
            "name": "{}".format(self.objectName),
            "bucket": "{}".format(self.bucketName),
            "content": text
        }

        if not es_index_client.exists(index='textract'):
            print("Index 'textract' does not exist, creating...")
            es_index_client.create(
                index='textract',
                body={'settings': {'index': {"number_of_shards": 2}}})

        es.index(index="textract", doc_type="document", id=self.documentId,
                 body=document)

        print("Indexed document: {}".format(self.objectName))
def template_es():
    # This function puts a mapping to create the data structure
    print('Create mapping for Elasticsearch')
    connect = connect_to_es()
    interface = client.IndicesClient(connect)
    if connect:
        body = '{"order":0,"template":"*","settings":{},"mappings":{"_default_":{"dynamic_templates":[' \
               '{"string_fields":{"mapping":{"index":"analyzed","type":"string","fields":{"raw":{' \
               '"index":"not_analyzed","type":"string"}}},"match_mapping_type":"string","match":"*"}}]' \
               ',"_all":{"enabled":true}}},"aliases":{}}'
        template = interface.exists_template(ELASTICSEARCH['template'])
        if template:
            print('Mapping exists, using it.')
        else:
            print('Creating map for use!')
            interface.put_template(name=ELASTICSEARCH['template'], body=body)
def __init__(self, host=None, port=None, index=None, index_suffix=None):
    self.host = (host or
                 getattr(conf, 'elasticsearch_host', None) or
                 'localhost')
    self.port = (port or
                 getattr(conf, 'elasticsearch_port', None) or
                 9200)
    self.index = (index or
                  getattr(conf, 'elasticsearch_index', None) or
                  'repoxplorer')
    if index_suffix:
        self.index += "-%s" % index_suffix
    self.es = client.Elasticsearch([{
        "host": self.host,
        "port": self.port
    }], timeout=60)
    self.ic = client.IndicesClient(self.es)
    if not self.ic.exists(index=self.index):
        self.ic.create(index=self.index)
        # Give some time to have the index fully created
        time.sleep(1)
def test_es_types(init_interpreter, doc_type):
    """Check that no field has the "text" type."""
    interpreter = init_interpreter
    parser = interpreter[doc_type].parser
    es = Elasticsearch([{
        "host": config.ES["es.nodes"],
        "port": config.ES["es.port"]
    }])
    indices = client.IndicesClient(es)
    index_name = list(indices.get_alias(name=parser.name).keys())[0]
    mapping = indices.get_mapping(index=index_name)
    for k, t in list(
            mapping[index_name]["mappings"][doc_type]["properties"].items()):
        assert t["type"] != "text"
def check_type(self):
    """Check if the type already exists; if it does, first delete the type,
    then start uploading data."""
    indice = client.IndicesClient(self.es)
    print(self.es_main_index)
    if indice.exists_type(index=self.es_main_index, doc_type=self.es_main_type):
        print('Scenario %s already exists, deleting the current one'
              % self.es_main_type)
        indice.delete_mapping(index=self.es_main_index,
                              doc_type=self.es_main_type)
        print('Waiting for 10 seconds to ensure the current type is ' +
              'deleted.')
        time.sleep(10)
    return
def __init__(self, aws_access_code, aws_secret_code):
    self.es = None
    self.client = None
    host = 'search-bueventshowcase-2vvvoxbm5u73pdzuqois4u2aru.us-east-2.es.amazonaws.com'
    awsauth = AWS4Auth(aws_access_code, aws_secret_code, 'us-east-2', 'es')
    try:
        self.indexName = 'defaultevents'  # this is hard-coded
        self.es = Elasticsearch(hosts=[{'host': host, 'port': 443}],
                                http_auth=awsauth,
                                use_ssl=True,
                                verify_certs=True,
                                connection_class=RequestsHttpConnection)
        self.client = client.IndicesClient(self.es)
    except:
        print("Connection to Elasticsearch host %s failed." % host)
def __init__(self, items=[], **kwargs):
    self.bulk_chunk_size = kwargs.get('bulk_chunk_size', config.bulk_chunk_size)
    self._sort = []
    self.results_per_page = kwargs.get('results_per_page', config.results_per_page)
    self._querybody = querybuilder.QueryBody()  # sets up the new query bodies

    if kwargs.get('base_obj'):
        self.base_obj = kwargs.get('base_obj')
    else:
        try:
            self.base_obj = self.__class__.__model__
        except AttributeError:
            raise AttributeError('Base object must contain a model or pass base_obj')

    self._es = Elasticsearch(config.dsn)
    self._esc = client.IndicesClient(self._es)

    if '__index__' in dir(self.base_obj):
        idx = self.base_obj.__index__
    else:
        idx = config.default_index

    self._search_params = []
    self._raw = {}
    self.idx = idx
    self.type = self.base_obj.__type__
    self._special_body = {}
    self._items = items  # special list of items that can be committed in bulk

    # these values are used in _build_body() to determine where additional
    # _build_body() options should exist. Defaults to and/must
    self._last_top_level_boolean = None
    self._last_boolean = None
def calc_okapi_tf(self, query, query_no, avg_doc_length):
    """
    Calculates the OkapiTF scores
    :param query: str
    :param query_no: int
    :param avg_doc_length: float
    :return: okapi_tf scores: float
    """
    okapi_tf_scores = {}
    f_okapi_tf = open("Results/okapi_tf_output.txt", 'a')
    query_array = []
    ic = client.IndicesClient(self.es)
    analyzed_result = ic.analyze(index="ap_dataset", analyzer="my_english", body=query)
    token_length = len(analyzed_result['tokens'])
    for i in range(token_length):
        query_array.append(str(analyzed_result['tokens'][i]['token']))
    query_body = {
        "query": {
            "function_score": {
                "query": {"match": {"text": query}},
                "functions": [{
                    "script_score": {
                        "script": "getOkapiTF",
                        "lang": "groovy",
                        "params": {"query": query_array,
                                   "field": "text",
                                   "avgLength": avg_doc_length}
                    }
                }],
                "boost_mode": "replace"
            }
        },
        "fields": ["stream_id"]
    }
    okapi_result = self.es.search(index="ap_dataset", doc_type="document",
                                  size=self.search_size, analyzer="my_english",
                                  body=query_body)
    result_size = len(okapi_result['hits']['hits'])
    rank = 1
    for i in range(result_size):
        doc_id = str(okapi_result['hits']['hits'][i]['_id'])
        score = okapi_result['hits']['hits'][i]['_score']
        if score != 0:
            f_okapi_tf.write(query_no + " Q0 " + doc_id + " " + str(rank) + " " +
                             str(score) + " Exp\n")
            okapi_tf_scores[doc_id] = score
            rank += 1
    f_okapi_tf.close()
    return okapi_tf_scores
def delete(self, type):
    # make sure the index exists
    indice = client.IndicesClient(self.es)
    try:
        if indice.exists(self.es_main_index):
            # if type is 'all', delete everything
            if type == 'all':
                try:
                    self.es.delete_by_query(index=self.es_main_index,
                                            body=match_all,
                                            conflicts='proceed')
                    print('Deleted ' + self.es_main_index)
                    return True
                except ConnectionError:
                    print('There was a connection error. Check your Elastic '
                          'Search settings and make sure Elastic Search is '
                          'running.')
                    return False
            elif type:
                try:
                    if indice.exists_type(index=self.es_main_index, doc_type=type):
                        self.es.delete_by_query(index=self.es_main_index,
                                                doc_type=type,
                                                body=match_all,
                                                conflicts='proceed')
                        print('Deleted ' + self.es_main_index + '/' + type)
                        return True
                except ConnectionError:
                    print('There was a connection error. Check your Elastic '
                          'Search settings and make sure Elastic Search is '
                          'running.')
                    return False
    except TransportError:
        print('Incorrect username or password')
        return False
def create_index(es, table_name, columns_to_index, default_analyzer='keyword',
                 analyzer_index_settings=None, force=False):
    '''
    Create a new empty Elasticsearch index (used to host documents)

    INPUT:
        - es: an Elasticsearch connection
        - table_name: name of the index in Elasticsearch
        - columns_to_index: dict containing the columns to index and, as values,
          the analyzers to use in addition to the default analyzer
          Ex: {'col1': {'analyzerA', 'analyzerB'}, 'col2': {}, 'col3': 'analyzerB'}
        - force: whether or not to delete and re-create an index if the name
          is already associated to an existing index
    '''
    ic = client.IndicesClient(es)

    if ic.exists(table_name) and force:
        ic.delete(table_name)

    if not ic.exists(table_name):
        index_settings = gen_index_settings(default_analyzer, columns_to_index,
                                            analyzer_index_settings)
        try:
            ic.create(table_name, body=json.dumps(index_settings))
        except Exception as e:
            new_message = e.__str__() + '\n\n(MERGE MACHINE)--> This may be due to ' \
                          'ES resource not being available. ' \
                          'Run es_gen_resource.py (in sudo) for this to work'
            raise Exception(new_message)
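# A minimal usage sketch for create_index() above, assuming a local Elasticsearch
# node; the index name, column names, and analyzer names are illustrative only
# and must correspond to analyzers made available via analyzer_index_settings.
from elasticsearch import Elasticsearch

def create_sample_index():
    es = Elasticsearch()
    columns_to_index = {
        'col1': {'french', 'whitespace'},  # index with extra analyzers
        'col2': {}                         # index with the default analyzer only
    }
    create_index(es, 'sample_table', columns_to_index,
                 default_analyzer='keyword', force=True)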
def indexDocument(self, text, entitiesToIndex):
    if self.elasticsearchDomain:
        host = self.elasticsearchDomain
        if text:
            service = 'es'
            ss = boto3.Session()
            credentials = ss.get_credentials()
            region = ss.region_name
            awsauth = AWS4Auth(credentials.access_key, credentials.secret_key,
                               region, service, session_token=credentials.token)

            es = Elasticsearch(
                hosts=[{'host': host, 'port': 443}],
                http_auth=awsauth,
                use_ssl=True,
                verify_certs=True,
                connection_class=RequestsHttpConnection
            )
            es_index_client = client.IndicesClient(es)

            document = {
                "documentId": "{}".format(self.documentId),
                "name": "{}".format(self.objectName),
                "bucket": "{}".format(self.bucketName),
                "content": text
            }

            # add comprehend entities while indexing the document
            for key, val in entitiesToIndex.items():
                key = key.lower()
                if key == "date":
                    for date in val:
                        date_object = format_date(date)
                        if date_object != UNSUPPORTED_DATE_FORMAT:
                            if key not in document:
                                document[key] = []
                            document[key].append(date_object.strftime("%Y-%m-%d"))
                    print("Document with Converted dates: {}".format(document))
                else:
                    document[key] = val

            try:
                if not es_index_client.exists(index='textract'):
                    print("Index 'textract' does not exist, creating...")
                    es_index_client.create(
                        index="textract",
                        body={
                            "settings": {
                                "index": {
                                    "number_of_shards": 2
                                }
                            },
                            "mappings": {
                                "document": {
                                    "properties": {
                                        "date": {
                                            "type": "date",
                                            "format": "M'/'dd'/'yyyy||date||year||year_month||dd MMM yyyy||dd'/'MM'/'yyyy||yyyy'/'MM'/'dd||dd'/'MM'/'YY||year_month_day||MM'/'dd'/'yy||dd MMM||MM'/'yyyy||M-dd-yyyy||MM'/'dd'/'yyyy||M||d'/'MM'/'yyyy||MM'/'dd'/'yy"
                                        }
                                    }
                                }
                            }
                        }
                    )
                es.index(index="textract", doc_type="document", id=self.documentId,
                         body=json.loads(json.dumps(document)))
                print("Indexed document: {}".format(self.objectName))
            except Exception as E:
                print("Failed to create index with desired mapping {}".format(E))
    else:
        print("Document not indexed {}".format(self.elasticsearchDomain))
def write_to_es(file_name):
    """
    Input parameter: filename
    Write the user documents of a logfile into ES
    """
    try:
        # Get file in folder
        log_dir = os.path.join('logs', file_name)
        f = open(log_dir, 'r')
        # read file
        try:
            # Find all indices
            index_client = client.IndicesClient(es_con)
            print(index_client)
            index_name = 'demo_1'
            # check whether the index exists in the list of indices
            if not index_client.exists(index=index_name):
                # create a new mapping for the new index
                body_dict = {
                    "mappings": {
                        "user": {
                            "dynamic_templates": [{
                                "string_template": {
                                    "match_mapping_type": "string",
                                    "mapping": {
                                        "index": "not_analyzed",
                                        "type": "string"
                                    },
                                    "match": "*"
                                }
                            }]
                        }
                    }
                }
                # create the new index
                index_client.create(index=index_name, body=body_dict)
                # Refresh the index
                index_client.refresh(index=index_name)

            es_doc_list = []
            # get all user docs one by one from the logfile
            for each_dict in f:
                try:
                    user_dict = json.loads(each_dict)
                    uid = int(user_dict['uid'])
                    # Update datetime of the user doc on each action
                    user_dict['updated'] = datetime.now()
                    try:
                        # check whether the user exists
                        uid_exists = es_con.exists(index=index_name,
                                                   doc_type="user", id=uid)
                    except:
                        uid_exists = None
                    if uid_exists:
                        # update the user doc
                        es_doc = {
                            "_op_type": "update",
                            "_index": index_name,
                            "_type": "user",
                            "_id": uid,
                            "script": "ctx._source['name']=name\n ctx._source['age'] = age\n ctx._source['gender'] = gender\n ctx._source['mobile'] = mobile\n ctx._source.events.add(events)\n ctx._source['updated'] = updated",
                            "params": {
                                "name": user_dict['name'],
                                "age": user_dict['age'],
                                "gender": user_dict['gender'],
                                "mobile": user_dict['mobile'],
                                "events": user_dict['events'],
                                "updated": user_dict['updated']
                            }
                        }
                    else:
                        # create a new user doc
                        es_doc = {
                            "_index": index_name,
                            "_type": "user",
                            "_id": uid,
                            "_source": user_dict
                        }
                    es_doc_list.append(es_doc)
                    # Insert documents on every BULK_INSERT_SIZE
                    if len(es_doc_list) == BULK_INSERT_SIZE:
                        helpers.bulk(es_con, es_doc_list)
                        es_doc_list = []
                except ValueError as e:
                    print(e)
                    pass
            # Insert the remaining documents
            if es_doc_list:
                helpers.bulk(es_con, es_doc_list)
                es_doc_list = []
        except (ImproperlyConfigured, ElasticsearchException) as e:
            print(e)
            pass
        f.close()
    except IOError as e:
        print(e)
        pass
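# A minimal usage sketch for write_to_es() above; the log file name and the
# sample record values are hypothetical. Each line of the logfile is expected
# to be a JSON object carrying the fields the function reads: uid, name, age,
# gender, mobile, and events (the shape of "events" here is an assumption).
import json

def write_sample_log():
    sample_record = {
        "uid": 42,
        "name": "Alice",
        "age": 30,
        "gender": "F",
        "mobile": "5550100",
        "events": {"action": "login"}
    }
    # Write one JSON document per line, matching the logfile format expected above
    with open('logs/users.log', 'w') as f:
        f.write(json.dumps(sample_record) + '\n')
    write_to_es('users.log')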