def test_all_documents_get_inserted(self):
    docs = [{"answer": x, '_id': x} for x in range(100)]
    for ok, item in helpers.streaming_bulk(self.client, docs, index='test-index',
                                           doc_type='answers', refresh=True):
        self.assertTrue(ok)

    self.assertEquals(100, self.client.count(index='test-index', doc_type='answers')['count'])
    self.assertEquals({"answer": 42},
                      self.client.get(index='test-index', doc_type='answers', id=42)['_source'])
def test_rejected_documents_are_retried_at_most_max_retries_times(self):
    failing_client = FailingBulkClient(
        self.client, fail_at=(1, 2), fail_with=TransportError(429, "Rejected!", {})
    )
    docs = [
        {"_index": "i", "_type": "_doc", "_id": 47, "f": "v"},
        {"_index": "i", "_type": "_doc", "_id": 45, "f": "v"},
        {"_index": "i", "_type": "_doc", "_id": 42, "f": "v"},
    ]
    results = list(
        helpers.streaming_bulk(
            failing_client,
            docs,
            raise_on_exception=False,
            raise_on_error=False,
            chunk_size=1,
            max_retries=1,
            initial_backoff=0,
        )
    )
    self.assertEquals(3, len(results))
    self.assertEquals([False, True, True], [r[0] for r in results])
    self.client.indices.refresh(index="i")
    res = self.client.search(index="i")
    self.assertEquals({"value": 2, "relation": "eq"}, res["hits"]["total"])
    self.assertEquals(4, failing_client._called)
def load_data(client, data, index='booklist'):
    create_index(client, index)
    list_name = '1001books'

    # we let the streaming bulk continuously process the entries as they come
    # in - since the `parse_list` function is a generator this will avoid
    # loading the whole list into memory
    for ok, result in streaming_bulk(
            client,
            parse_list(data, list_name),
            index=index,
            doc_type='book',
            chunk_size=50  # keep the batch sizes small for appearances only
    ):
        action, result = result.popitem()
        doc_id = '/%s/%s' % (index, result['_id'])
        # process the information from ES whether the document has been
        # successfully indexed
        if not ok:
            print('Failed to %s document %s: %r' % (action, doc_id, result))
        else:
            print('Successfully indexed %s' % doc_id)
def test_transport_error_can_becaught(self):
    failing_client = FailingBulkClient(self.client)
    docs = [
        {'_index': 'i', '_type': 't', '_id': 47, 'f': 'v'},
        {'_index': 'i', '_type': 't', '_id': 45, 'f': 'v'},
        {'_index': 'i', '_type': 't', '_id': 42, 'f': 'v'},
    ]
    results = list(helpers.streaming_bulk(failing_client, docs,
                                          raise_on_exception=False,
                                          raise_on_error=False,
                                          chunk_size=1))
    self.assertEquals(3, len(results))
    self.assertEquals([True, False, True], [r[0] for r in results])

    exc = results[1][1]['index'].pop('exception')
    self.assertIsInstance(exc, TransportError)
    self.assertEquals(599, exc.status_code)
    self.assertEquals(
        {
            'index': {
                '_index': 'i',
                '_type': 't',
                '_id': 45,
                'data': {'f': 'v'},
                'error': "TransportError(599, 'Error!')",
                'status': 599
            }
        },
        results[1][1]
    )
def _index_loop(self):
    try:
        while not self.should_stop:
            msgs = []
            actions = self._actions(250, msgs)
            stream = helpers.streaming_bulk(
                self.es_client,
                actions,
                max_chunk_bytes=self.MAX_CHUNK_BYTES,
                raise_on_error=False,
            )
            start = time.time()
            for (ok, resp), msg in zip(stream, msgs):
                if not ok and not (resp.get('delete') and resp['delete']['status'] == 404):
                    raise ValueError(ok, resp, msg)
                assert len(resp.values()) == 1
                _id = list(resp.values())[0]['_id']
                assert msg.payload['ids'] == [util.IDObfuscator.decode_id(_id)], \
                    '{} {}'.format(msg.payload, util.IDObfuscator.decode_id(_id))
                msg.ack()
            if len(msgs):
                logger.info('%r: Indexed %d documents in %.02fs', self, len(msgs), time.time() - start)
            else:
                logger.debug('%r: Received no messages for %.02fs', self, time.time() - start)
    except Exception as e:
        client.captureException()
        logger.exception('%r: _index_loop encountered an unexpected error', self)
        self.stop()
def record_events(self, events):
    def _build_bulk_index(event_list):
        for ev in event_list:
            traits = {t.name: t.value for t in ev.traits}
            yield {'_op_type': 'create',
                   '_index': '%s_%s' % (self.index_name,
                                        ev.generated.date().isoformat()),
                   '_type': ev.event_type,
                   '_id': ev.message_id,
                   '_source': {'timestamp': ev.generated.isoformat(),
                               'traits': traits,
                               'raw': ev.raw}}

    error = None
    for ok, result in helpers.streaming_bulk(
            self.conn, _build_bulk_index(events)):
        if not ok:
            __, result = result.popitem()
            if result['status'] == 409:
                LOG.info(_LI('Duplicate event detected, skipping it: %s'),
                         result)
            else:
                LOG.exception(_LE('Failed to record event: %s'), result)
                error = storage.StorageUnknownWriteError(result)

    if self._refresh_on_write:
        self.conn.indices.refresh(index='%s_*' % self.index_name)
        while self.conn.cluster.pending_tasks(local=True)['tasks']:
            pass
    if error:
        raise error
def _index_all_blogitems(self):
    iterator = BlogItem.objects.all()
    category_names = dict((x.id, x.name) for x in Category.objects.all())
    categories = defaultdict(list)
    for e in BlogItem.categories.through.objects.all():
        categories[e.blogitem_id].append(category_names[e.category_id])

    es = connections.get_connection()
    report_every = 100
    count = 0
    doc_type_name = _get_doc_type_name(BlogItem)
    t0 = time.time()
    for success, doc in streaming_bulk(
        es,
        (m.to_search(all_categories=categories).to_dict(True) for m in iterator),
        index=settings.ES_BLOG_ITEM_INDEX,
        doc_type=doc_type_name,
    ):
        if not success:
            print("NOT SUCCESS!", doc)
        count += 1
        if not count % report_every:
            print(count)
    t1 = time.time()
    self.out("DONE Indexing {} blogitems in {} seconds".format(count, t1 - t0))
def flush_to_es():
    """
    Flushes a stream of messages to elasticsearch using bulk flushing.

    Uses a generator to pull messages off the queue and passes this as an
    iterable to the streaming_bulk method. streaming_bulk is also a generator
    that yields message data used for acking from the queue after they are
    flushed.

    :param bulk_size: the number of messages to flush at once to elasticsearch
    :param bulk_timeout: length of time to wait for a message from the queue
    """
    while True:
        try:
            es_client = es_handler.connection
            ack_list = list()
            actions = get_queue_stream(ack_list)
            bulker = es_helpers.streaming_bulk(
                es_client, actions, chunk_size=BULK_SIZE)
            _LOG.error("Post flush")
            for response in bulker:
                msg = ack_list.pop(0)
                msg_ok = response[0]
                if msg_ok:
                    msg.ack()
        except Exception as ex:
            _LOG.exception(ex)
def load_repo(client, path=None, index='git'):
    """
    Parse a git repository with all its commits and load it into elasticsearch
    using `client`. If the index doesn't exist it will be created.
    """
    path = dirname(dirname(abspath(__file__))) if path is None else path
    repo_name = basename(path)
    repo = git.Repo(path)

    create_git_index(client, index)

    # we let the streaming bulk continuously process the commits as they come
    # in - since the `parse_commits` function is a generator this will avoid
    # loading all the commits into memory
    for ok, result in streaming_bulk(
            client,
            parse_commits(repo.refs.master.commit, repo_name),
            index=index,
            doc_type='doc',
            chunk_size=50  # keep the batch sizes small for appearances only
    ):
        action, result = result.popitem()
        doc_id = '/%s/doc/%s' % (index, result['_id'])
        # process the information from ES whether the document has been
        # successfully indexed
        if not ok:
            print('Failed to %s document %s: %r' % (action, doc_id, result))
        else:
            print(doc_id)
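# Hedged usage sketch for `load_repo` above (added for illustration, not from the
# original source). It assumes a local Elasticsearch node on localhost:9200 and that
# `load_repo` and its helpers (`create_git_index`, `parse_commits`) live in this module.
if __name__ == '__main__':
    from elasticsearch import Elasticsearch

    client = Elasticsearch(['http://localhost:9200'])
    # with no explicit path, load_repo indexes the repository containing this file
    load_repo(client, index='git')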
def load_report(client, report_path):
    client.create(
        index='report',
        doc_type='header',
        body={
            "mappings": {
                "header": {
                    "_timestamp": {
                        "enabled": True,
                        "type": "date",
                        "format": "yyyy-MM-dd HH:mm:ss",
                        "store": True,
                        "path": "timestamp"
                    },
                    "properties": header_properties
                }
            }
        },
        ignore=409  # 409 - conflict
    )

    for ok, result in streaming_bulk(
            client,
            parse_report(report_path),
            index="report",
            doc_type="header"):
        action, result = result.popitem()
        doc_id = '/report/%s' % (result['_id'])
        if not ok:
            print('Failed to %s document %s: %r' % (action, doc_id, result))
        else:
            print(doc_id)

    client.indices.refresh(index='report')
def test_actions_remain_unchanged(self):
    actions = [{"_id": 1}, {"_id": 2}]
    for ok, item in helpers.streaming_bulk(
        self.client, actions, index="test-index"
    ):
        self.assertTrue(ok)
    self.assertEquals([{"_id": 1}, {"_id": 2}], actions)
def record_events(self, events):
    def _build_bulk_index(event_list):
        for ev in event_list:
            traits = {t.name: t.value for t in ev.traits}
            yield {'_op_type': 'create',
                   '_index': '%s_%s' % (self.index_name,
                                        ev.generated.date().isoformat()),
                   '_type': ev.event_type,
                   '_id': ev.message_id,
                   '_source': {'timestamp': ev.generated.isoformat(),
                               'traits': traits,
                               'raw': ev.raw}}

    problem_events = []
    for ok, result in helpers.streaming_bulk(
            self.conn, _build_bulk_index(events)):
        if not ok:
            __, result = result.popitem()
            if result['status'] == 409:
                problem_events.append((models.Event.DUPLICATE,
                                       result['_id']))
            else:
                problem_events.append((models.Event.UNKNOWN_PROBLEM,
                                       result['_id']))

    if self._refresh_on_write:
        self.conn.indices.refresh(index='%s_*' % self.index_name)
        while self.conn.cluster.pending_tasks(local=True)['tasks']:
            pass
    return problem_events
def index(self, annotation_ids=None):
    """
    Reindex annotations.

    :param annotation_ids: a list of ids to reindex, reindexes all when `None`.
    :type annotation_ids: collection

    :returns: a set of errored ids
    :rtype: set
    """
    if not annotation_ids:
        annotations = _all_annotations(session=self.session,
                                       windowsize=PG_WINDOW_SIZE)
    else:
        annotations = _filtered_annotations(session=self.session,
                                            ids=annotation_ids)

    # Report indexing status as we go
    annotations = _log_status(annotations)

    indexing = es_helpers.streaming_bulk(self.es_client.conn, annotations,
                                         chunk_size=ES_CHUNK_SIZE,
                                         raise_on_error=False,
                                         expand_action_callback=self._prepare)
    errored = set()
    for ok, item in indexing:
        if not ok:
            status = item[self.op_type]
            was_doc_exists_err = 'document already exists' in status['error']
            if self.op_type == 'create' and was_doc_exists_err:
                continue
            errored.add(status['_id'])
    return errored
def main():
    es = Elasticsearch([{'host': sys.argv[1], 'port': sys.argv[2]}])
    index = sys.argv[3]
    filenames = os.listdir('.')
    for f in filenames:
        if f.endswith('.csv'):
            for a, b in streaming_bulk(es, get_docs(f, index)):
                print(a, b)
def index(self, points):
    for p in points:
        p['_index'] = self.config['indexer']['idx_name']
        p['_type'] = 'policy-metric'

    results = helpers.streaming_bulk(self.client, points)
    for status, r in results:
        if not status:
            log.debug("index err result %s", r)
def django_import():
    # es.indices.delete(index='traffic', ignore=404)
    # TrafficReport.init()
    i = 0
    for ok, info in streaming_bulk(es, get_provoz(),
                                   doc_type="traffic_report", index="traffic"):
        i += 1
        if i % 1000 == 0:
            print(i, "documents done")
def index_model(self, model_name, ids, es_url=None, es_index=None):
    # TODO This method should not have to exist anymore
    es_client = Elasticsearch(es_url or settings.ELASTICSEARCH['URL'],
                              retry_on_timeout=True,
                              timeout=settings.ELASTICSEARCH['TIMEOUT'])
    action_gen = indexing.ElasticsearchActionGenerator(
        [settings.ELASTICSEARCH['INDEX']],
        [indexing.FakeMessage(model_name, ids)])
    stream = helpers.streaming_bulk(
        es_client,
        (x for x in action_gen if x),
        max_chunk_bytes=10 * 1024 ** 2,
        raise_on_error=False)
    for ok, resp in stream:
        if not ok and not (resp.get('delete') and resp['delete']['status'] == 404):
            raise ValueError(resp)
def log_to_elasticsearch(data, params, client, index='test', doc_type='test',
                         chunk_size=10):
    try:
        data_gen = itertools.imap(
            lambda d: {"_index": index, "_type": doc_type,
                       "_op_type": "index", "_id": d['id'], "source": d},
            data)
        for a, b in streaming_bulk(client, data_gen, chunk_size=chunk_size):
            pass
        return True
    except:
        return False
def streaming_bulk():
    results = list(helpers.streaming_bulk(
        failing_client,
        [{"a": 42}, {"a": 39}],
        raise_on_exception=True,
        max_retries=3,
        initial_backoff=0
    ))
    return results
def index(self):
    response = helpers.streaming_bulk(
        self.es,
        self._actions,
        chunk_size=self._configuration["chunk_size"],
        raise_on_error=self._configuration["raise_on_error"],
        raise_on_exception=self._configuration["raise_on_exception"])
    for ok, result in response:
        action, result = result.popitem()
        doc_id = '/commits/%s' % (result['_id'])
        if not ok:
            self.logger.error("Failed to insert %s %s %s", action, doc_id, result)
        else:
            self.logger.warning("Success %d", ok)
def index_documents(self):
    models = list(get_indexed_models())
    for model in models:
        self.save_mapping(model)
        model_instances = model.get_indexable().iterator()
        docs = (self.to_indexable_dict(d) for d in model_instances)
        for ok, info in streaming_bulk(self.es, docs):
            print(" Document with id %s indexed." % info['index']['_id'])
def bulk_load(docs_to_index):
    conn = connections.get_connection()
    index = NameVariant._doc_type.index
    for response in streaming_bulk(
            conn,
            docs_to_index,
            index=index,
            doc_type=NameVariant._doc_type.name):
        pass
def bulk_upsert(self, docs, namespace, timestamp):
    """Insert multiple documents into Elasticsearch."""
    def docs_to_upsert():
        doc = None
        for doc in docs:
            # Remove metadata and redundant _id
            index, doc_type = self._index_and_mapping(namespace)
            doc_id = u(doc.pop("_id"))
            document_action = {
                "_index": index,
                "_type": doc_type,
                "_id": doc_id,
                "_source": self._formatter.format_document(doc)
            }
            document_meta = {
                "_index": self.meta_index_name,
                "_type": self.meta_type,
                "_id": doc_id,
                "_source": {
                    "ns": index,
                    "_ts": timestamp
                }
            }

            parent_id = self._get_parent_id(doc_type, doc)
            if parent_id is not None:
                document_action["_parent"] = parent_id
            document_action["_source"] = self._formatter.format_document(doc)

            yield document_action
            yield document_meta
        if doc is None:
            raise errors.EmptyDocsError(
                "Cannot upsert an empty sequence of "
                "documents into Elastic Search")
    try:
        kw = {}
        if self.chunk_size > 0:
            kw['chunk_size'] = self.chunk_size

        responses = streaming_bulk(client=self.elastic,
                                   actions=docs_to_upsert(),
                                   **kw)

        for ok, resp in responses:
            if not ok:
                LOG.error(
                    "Could not bulk-upsert document "
                    "into ElasticSearch: %r" % resp)
        if self.auto_commit_interval == 0:
            self.commit()
    except errors.EmptyDocsError:
        # This can happen when mongo-connector starts up, there is no
        # config file, but nothing to dump
        pass
def test_all_documents_get_inserted(self):
    docs = [{"answer": x, "_id": x} for x in range(100)]
    for ok, item in helpers.streaming_bulk(
        self.client, docs, index="test-index", refresh=True
    ):
        self.assertTrue(ok)

    self.assertEquals(100, self.client.count(index="test-index")["count"])
    self.assertEquals(
        {"answer": 42}, self.client.get(index="test-index", id=42)["_source"]
    )
def index_all(cls, index_name, using=None, **kwargs):
    def actions_generator():
        for obj in cls.index_queryset().iterator():
            elastic_data = cls.from_django(obj).to_dict(include_meta=True)
            elastic_data['_index'] = index_name
            yield elastic_data

    client = connections.get_connection(using or cls._doc_type.using)
    cls.init(index_name)
    for ok, item in streaming_bulk(client, actions_generator(),
                                   chunk_size=100, **kwargs):
        yield ok, item
def reindex(self):
    conn = connections.get_connection()
    docs_to_index = [ElasticAddress(**p.to_dict()) for p in self]
    for response in streaming_bulk(
            conn,
            ({'_index': getattr(d.meta, 'index', d._doc_type.index),
              '_type': d._doc_type.name,
              '_source': d.to_dict()} for d in docs_to_index)):
        pass
def index_all(self, docs):
    actions = map(self.make_index_action, docs)
    bulk_results = streaming_bulk(
        self.elastic,
        actions,
        raise_on_error=False,
        raise_on_exception=False,
    )
    for is_successful, response in bulk_results:
        if not is_successful:
            print("Error indexing a document: %s" % str(response))
def bulk_load(questions):
    all_ok = True
    es_questions = (q.as_elasticsearch_dict() for q in questions)
    for ok, result in streaming_bulk(get_client(), es_questions,
                                     index=settings.ES_INDEX,
                                     raise_on_error=False):
        if not ok:
            all_ok = False
            action, result = result.popitem()
            logger.error(FAILED_TO_LOAD_ERROR.format(result['_id'], result))
    return all_ok
def _index_loop(self):
    try:
        while not self.should_stop:
            msgs = []
            actions = self._actions(250, msgs)
            tries = 0
            while not self.should_stop:
                stream = helpers.streaming_bulk(
                    self.es_client,
                    actions,
                    max_chunk_bytes=self.MAX_CHUNK_BYTES,
                    raise_on_error=False,
                )
                start = time.time()
                try:
                    for (ok, resp), msg in zip(stream, msgs):
                        if not ok and not (resp.get('delete') and resp['delete']['status'] == 404):
                            raise ValueError(ok, resp, msg)
                        assert len(resp.values()) == 1
                        _id = list(resp.values())[0]['_id']
                        assert msg.payload['ids'] == [
                            util.IDObfuscator.decode_id(_id)
                        ], '{} {}'.format(msg.payload, util.IDObfuscator.decode_id(_id))
                        msg.ack()
                    if len(msgs):
                        logger.info('%r: Indexed %d documents in %.02fs',
                                    self, len(msgs), time.time() - start)
                    else:
                        logger.debug('%r: Received no messages for %.02fs',
                                     self, time.time() - start)
                    break
                except ConnectionTimeout:
                    if tries >= self.TIMEOUT_RETRIES:
                        raise
                    tries += 1
                    logger.warning(
                        'Connection to elasticsearch timed out. Trying again after %s sec...',
                        self.TIMEOUT_INTERVAL)
                    time.sleep(self.TIMEOUT_INTERVAL)
                    continue
    except Exception as e:
        client.captureException()
        logger.exception('%r: _index_loop encountered an unexpected error', self)
        self.should_stop = True
        raise SystemExit(1)
def _copy_data(self):
    ss_kw = {}
    # sort
    if self.source_sort:
        ss_kw['sort'] = self.source_sort

    scroll = self.source_es.search(index=self.source_index, scroll='1m',
                                   search_type='scan', size=self.bulk_size,
                                   version=True, timeout='60s', **ss_kw)
    sid = scroll['_scroll_id']
    total_size = scroll['hits']['total']
    hits_size = total_size
    dealt_size = 0
    print("docs: " + str(total_size))
    self.logger.info("docs: " + str(total_size))

    suffix = '%(percent)d%% - %(index)d [%(elapsed_td)s / %(eta_td)s]'
    bar = ShadyBar("clone", suffix=suffix, max=total_size)

    while hits_size > 0:
        scroll = self.source_es.scroll(scroll_id=sid, scroll='1m')
        sid = scroll['_scroll_id']
        hits = scroll['hits']['hits']
        hits_size = len(hits)
        actions = self._bulk_hits(hits)
        if len(actions) > 0:
            kw = {}
            kw['timeout'] = '60s'
            res = []
            try:
                res = streaming_bulk(client=self.target_es, actions=actions, **kw)
            except BulkIndexError as err:
                print(err)
                pass
            okNum = 0
            for ok, re in res:
                if not ok:
                    print(re)
                else:
                    okNum += 1
            # refresh index
            if okNum > 0:
                self.target_es.indices.refresh(index=self.target_index)
        # dealt size
        dealt_size += hits_size
        bar.goto(dealt_size)
        self.logger.info("dealt: " + str(dealt_size) + " / " + str(total_size))

    print('\nDone !')
    self.logger.info("Done ! \n\n")
def make_es_index_snippets(es_client, passages_dset, index_name="english_wiki_kilt_snippets_100w"):
    index_config = {
        "settings": {
            "number_of_shards": 1,
            "analysis": {
                "analyzer": {
                    "stop_standard": {"type": "standard", "stopwords": "_english_"}
                }
            },
        },
        "mappings": {
            "properties": {
                "article_title": {"type": "text", "analyzer": "standard", "similarity": "BM25"},
                "section_title": {"type": "text", "analyzer": "standard", "similarity": "BM25"},
                "passage_text": {"type": "text", "analyzer": "standard", "similarity": "BM25"},
            }
        },
    }
    es_client.indices.create(index=index_name, body=index_config)
    number_of_docs = passages_dset.num_rows
    progress = tqdm(unit="docs", total=number_of_docs)
    successes = 0

    def passage_generator():
        for passage in passages_dset:
            yield passage

    # create the ES index
    for ok, action in streaming_bulk(
        client=es_client,
        index=index_name,
        actions=passage_generator(),
    ):
        progress.update(1)
        successes += ok

    print("Indexed %d documents" % (successes,))
def load_data(self, filepath):
    """
    loads data from event to target

    :returns: `bool` of status result
    """
    self.filepath = Path(filepath)

    # set class variables from filename
    self.parse_filename()

    inserts = 0
    updates = 0
    noops = 0
    fails = 0

    LOGGER.debug('Received file {}'.format(self.filepath))
    chunk_size = 80000

    # check for shapefile dependencies
    if self.check_shapefile_deps():
        # deactivate old forecasts for current storm name
        self.deactivate_old_forecasts()

        # generate geojson features
        package = self.generate_geojson_features()
        for ok, response in helpers.streaming_bulk(self.ES, package,
                                                   chunk_size=chunk_size,
                                                   request_timeout=30):
            status = response['update']['result']

            if status == 'created':
                inserts += 1
            elif status == 'updated':
                updates += 1
            elif status == 'noop':
                noops += 1
            else:
                LOGGER.warning('Unhandled status code {}'.format(status))

        total = inserts + updates + noops + fails
        LOGGER.info('Inserted package of {} hurricane {} ({} inserts,'
                    ' {} updates, {} no-ops, {} rejects)'.format(
                        total, self.storm_variable, inserts, updates,
                        noops, fails))
        return True
    else:
        LOGGER.debug("All Shapefile dependencies not found. Ignoring "
                     "file...")
        return False
def reindex(self):
    conn = connections.get_connection()
    docs_to_index = [
        ElasticOwnership(**p.to_dict(include_address=True,
                                     include_name_alternatives=True))
        for p in self]
    for response in streaming_bulk(
            conn,
            ({'_index': getattr(d.meta, 'index', d._doc_type.index),
              '_type': d._doc_type.name,
              '_source': d.to_dict()} for d in docs_to_index)):
        pass
def bulk_upsert(self, docs, namespace, timestamp):
    """Insert multiple documents into Elasticsearch."""
    def docs_to_upsert():
        doc = None
        for doc in docs:
            # Remove metadata and redundant _id
            index, doc_type = self._index_and_mapping(namespace)
            if doc['created_at']:
                print(doc['created_at'])
            doc_id = u(doc.pop("_id"))
            document_action = {
                '_index': index,
                '_type': doc_type,
                '_id': doc_id,
                '_source': self._formatter.format_document(doc)
            }
            document_meta = {
                '_index': self.meta_index_name,
                '_type': self.meta_type,
                '_id': doc_id,
                '_source': {
                    'ns': namespace,
                    '_ts': timestamp
                }
            }
            yield document_action
            yield document_meta
        if doc is None:
            raise errors.EmptyDocsError(
                "Cannot upsert an empty sequence of "
                "documents into Elastic Search")
    try:
        kw = {}
        if self.chunk_size > 0:
            kw['chunk_size'] = self.chunk_size

        responses = streaming_bulk(client=self.elastic,
                                   actions=docs_to_upsert(),
                                   **kw)

        for ok, resp in responses:
            if not ok:
                LOG.error("Could not bulk-upsert document "
                          "into ElasticSearch: %r" % resp)
        if self.auto_commit_interval == 0:
            self.commit()
    except errors.EmptyDocsError:
        # This can happen when mongo-connector starts up, there is no
        # config file, but nothing to dump
        pass
def add(self, docs):
    if not self.es.indices.exists(self.index_name):
        self.create_index()
    count = 0
    for result in streaming_bulk(
        self.es, docs, raise_on_error=True, index=self.index_name
    ):
        count += 1
    logger.info("Added %d docs", count)
def submit_elastic_package(self, package, request_size=10000):
    """
    helper function to send an update request to Elasticsearch and
    log the status of the request. Returns True if the upload succeeded.

    :param package: Iterable of bulk API update actions.
    :param request_size: Number of documents to upload per request.

    :returns: `bool` of whether the operation was successful.
    """
    inserts = 0
    updates = 0
    noops = 0
    errors = []

    try:
        for ok, response in streaming_bulk(
            self.Elasticsearch,
            package,
            chunk_size=request_size,
            request_timeout=MSC_PYGEOAPI_ES_TIMEOUT,
            raise_on_error=False,
        ):
            if not ok:
                errors.append(response)
            else:
                status = response['update']['result']

                if status == 'created':
                    inserts += 1
                elif status == 'updated':
                    updates += 1
                elif status == 'noop':
                    noops += 1
                else:
                    LOGGER.error('Unhandled status code {}'.format(status))
                    errors.append(response)
    except BulkIndexError as err:
        LOGGER.error('Unable to perform bulk insert due to: {}'.format(
            err.errors))
        return False

    total = inserts + updates + noops
    LOGGER.info('Inserted package of {} documents ({} inserts, {} updates,'
                ' {} no-ops)'.format(total, inserts, updates, noops))

    if len(errors) > 0:
        LOGGER.warning('{} errors encountered in bulk insert: {}'.format(
            len(errors), errors))
        return False

    return True
def index_messages(indexed_messages, messages):
    num_messages = len(messages)
    successes = 0
    for ok, action in streaming_bulk(client=client,
                                     index=index_name,
                                     actions=extract_es_messages(
                                         indexed_messages, messages)):
        successes += ok
    if successes != num_messages:
        print('Warning!: only %d/%d messages were indexed' % (successes, num_messages))
    print('Processed ' + str(len(messages)) + ' messages')
def version_compatible_streaming_bulk(
    es_client, docs, index, chunk_size, raise_on_error, doc_type
):
    if is_es_version_7(es_client):
        return streaming_bulk(
            es_client,
            docs,
            index=index,
            chunk_size=chunk_size,
            raise_on_error=raise_on_error,
        )
    else:
        return streaming_bulk(
            es_client,
            docs,
            index=index,
            doc_type=doc_type,
            chunk_size=chunk_size,
            raise_on_error=raise_on_error,
        )
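# Hedged usage sketch for `version_compatible_streaming_bulk` above (illustrative,
# not from the original source; the client, docs and index name are assumptions).
# streaming_bulk returns a lazy generator, so the wrapper's result must be iterated
# before any documents are actually sent.
def index_docs_example(es_client, docs):
    failures = []
    for ok, item in version_compatible_streaming_bulk(
        es_client, docs, index="my-index", chunk_size=500,
        raise_on_error=False, doc_type="_doc",
    ):
        if not ok:
            failures.append(item)
    return failures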
def bulk_elasticsearch(r_queue, w_lock, dbs, db_name):
    ES_LOGGER.info("Bulk Host: %s DB: %s Start" % (dbs["db_host"], db_name))
    es = Elasticsearch(dbs["es_colony"], retry_on_timeout=True, max_retries=3, timeout=3600)
    flag = True
    bulks = []
    data_lines_number = 0
    bulk_length = 0
    while flag:
        while not r_queue.empty():
            if bulk_length == 0:
                w_lock.acquire()
            data = r_queue.get()
            data_lines_number += 1
            bulk_length += 1
            if bulk_length >= BULK_LENGTH or r_queue.empty():
                w_lock.release()
            if isinstance(data, str) and data == "False":
                try:
                    ES_LOGGER.info("Bulk Host: %s DB: %s Data: %s"
                                   % (dbs["db_host"], db_name, bulk_length))
                    streaming_bulks = helpers.streaming_bulk(es, bulks, chunk_size=len(bulks))
                    for streaming_bulk in streaming_bulks:
                        if streaming_bulk[0]:
                            pass
                    bulks = []
                except Exception as e:
                    ES_LOGGER.warning(e)
                flag = False
                break
            bulks.append({"_index": dbs["index"], "_type": dbs["doc_type"], "_source": data})
            if bulk_length >= BULK_LENGTH:
                try:
                    ES_LOGGER.info("Bulk Host: %s DB: %s Data: %s"
                                   % (dbs["db_host"], db_name, data_lines_number))
                    streaming_bulks = helpers.streaming_bulk(es, bulks, chunk_size=len(bulks))
                    for streaming_bulk in streaming_bulks:
                        if streaming_bulk[0]:
                            pass
                    bulks = []
                    bulk_length = 0
                except Exception as e:
                    ES_LOGGER.warning("Bulk Error! %s", e)
def upload(informat, name, order, data, elastic, index, typ, sql=False,
           verbose=True, with_id=False):
    """
    Uploads the data to elastic and the database

    sql      if True, the data will be stored in the SQL database as well as
             in ElasticSearch
             if False, the data will only be stored in ElasticSearch
    informat can either be
             xml  - lmf
             json - a single json object or a list of objects
             bulk - a list of json objects annotated with index and type
                    information, as accepted by ElasticSearch
    """
    try:
        # The actual parsing
        data = parse_upload(informat, name, order, data, index, typ,
                            with_id=with_id)
    except Exception:
        print('Error while reading data from %s' % name)
        raise

    ok = 0
    if sql:
        # stream entries one by one to elastic, then update sql db
        # streaming_bulk will notify us at once when an entry fails
        sql_bulk = []
        for res in helpers.streaming_bulk(elastic, data):
            # res is a tuple, res[0]==True
            ansname = 'index' if with_id else 'create'
            _id = res[1].get(ansname).get('_id')
            source = data[ok].get('_source')
            if isinstance(source, dict):
                source = json.dumps(source)
            sql_bulk.append((_id, source, 'admin',
                             'entry automatically added or reloaded', name,
                             'imported'))
            ok += 1
        db_loaded, db_error = db.update_bulk(name, sql_bulk)
        if db_error:
            raise Exception(db_error)
        ok += db_loaded
    else:
        # upload all at once to elastic
        ok, err = helpers.bulk(elastic, data)
        if err:
            msg = ("Error during upload. %s documents successfully uploaded. "
                   "Message: %s.\n")
            raise Exception(msg % (ok, '\n'.join(err)))

    if not ok:
        raise Exception("No data")
        print("Warning. 0 documents uploaded\n", file=sys.stderr)
    if verbose:
        print("Ok. %s documents uploaded\n" % ok)
def _bulk(
    self,
    index: str,
    docs: Generator,
    chunk_size: int,
    max_chunk_bytes: int,
    queue_size: int,
    thread_count: int,
    refresh: bool,
    max_retries: int,
    initial_backoff: int,
    max_backoff: int,
    raise_on_exception: bool,
    raise_on_error: bool,
):
    """Bulk index, update, delete docs to Elasticsearch."""
    # when using multiple threads for poll_db we need to account for other
    # threads performing deletions
    ignore_status: Tuple[int] = (400, 404)

    if ELASTICSEARCH_STREAMING_BULK:
        for _ in helpers.streaming_bulk(
            self.__es,
            docs,
            index=index,
            chunk_size=chunk_size,
            max_chunk_bytes=max_chunk_bytes,
            max_retries=max_retries,
            max_backoff=max_backoff,
            initial_backoff=initial_backoff,
            refresh=refresh,
            raise_on_exception=raise_on_exception,
            raise_on_error=raise_on_error,
        ):
            self.doc_count += 1
    else:
        # parallel bulk consumes more memory and is also more likely
        # to result in 429 errors.
        for _ in helpers.parallel_bulk(
            self.__es,
            docs,
            thread_count=thread_count,
            chunk_size=chunk_size,
            max_chunk_bytes=max_chunk_bytes,
            queue_size=queue_size,
            refresh=refresh,
            raise_on_exception=raise_on_exception,
            raise_on_error=raise_on_error,
            ignore_status=ignore_status,
        ):
            self.doc_count += 1
def handle_command(self, doc, namespace, timestamp):
    # Flush buffer before handling the command
    self.commit()
    db = namespace.split(".", 1)[0]

    if doc.get("dropDatabase"):
        raise errors.OperationFailed(
            "elastic_doc_manager does not support drop database.")
        # dbs = self.command_helper.map_db(db)
        # for _db in dbs:
        #     self.elastic.indices.delete(index=_db.lower())

    if doc.get("renameCollection"):
        raise errors.OperationFailed(
            "elastic_doc_manager does not support renaming a mapping.")

    if doc.get("create"):
        db, coll = self.command_helper.map_collection(db, doc["create"])
        if db and coll:
            # Elasticsearch 7 removed the type concept, so the MongoDB
            # collection name is mapped into the ES index name
            index = '{db}_{tb}'.format(db=db.lower(), tb=coll)
            # self.elastic.indices.put_mapping(
            #     index=index, doc_type='_doc', body={"_source": {"enabled": True}}
            # )
            # self.elastic.indices.put_mapping(
            #     index=index, body={"_source": {"enabled": True}}
            # )
            # bypass collection creation from MongoDB; in ES7 index creation is lazy
            warnings.warn(
                "Bypassing collection creation from MongoDB; in ES7 index "
                "creation is lazy. %s on index %s." % (coll, db))

    if doc.get("drop"):
        db, coll = self.command_helper.map_collection(db, doc["drop"])
        if db and coll:
            # This will delete the items in coll, but not get rid of the
            # mapping.
            warnings.warn("Deleting all documents of type %s on index %s."
                          "The mapping definition will persist and must be"
                          "removed manually." % (coll, db))
            # Elasticsearch 7 removed the type concept, so the MongoDB
            # collection name is mapped into the ES index name
            index = '{db}_{tb}'.format(db=db.lower(), tb=coll)
            responses = streaming_bulk(
                self.elastic,
                (dict(result, _op_type="delete") for result in scan(
                    self.elastic, index=index, doc_type='_doc')),
            )
            for ok, resp in responses:
                if not ok:
                    LOG.error(
                        "Error occurred while deleting ElasticSearch docum"
                        "ent during handling of 'drop' command: %r" % resp)
def bulk_index(self, records_uuids, request_timeout=None, max_chunk_bytes=None):
    """Starts bulk indexing for specified records

    Args:
        records_uuids(list[str]): List of UUIDs of records to reindex.
        request_timeout(int): Maximum time after which es will throw an exception.

    Returns:
        dict: dict with success count and failure list
              (with uuids of failed records)
    """
    if not request_timeout:
        request_timeout = current_app.config["INDEXER_BULK_REQUEST_TIMEOUT"]

    max_chunk_bytes = max_chunk_bytes or 100 * 1024 * 1024  # default ES setting

    result = streaming_bulk(
        es,
        self.bulk_iterator(records_uuids),
        request_timeout=request_timeout,
        raise_on_error=False,
        raise_on_exception=False,
        expand_action_callback=_es7_expand_action,
        max_retries=5,  # retries on error 429
        initial_backoff=10,  # wait initial_backoff * 2^retry_number seconds
        max_chunk_bytes=max_chunk_bytes,
    )

    failures = []
    for action_success, action_data in result:
        if not action_success:
            failures.append({
                "status_code": action_data["index"]["status"],
                "error_type": str(get_value(action_data, "index.error.type", "")),
                "failure_reason": str(get_value(action_data, "index.error.reason", "")),
            })

    number_of_failures = len(failures)
    return {
        "uuids": records_uuids,
        "success_count": len(records_uuids) - number_of_failures,
        "failures_count": number_of_failures,
        "failures": failures,
    }
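# Hedged usage sketch for `bulk_index` above (illustrative, not from the original
# source; the indexer instance and the origin of the UUID list are assumptions).
def reindex_records_example(indexer, record_uuids):
    result = indexer.bulk_index(record_uuids, request_timeout=60)
    # report anything that was rejected by Elasticsearch
    for failure in result["failures"]:
        print("failed: %(status_code)s %(error_type)s" % failure)
    return result["success_count"]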
def bulk_upsert(self, docs, namespace, timestamp):
    """Insert multiple documents into Elasticsearch."""
    def docs_to_upsert():
        if "Groups" in namespace:
            LOG.error("DEBUGG:: es bulk upsert groups _ids: %s"
                      % [x.get("_id") for x in docs])
        doc = None
        for doc in docs:
            # Remove metadata and redundant _id
            index, doc_type = self._index_and_mapping(namespace)
            doc_id = str(doc.pop("_id"))
            document_action = {
                "_index": index,
                "_type": doc_type,
                "_id": doc_id,
                "_source": self._formatter.format_document(doc),
            }
            document_meta = {
                "_index": self.meta_index_name,
                "_type": self.meta_type,
                "_id": doc_id,
                "_source": {"ns": namespace, "_ts": timestamp},
            }
            yield document_action
            yield document_meta
        if doc is None:
            raise errors.EmptyDocsError(
                "Cannot upsert an empty sequence of "
                "documents into Elastic Search")
    try:
        kw = {}
        if self.chunk_size > 0:
            kw["chunk_size"] = self.chunk_size

        responses = streaming_bulk(client=self.elastic,
                                   actions=docs_to_upsert(),
                                   **kw)

        for ok, resp in responses:
            if not ok:
                LOG.error("Could not bulk-upsert document "
                          "into ElasticSearch: %r" % resp)
        if self.auto_commit_interval == 0:
            self.commit()
    except errors.EmptyDocsError:
        # This can happen when mongo-connector starts up, there is no
        # config file, but nothing to dump
        pass
def merge_events(index_alias, events: List[Dict]):
    connection = es.connection
    try:
        # Index
        payload_length = len(events)
        index_start_time = time.time()
        actions = build_actions(events)
        updated, errors = [], []
        success, failed = 0, 0

        for ok, item in streaming_bulk(connection, actions,
                                       index=index_alias, _source=True):
            if not ok:
                errors.append(item)
                failed += 1
            else:
                updated.append(item["update"]["get"]["_source"])
                success += 1

        index_spent = time.time() - index_start_time
        logger.debug(
            f"--- Indexed {payload_length} in {index_spent} seconds, "
            f"Index latency: {(index_spent / payload_length) * 1000}ms ---")

        # Finalize
        if not failed:
            fanout(updated)
            return {"success": success}, 201
        else:
            return {"success": success, "failed": failed, "errors": errors}, 400
    except es_exceptions.ConnectionError:
        return responses.search_backend_unavailable
    except es_exceptions.RequestError as e:
        logger.error(e.info)
        return "Request error", 409
    except bulk_errors.BulkIndexError as e:
        ignorable_errors = ["max_bytes_length_exceeded_exception"]
        for error in e.errors:
            try:
                err = error["update"]["error"]["caused_by"]["type"]
                if err in ignorable_errors:
                    logger.warning(
                        f"Payload caused an error {err} and leek did not index it!")
                    return "Processed", 201
            except KeyError:
                pass
        logger.error(e.errors)
        return "Bulk update error", 409
def test_transport_error_can_becaught(self):
    failing_client = FailingBulkClient(self.client)
    docs = [
        {"_index": "i", "_type": "_doc", "_id": 47, "f": "v"},
        {"_index": "i", "_type": "_doc", "_id": 45, "f": "v"},
        {"_index": "i", "_type": "_doc", "_id": 42, "f": "v"},
    ]
    results = list(
        helpers.streaming_bulk(
            failing_client,
            docs,
            raise_on_exception=False,
            raise_on_error=False,
            chunk_size=1,
        )
    )
    self.assertEquals(3, len(results))
    self.assertEquals([True, False, True], [r[0] for r in results])

    exc = results[1][1]["index"].pop("exception")
    self.assertIsInstance(exc, TransportError)
    self.assertEquals(599, exc.status_code)
    self.assertEquals(
        {
            "index": {
                "_index": "i",
                "_type": "_doc",
                "_id": 45,
                "data": {"f": "v"},
                "error": "TransportError(599, 'Error!')",
                "status": 599,
            }
        },
        results[1][1],
    )
def index_documents(path: str, name: str):
    """Use the streaming bulk API to index some documents"""
    # TODO: inject hostname
    es = Elasticsearch(hosts=[{'host': 'localhost', 'port': 49200}])
    for ok, result in streaming_bulk(es, file_iterable(path, name)):
        action, result = result.popitem()
        doc_id = '/%s/doc/%s' % (name, result['_id'])
        # process the information from ES whether the document has been
        # successfully indexed
        if not ok:
            print('Failed to %s document %s: %r' % (action, doc_id, result))
        else:
            print(doc_id)
def bulk_load(movies):
    all_ok = True
    es_movies = (q.as_elasticsearch_dict() for q in movies)
    # the loop logs any error that occurs while loading a movie
    for ok, result in streaming_bulk(get_client(), es_movies,
                                     index=settings.ES_INDEX,
                                     raise_on_error=False):
        if not ok:
            all_ok = False
            action, result = result.popitem()
            logger.error(FAILED_TO_LOAD_ERROR.format(result['_id'], result))
    return all_ok
def bulk_operation(cls, index=None, client=None, **options):
    for ok, result in streaming_bulk(
        client or cls.client,
        cls._bulk_stream(**options),
        index=index or cls.document._default_index(),
        raise_on_error=False,
        yield_ok=False,
        chunk_size=cls.data_bulk_limit
    ):
        if not ok:
            action, result = result.popitem()
            doc_id = '/%s/_doc/%s' % (index, result['_id'])
            logger.warning('Failed to {} document {}: {}'.format(action, doc_id, result))
def index(self) -> bool:
    if not self.get_available_fields().count():
        self.task.info(self.task, "No series to index in this catalog")
        return False

    index_ok = False
    for success, info in streaming_bulk(self.elastic, self.generate_actions()):
        if not success:
            self.task.info(self.task, 'Indexing error: {}'.format(info))
        else:
            index_ok = True
    return index_ok
def streaming_post_to_es(client, chunk, index_name, job_id=None,
                         doc_type="transaction_mapping"):
    success, failed = 0, 0
    try:
        for ok, item in helpers.streaming_bulk(client, chunk,
                                               index=index_name,
                                               doc_type=doc_type):
            if ok:
                success += 1
            else:
                failed += 1
    except Exception as e:
        print("MASSIVE FAIL!!!\n\n{}\n\n{}".format(str(e)[:5000], "*" * 80))
        raise SystemExit(1)

    printf({"msg": "Success: {}, Fails: {}".format(success, failed),
            "job": job_id, "f": "ES Ingest"})
    return success, failed
def __add_meta_to_original_index(indices: List[str], index_fields: List[str],
                                 show_progress: ShowProgress, query: dict,
                                 scroll_size: int, elastic_wrapper: ElasticCore):
    index_elastic_search = ElasticSearcher(
        indices=indices,
        field_data=index_fields,
        callback_progress=show_progress,
        query=query,
        output=ElasticSearcher.OUT_RAW,
        scroll_size=scroll_size
    )
    index_actions = add_doc_uuid(generator=index_elastic_search)
    for success, info in streaming_bulk(client=elastic_wrapper.es,
                                        actions=index_actions,
                                        refresh="wait_for",
                                        chunk_size=scroll_size,
                                        max_retries=3):
        if not success:
            logging.getLogger(ERROR_LOGGER).exception(json.dumps(info))
def es_index(es, index, gffdb, reader, doctype):
    checkindex(es, index)
    for ok, result in streaming_bulk(es, reader(gffdb), index=index,
                                     doc_type=doctype, chunk_size=chunksize):
        if not ok:
            action, result = result.popitem()
            doc_id = '/%s/commits/%s' % (args.index, result['_id'])
            print('Failed to %s document %s: %r' % (action, doc_id, result))
    es.indices.refresh(index=index)
    return
def index_fixture_data(self, source_filepath, doc_cls):
    added_ids = []
    for ok, result in streaming_bulk(self.connection,
                                     self.prepare_data(source_filepath, doc_cls),
                                     refresh=True):
        action, result = result.popitem()
        if not ok:
            raise Exception("Failed to {} document {}: {}".format(
                action, result["_id"], result))
        else:
            added_ids.append(result["_id"])
    return added_ids
def load(client, path='variants.tsv', index='variants'):
    create_variants_index(client, index)
    for ok, result in streaming_bulk(client, gen_variants(path),
                                     index=index, doc_type='variant',
                                     chunk_size=100):
        action, result = result.popitem()
        doc_id = '/%s/doc/%s' % (index, result['_id'])
        if not ok:
            raise Exception('Failed to %s document %s: %r'
                            % (action, doc_id, result))
def create_docs(client, items):
    success, failed = 0, 0
    for ok, result in streaming_bulk(client, items, index="judgment",
                                     doc_type="doc", max_retries=5,
                                     chunk_size=250):
        if not ok:
            failed += 1
        else:
            success += 1
    print(f"Created {success} documents, {failed} failed.")
def index(self):
    """send csv to ES index"""
    self.logger.info('Setting up Elasticsearch index...')
    elastic = Elasticsearch(host=self.host, port=self.port, timeout=10000)
    try:
        self.logger.info('Creating index %s...' % self.index_name)
        elastic.indices.create(self.index_name, self.mapping)
    except RequestError:
        self.logger.info('Index already exists, skipping...')

    self.logger.info('Indexing %s...' % self.file)
    act = (self.format(choices, cid=cid)
           for cid, choices in self.csv_generator())
    list(streaming_bulk(elastic, actions=act))
def save_es_actions(self, datasets_updates):
    dataset_model = apps.get_model('datasets.Dataset')
    for dataset_id, data in datasets_updates.items():
        dataset_model.objects.filter(pk=dataset_id).update(**data)
        try:
            self.views_es_actions['datasets'].append({
                '_op_type': 'update',
                '_index': settings.ELASTICSEARCH_INDEX_NAMES['datasets'],
                '_type': 'doc',
                '_id': dataset_id,
                'doc': data
            })
        except KeyError:
            self.views_es_actions['datasets'] = [{
                '_op_type': 'update',
                '_index': settings.ELASTICSEARCH_INDEX_NAMES['datasets'],
                '_type': 'doc',
                '_id': dataset_id,
                'doc': data
            }]

    es_actions = []
    for view_actions in self.views_es_actions.values():
        es_actions.extend(view_actions)

    # streaming_bulk returns a lazy generator, so it must be consumed for the
    # update requests to actually be sent
    for _ in streaming_bulk(connections.get_connection(), es_actions,
                            raise_on_error=False, raise_on_exception=False,
                            max_retries=2):
        pass
def bulk_upsert(self, docs):
    """Insert multiple documents into Elasticsearch."""
    def docs_to_upsert():
        doc = None
        for doc in docs:
            # Remove metadata and redundant _id
            index = doc.pop("ns")
            doc_id = u(doc.pop("_id"))
            timestamp = doc.pop("_ts")
            document_action = {
                "_index": index,
                "_type": self.doc_type,
                "_id": doc_id,
                "_source": self._formatter.format_document(doc)
            }
            document_meta = {
                "_index": self.meta_index_name,
                "_type": self.meta_type,
                "_id": doc_id,
                "_source": {
                    "ns": index,
                    "_ts": timestamp
                }
            }
            yield document_action
            yield document_meta
        if not doc:
            raise errors.EmptyDocsError(
                "Cannot upsert an empty sequence of "
                "documents into Elastic Search")
    try:
        kw = {}
        if self.chunk_size > 0:
            kw['chunk_size'] = self.chunk_size

        responses = streaming_bulk(client=self.elastic,
                                   actions=docs_to_upsert(),
                                   **kw)

        for ok, resp in responses:
            if not ok:
                logging.error("Could not bulk-upsert document "
                              "into ElasticSearch: %r" % resp)
        if self.auto_commit_interval == 0:
            self.commit()
    except errors.EmptyDocsError:
        # This can happen when mongo-connector starts up, there is no
        # config file, but nothing to dump
        pass