Example #1
    def test_all_documents_get_inserted(self):
        docs = [{"answer": x, '_id': x} for x in range(100)]
        for ok, item in helpers.streaming_bulk(self.client, docs, index='test-index', doc_type='answers', refresh=True):
            self.assertTrue(ok)

        self.assertEquals(100, self.client.count(index='test-index', doc_type='answers')['count'])
        self.assertEquals({"answer": 42}, self.client.get(index='test-index', doc_type='answers', id=42)['_source'])
Example #2
    def test_rejected_documents_are_retried_at_most_max_retries_times(self):
        failing_client = FailingBulkClient(
            self.client, fail_at=(1, 2), fail_with=TransportError(429, "Rejected!", {})
        )

        docs = [
            {"_index": "i", "_type": "_doc", "_id": 47, "f": "v"},
            {"_index": "i", "_type": "_doc", "_id": 45, "f": "v"},
            {"_index": "i", "_type": "_doc", "_id": 42, "f": "v"},
        ]
        results = list(
            helpers.streaming_bulk(
                failing_client,
                docs,
                raise_on_exception=False,
                raise_on_error=False,
                chunk_size=1,
                max_retries=1,
                initial_backoff=0,
            )
        )
        self.assertEquals(3, len(results))
        self.assertEquals([False, True, True], [r[0] for r in results])
        self.client.indices.refresh(index="i")
        res = self.client.search(index="i")
        self.assertEquals({"value": 2, "relation": "eq"}, res["hits"]["total"])
        self.assertEquals(4, failing_client._called)
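The FailingBulkClient helper used in this test (and in Example #4 below) is not shown on this page. A minimal sketch of such a wrapper, assuming it only needs to proxy bulk() calls and fail at chosen call numbers (names and defaults here are assumptions, not the test suite's actual code):

from elasticsearch import TransportError


class FailingBulkClient(object):
    # Hypothetical test double: proxies bulk() and raises at selected call numbers.
    def __init__(self, client, fail_at=(2,), fail_with=TransportError(599, "Error!", {})):
        self.client = client
        self.transport = client.transport  # the bulk helpers read client.transport for serialization
        self._fail_at = fail_at
        self._fail_with = fail_with
        self._called = 0

    def bulk(self, *args, **kwargs):
        self._called += 1
        if self._called in self._fail_at:
            raise self._fail_with
        return self.client.bulk(*args, **kwargs)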
Example #3
def load_data(client, data, index='booklist'):

    create_index(client, index)

    list_name = '1001books'

    # we let the streaming bulk continuously process the documents as they
    # come in - since the `parse_list` function is a generator this will
    # avoid loading the whole list into memory
    for ok, result in streaming_bulk(
            client,
            parse_list(data, list_name),
            index=index,
            doc_type='book',
            chunk_size=50  # keep the batch sizes small for appearances only
        ):
        action, result = result.popitem()
        doc_id = '/%s/%s' % (index, result['_id'])
        # process the information from ES whether the document has been
        # successfully indexed
        if not ok:
            print('Failed to %s document %s: %r' % (action, doc_id, result))
        else:
            print('Successfully indexed %s' % doc_id)
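The parse_list generator is not included in this snippet; a hypothetical sketch of what it might yield (the field names are assumptions for illustration):

def parse_list(data, list_name):
    # assumed: `data` is an iterable of dicts describing books
    for position, book in enumerate(data, start=1):
        yield {
            '_id': position,
            'list': list_name,
            'title': book.get('title'),
            'author': book.get('author'),
            'year': book.get('year'),
        }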
Example #4
    def test_transport_error_can_be_caught(self):
        failing_client = FailingBulkClient(self.client)
        docs = [
            {'_index': 'i', '_type': 't', '_id': 47, 'f': 'v'},
            {'_index': 'i', '_type': 't', '_id': 45, 'f': 'v'},
            {'_index': 'i', '_type': 't', '_id': 42, 'f': 'v'},
        ]

        results = list(helpers.streaming_bulk(failing_client, docs, raise_on_exception=False, raise_on_error=False, chunk_size=1))
        self.assertEquals(3, len(results))
        self.assertEquals([True, False, True], [r[0] for r in results])

        exc = results[1][1]['index'].pop('exception')
        self.assertIsInstance(exc, TransportError)
        self.assertEquals(599, exc.status_code)
        self.assertEquals(
            {
                'index': {
                    '_index': 'i',
                    '_type': 't',
                    '_id': 45,
                    'data': {'f': 'v'},
                    'error': "TransportError(599, 'Error!')",
                    'status': 599
                }
            },
            results[1][1]
        )
Example #5
    def _index_loop(self):
        try:
            while not self.should_stop:
                msgs = []
                actions = self._actions(250, msgs)

                stream = helpers.streaming_bulk(
                    self.es_client,
                    actions,
                    max_chunk_bytes=self.MAX_CHUNK_BYTES,
                    raise_on_error=False,
                )

                start = time.time()
                for (ok, resp), msg in zip(stream, msgs):
                    if not ok and not (resp.get('delete') and resp['delete']['status'] == 404):
                        raise ValueError(ok, resp, msg)
                    assert len(resp.values()) == 1
                    _id = list(resp.values())[0]['_id']
                    assert msg.payload['ids'] == [util.IDObfuscator.decode_id(_id)], '{} {}'.format(msg.payload, util.IDObfuscator.decode_id(_id))
                    msg.ack()
                if len(msgs):
                    logger.info('%r: Indexed %d documents in %.02fs', self, len(msgs), time.time() - start)
                else:
                    logger.debug('%r: Received no messages for %.02fs', self, time.time() - start)
        except Exception as e:
            client.captureException()
            logger.exception('%r: _index_loop encountered an unexpected error', self)
            self.stop()
Example #6
    def record_events(self, events):

        def _build_bulk_index(event_list):
            for ev in event_list:
                traits = {t.name: t.value for t in ev.traits}
                yield {'_op_type': 'create',
                       '_index': '%s_%s' % (self.index_name,
                                            ev.generated.date().isoformat()),
                       '_type': ev.event_type,
                       '_id': ev.message_id,
                       '_source': {'timestamp': ev.generated.isoformat(),
                                   'traits': traits,
                                   'raw': ev.raw}}

        error = None
        for ok, result in helpers.streaming_bulk(
                self.conn, _build_bulk_index(events)):
            if not ok:
                __, result = result.popitem()
                if result['status'] == 409:
                    LOG.info(_LI('Duplicate event detected, skipping it: %s'),
                             result)
                else:
                    LOG.exception(_LE('Failed to record event: %s'), result)
                    error = storage.StorageUnknownWriteError(result)

        if self._refresh_on_write:
            self.conn.indices.refresh(index='%s_*' % self.index_name)
            while self.conn.cluster.pending_tasks(local=True)['tasks']:
                pass
        if error:
            raise error
Example #7
    def _index_all_blogitems(self):
        iterator = BlogItem.objects.all()
        category_names = dict((x.id, x.name) for x in Category.objects.all())
        categories = defaultdict(list)
        for e in BlogItem.categories.through.objects.all():
            categories[e.blogitem_id].append(category_names[e.category_id])

        es = connections.get_connection()
        report_every = 100
        count = 0
        doc_type_name = _get_doc_type_name(BlogItem)
        t0 = time.time()
        for success, doc in streaming_bulk(
            es,
            (m.to_search(all_categories=categories).to_dict(True) for m in iterator),
            index=settings.ES_BLOG_ITEM_INDEX,
            doc_type=doc_type_name,
        ):
            if not success:
                print("NOT SUCCESS!", doc)
            count += 1
            if not count % report_every:
                print(count)
        t1 = time.time()

        self.out("DONE Indexing {} blogitems in {} seconds".format(count, t1 - t0))
Example #8
def flush_to_es():
    """
    Flushes a stream of messages to Elasticsearch using bulk indexing.

    Uses a generator to pull messages off the queue and passes it as an
    iterable to the streaming_bulk helper. streaming_bulk is itself a
    generator that yields per-message results, which are used to ack
    messages on the queue once they have been flushed. The bulk size is
    controlled by the module-level BULK_SIZE constant.
    """

    while True:

        try:
            es_client = es_handler.connection
            ack_list = list()
            actions = get_queue_stream(ack_list)
            bulker = es_helpers.streaming_bulk(
                es_client, actions, chunk_size=BULK_SIZE)
            _LOG.error("Post flush")

            for response in bulker:
                msg = ack_list.pop(0)
                msg_ok = response[0]

                if msg_ok:
                    msg.ack()

        except Exception as ex:
            _LOG.exception(ex)
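The get_queue_stream generator referenced above is not shown. A rough sketch, assuming a module-level queue.Queue that another thread fills with message objects exposing a body dict and an ack() method:

import queue

message_queue = queue.Queue()  # assumed: filled by a consumer thread elsewhere


def get_queue_stream(ack_list, timeout=5):
    # Hypothetical helper: pull messages off the queue, remember them for
    # acking, and yield one index action per message for streaming_bulk.
    while True:
        try:
            msg = message_queue.get(timeout=timeout)
        except queue.Empty:
            return
        ack_list.append(msg)
        yield {"_index": "messages", "_op_type": "index", "_source": msg.body}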
Example #9
def load_repo(client, path=None, index='git'):
    """
    Parse a git repository with all its commits and load it into elasticsearch
    using `client`. If the index doesn't exist it will be created.
    """
    path = dirname(dirname(abspath(__file__))) if path is None else path
    repo_name = basename(path)
    repo = git.Repo(path)

    create_git_index(client, index)

    # we let the streaming bulk continuously process the commits as they come
    # in - since the `parse_commits` function is a generator this will avoid
    # loading all the commits into memory
    for ok, result in streaming_bulk(
            client,
            parse_commits(repo.refs.master.commit, repo_name),
            index=index,
            doc_type='doc',
            chunk_size=50 # keep the batch sizes small for appearances only
        ):
        action, result = result.popitem()
        doc_id = '/%s/doc/%s' % (index, result['_id'])
        # process the information from ES whether the document has been
        # successfully indexed
        if not ok:
            print('Failed to %s document %s: %r' % (action, doc_id, result))
        else:
            print(doc_id)
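The parse_commits generator is not part of this snippet; a sketch of such a generator built on GitPython, with field names chosen for illustration:

def parse_commits(head, name):
    # Walk the commit history starting at `head` and yield one document per commit.
    for commit in head.traverse():
        yield {
            '_id': commit.hexsha,
            'repository': name,
            'committed_date': commit.committed_datetime,
            'committer': {
                'name': commit.committer.name,
                'email': commit.committer.email,
            },
            'description': commit.message,
        }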
Example #10
def load_report(client, report_path):
    client.create(
        index='report',
        doc_type='header',
        body={
            "mappings": {
                "header": {
                    "_timestamp": {
                        "enabled": True,
                        "type": "date",
                        "format": "yyyy-MM-dd HH:mm:ss",
                        "store": True,
                        "path": "timestamp"
                    },
                    "properties": header_properties
                }
            }
        },
        ignore=409  # 409 - conflict
    )
    for ok, result in streaming_bulk(
            client,
            parse_report(report_path),
            index="report",
            doc_type="header"):
        action, result = result.popitem()
        doc_id = '/report/%s' % (result['_id'])
        if not ok:
            print('Failed to %s document %s: %r' % (action, doc_id, result))
        else:
            print(doc_id)
        client.indices.refresh(index='report')
Example #11
 def test_actions_remain_unchanged(self):
     actions = [{"_id": 1}, {"_id": 2}]
     for ok, item in helpers.streaming_bulk(
         self.client, actions, index="test-index"
     ):
         self.assertTrue(ok)
     self.assertEquals([{"_id": 1}, {"_id": 2}], actions)
Example #12
    def record_events(self, events):

        def _build_bulk_index(event_list):
            for ev in event_list:
                traits = {t.name: t.value for t in ev.traits}
                yield {'_op_type': 'create',
                       '_index': '%s_%s' % (self.index_name,
                                            ev.generated.date().isoformat()),
                       '_type': ev.event_type,
                       '_id': ev.message_id,
                       '_source': {'timestamp': ev.generated.isoformat(),
                                   'traits': traits,
                                   'raw': ev.raw}}

        problem_events = []
        for ok, result in helpers.streaming_bulk(
                self.conn, _build_bulk_index(events)):
            if not ok:
                __, result = result.popitem()
                if result['status'] == 409:
                    problem_events.append((models.Event.DUPLICATE,
                                           result['_id']))
                else:
                    problem_events.append((models.Event.UNKNOWN_PROBLEM,
                                           result['_id']))

        if self._refresh_on_write:
            self.conn.indices.refresh(index='%s_*' % self.index_name)
            while self.conn.cluster.pending_tasks(local=True)['tasks']:
                pass
        return problem_events
Example #13
    def index(self, annotation_ids=None):
        """
        Reindex annotations.

        :param annotation_ids: a list of ids to reindex, reindexes all when `None`.
        :type annotation_ids: collection

        :returns: a set of errored ids
        :rtype: set
        """
        if not annotation_ids:
            annotations = _all_annotations(session=self.session,
                                           windowsize=PG_WINDOW_SIZE)
        else:
            annotations = _filtered_annotations(session=self.session,
                                                ids=annotation_ids)

        # Report indexing status as we go
        annotations = _log_status(annotations)

        indexing = es_helpers.streaming_bulk(self.es_client.conn, annotations,
                                             chunk_size=ES_CHUNK_SIZE,
                                             raise_on_error=False,
                                             expand_action_callback=self._prepare)
        errored = set()
        for ok, item in indexing:
            if not ok:
                status = item[self.op_type]

                was_doc_exists_err = 'document already exists' in status['error']
                if self.op_type == 'create' and was_doc_exists_err:
                    continue

                errored.add(status['_id'])
        return errored
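The _prepare method passed as expand_action_callback is not shown. For the index/create case, such a callback returns an (action, document) pair for each item streamed to Elasticsearch; a hypothetical sketch (index name and fields are assumptions):

    def _prepare(self, annotation):
        # Hypothetical expand_action_callback: turn a plain annotation dict
        # into the (action, body) pair that streaming_bulk serializes and sends.
        action = {self.op_type: {'_index': 'annotations', '_id': annotation['id']}}
        return action, annotation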
Example #14
def main():
    es = Elasticsearch([{'host' : sys.argv[1], 'port' : sys.argv[2]}])
    index = sys.argv[3]
    filenames = os.listdir('.')
    for f in filenames:
        if f.endswith('.csv'):
            for a, b in streaming_bulk(es, get_docs(f, index)):
                print(a, b)
Example #15
 def index(self, points):
     for p in points:
         p['_index'] = self.config['indexer']['idx_name']
         p['_type'] = 'policy-metric'
     results = helpers.streaming_bulk(self.client, points)
     for status, r in results:
         if not status:
             log.debug("index err result %s", r)
Example #16
def django_import():
    # es.indices.delete(index='traffic', ignore=404)
    # TrafficReport.init()
    i = 0
    for ok, info in streaming_bulk(es, get_provoz(), doc_type="traffic_report", index="traffic"):
        i += 1
        if i % 1000 == 0:
            print(i, "documents done")
Example #17
def index_model(self, model_name, ids, es_url=None, es_index=None):
    # TODO This method should not have to exist anymore
    es_client = Elasticsearch(es_url or settings.ELASTICSEARCH['URL'], retry_on_timeout=True, timeout=settings.ELASTICSEARCH['TIMEOUT'])
    action_gen = indexing.ElasticsearchActionGenerator([settings.ELASTICSEARCH['INDEX']], [indexing.FakeMessage(model_name, ids)])
    stream = helpers.streaming_bulk(es_client, (x for x in action_gen if x), max_chunk_bytes=10 * 1024 ** 2, raise_on_error=False)

    for ok, resp in stream:
        if not ok and not (resp.get('delete') and resp['delete']['status'] == 404):
            raise ValueError(resp)
Example #18
def log_to_elasticsearch(data, params, client, index='test', doc_type='test', chunk_size=10):
    try:
        data_gen = ({"_index": index, "_type": doc_type, "_op_type": "index", "_id": d['id'], "source": d}
                    for d in data)
        for a, b in streaming_bulk(client, data_gen, chunk_size=chunk_size):
            pass

        return True
    except Exception:
        return False
Example #19
 def streaming_bulk():
     results = list(helpers.streaming_bulk(
         failing_client,
         [{"a": 42}, {"a": 39}],
         raise_on_exception=True,
         max_retries=3,
         initial_backoff=0
     ))
     return results
Example #20
 def index(self):
     response = helpers.streaming_bulk(self.es, self._actions, chunk_size=self._configuration["chunk_size"], raise_on_error=self._configuration["raise_on_error"], raise_on_exception=self._configuration["raise_on_exception"])
     for ok, result in response:
         action, result = result.popitem()
         doc_id = '/commits/%s' % (result['_id'])
         if not ok:
             self.logger.error("Failed to insert %s %s %s", action, doc_id, result)
         else:
             self.logger.warning("Success %d", ok)
Example #21
    def index_documents(self):
        models = list(get_indexed_models())

        for model in models:
            self.save_mapping(model)

            model_instances = model.get_indexable().iterator()
            docs = (self.to_indexable_dict(d) for d in model_instances)
            for ok, info in streaming_bulk(self.es, docs):
                print("  Document with id %s indexed." % info['index']['_id'])
Example #22
def bulk_load(docs_to_index):
    conn = connections.get_connection()
    index = NameVariant._doc_type.index

    for response in streaming_bulk(
            conn,
            docs_to_index,
            index=index,
            doc_type=NameVariant._doc_type.name):
        pass
Example #23
    def bulk_upsert(self, docs, namespace, timestamp):
        """Insert multiple documents into Elasticsearch."""
        def docs_to_upsert():
            doc = None
            for doc in docs:
                # Remove metadata and redundant _id
                index, doc_type = self._index_and_mapping(namespace)
                doc_id = u(doc.pop("_id"))
                document_action = {
                    "_index": index,
                    "_type": doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(doc)
                }
                document_meta = {
                    "_index": self.meta_index_name,
                    "_type": self.meta_type,
                    "_id": doc_id,
                    "_source": {
                        "ns": index,
                        "_ts": timestamp
                    }
                }

                parent_id = self._get_parent_id(doc_type, doc)
                if parent_id is not None:
                    document_action["_parent"] = parent_id
                    document_action["_source"] = self._formatter.format_document(doc)

                yield document_action
                yield document_meta
            if doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")
        try:
            kw = {}
            if self.chunk_size > 0:
                kw['chunk_size'] = self.chunk_size

            responses = streaming_bulk(client=self.elastic,
                                       actions=docs_to_upsert(),
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    LOG.error(
                        "Could not bulk-upsert document "
                        "into ElasticSearch: %r" % resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass
Example #24
    def test_all_documents_get_inserted(self):
        docs = [{"answer": x, "_id": x} for x in range(100)]
        for ok, item in helpers.streaming_bulk(
            self.client, docs, index="test-index", refresh=True
        ):
            self.assertTrue(ok)

        self.assertEquals(100, self.client.count(index="test-index")["count"])
        self.assertEquals(
            {"answer": 42}, self.client.get(index="test-index", id=42)["_source"]
        )
Example #25
    def index_all(cls, index_name, using=None, **kwargs):
        def actions_generator():
            for obj in cls.index_queryset().iterator():
                elastic_data = cls.from_django(obj).to_dict(include_meta=True)
                elastic_data['_index'] = index_name
                yield elastic_data

        client = connections.get_connection(using or cls._doc_type.using)
        cls.init(index_name)
        for ok, item in streaming_bulk(client, actions_generator(), chunk_size=100, **kwargs):
            yield ok, item
Example #26
    def reindex(self):
        conn = connections.get_connection()
        docs_to_index = [
            ElasticAddress(**p.to_dict())
            for p in self]

        for response in streaming_bulk(
                conn, ({'_index': getattr(d.meta, 'index', d._doc_type.index),
                        '_type': d._doc_type.name,
                        '_source': d.to_dict()} for d in docs_to_index)):
            pass
Example #27
 def index_all(self, docs):
     actions = map(self.make_index_action, docs)
     bulk_results = streaming_bulk(
         self.elastic,
         actions,
         raise_on_error=False,
         raise_on_exception=False,
     )
     for is_successful, response in bulk_results:
         if not is_successful:
             print("Error indexing a document: %s" % str(response))
Example #28
def bulk_load(questions):
    all_ok = True
    es_questions = (q.as_elasticsearch_dict() for q in questions)
    for ok, result in streaming_bulk(get_client(),
                                     es_questions,
                                     index=settings.ES_INDEX,
                                     raise_on_error=False):
        if not ok:
            all_ok = False
            action, result = result.popitem()
            logger.error(FAILED_TO_LOAD_ERROR.format(result['_id'], result))
    return all_ok
Example #29
    def _index_loop(self):
        try:
            while not self.should_stop:
                msgs = []
                actions = self._actions(250, msgs)
                tries = 0

                while not self.should_stop:
                    stream = helpers.streaming_bulk(
                        self.es_client,
                        actions,
                        max_chunk_bytes=self.MAX_CHUNK_BYTES,
                        raise_on_error=False,
                    )

                    start = time.time()
                    try:
                        for (ok, resp), msg in zip(stream, msgs):
                            if not ok and not (resp.get('delete')
                                               and resp['delete']['status']
                                               == 404):
                                raise ValueError(ok, resp, msg)
                            assert len(resp.values()) == 1
                            _id = list(resp.values())[0]['_id']
                            assert msg.payload['ids'] == [
                                util.IDObfuscator.decode_id(_id)
                            ], '{} {}'.format(msg.payload,
                                              util.IDObfuscator.decode_id(_id))
                            msg.ack()
                        if len(msgs):
                            logger.info('%r: Indexed %d documents in %.02fs',
                                        self, len(msgs),
                                        time.time() - start)
                        else:
                            logger.debug('%r: Received no messages for %.02fs',
                                         self,
                                         time.time() - start)
                        break
                    except ConnectionTimeout:
                        if tries >= self.TIMEOUT_RETRIES:
                            raise
                        tries += 1
                        logger.warning(
                            'Connection to elasticsearch timed out. Trying again after %s sec...',
                            self.TIMEOUT_INTERVAL)
                        time.sleep(self.TIMEOUT_INTERVAL)
                        continue
        except Exception as e:
            client.captureException()
            logger.exception('%r: _index_loop encountered an unexpected error',
                             self)
            self.should_stop = True
            raise SystemExit(1)
Example #30
 def _copy_data(self):
     ss_kw = {}
     # sort
     if self.source_sort:
         ss_kw['sort'] = self.source_sort
     scroll = self.source_es.search(index=self.source_index,
                                    scroll='1m',
                                    search_type='scan',
                                    size=self.bulk_size,
                                    version=True,
                                    timeout='60s',
                                    **ss_kw)
     sid = scroll['_scroll_id']
     total_size = scroll['hits']['total']
     hits_size = total_size
     dealt_size = 0
     print("docs: " + str(total_size))
     self.logger.info("docs: " + str(total_size))
     suffix = '%(percent)d%% - %(index)d [%(elapsed_td)s / %(eta_td)s]'
     bar = ShadyBar("clone", suffix=suffix, max=total_size)
     while (hits_size > 0):
         scroll = self.source_es.scroll(scroll_id=sid, scroll='1m')
         sid = scroll['_scroll_id']
         hits = scroll['hits']['hits']
         hits_size = len(hits)
         actions = self._bulk_hits(hits)
         if (len(actions) > 0):
             kw = {}
             kw['timeout'] = '60s'
             res = []
             try:
                 res = streaming_bulk(client=self.target_es,
                                      actions=actions,
                                      **kw)
             except BulkIndexError as err:
                 print(err)
                 pass
             okNum = 0
             for ok, re in res:
                 if not ok:
                     print(re)
                 else:
                     okNum += 1
             # refresh index
             if (okNum > 0):
                 self.target_es.indices.refresh(index=self.target_index)
         # dealt size
         dealt_size += hits_size
         bar.goto(dealt_size)
         self.logger.info("dealt: " + str(dealt_size) + " / " +
                          str(total_size))
     print('\nDone !')
     self.logger.info("Done ! \n\n")
Example #31
def make_es_index_snippets(es_client,
                           passages_dset,
                           index_name="english_wiki_kilt_snippets_100w"):
    index_config = {
        "settings": {
            "number_of_shards": 1,
            "analysis": {
                "analyzer": {
                    "stop_standard": {
                        "type": "standard",
                        "stopwords": "_english_"
                    }
                }
            },
        },
        "mappings": {
            "properties": {
                "article_title": {
                    "type": "text",
                    "analyzer": "standard",
                    "similarity": "BM25"
                },
                "section_title": {
                    "type": "text",
                    "analyzer": "standard",
                    "similarity": "BM25"
                },
                "passage_text": {
                    "type": "text",
                    "analyzer": "standard",
                    "similarity": "BM25"
                },
            }
        },
    }
    es_client.indices.create(index=index_name, body=index_config)
    number_of_docs = passages_dset.num_rows
    progress = tqdm(unit="docs", total=number_of_docs)
    successes = 0

    def passage_generator():
        for passage in passages_dset:
            yield passage

    # create the ES index
    for ok, action in streaming_bulk(
            client=es_client,
            index=index_name,
            actions=passage_generator(),
    ):
        progress.update(1)
        successes += ok
    print("Indexed %d documents" % (successes, ))
Example #32
    def test_all_documents_get_inserted(self):
        docs = [{"answer": x, "_id": x} for x in range(100)]
        for ok, item in helpers.streaming_bulk(self.client,
                                               docs,
                                               index="test-index",
                                               refresh=True):
            self.assertTrue(ok)

        self.assertEquals(100, self.client.count(index="test-index")["count"])
        self.assertEquals({"answer": 42},
                          self.client.get(index="test-index",
                                          id=42)["_source"])
Example #33
    def load_data(self, filepath):
        """
        loads data from event to target
        :returns: `bool` of status result
        """

        self.filepath = Path(filepath)

        # set class variables from filename
        self.parse_filename()

        inserts = 0
        updates = 0
        noops = 0
        fails = 0

        LOGGER.debug('Received file {}'.format(self.filepath))
        chunk_size = 80000

        # check for shapefile dependencies
        if self.check_shapefile_deps():

            # deactivate old forecasts for current storm name
            self.deactivate_old_forecasts()

            # generate geojson features
            package = self.generate_geojson_features()
            for ok, response in helpers.streaming_bulk(self.ES,
                                                       package,
                                                       chunk_size=chunk_size,
                                                       request_timeout=30):
                status = response['update']['result']

                if status == 'created':
                    inserts += 1
                elif status == 'updated':
                    updates += 1
                elif status == 'noop':
                    noops += 1
                else:
                    LOGGER.warning('Unhandled status code {}'.format(status))

            total = inserts + updates + noops + fails
            LOGGER.info('Inserted package of {} hurricane {} ({} inserts,'
                        ' {} updates, {} no-ops, {} rejects)'.format(
                            total, self.storm_variable, inserts, updates,
                            noops, fails))
            return True

        else:
            LOGGER.debug("All Shapefile dependencies not found. Ignoring "
                         "file...")
            return False
Example #34
    def reindex(self):
        conn = connections.get_connection()
        docs_to_index = [
            ElasticOwnership(**p.to_dict(include_address=True,
                             include_name_alternatives=True))
            for p in self]

        for response in streaming_bulk(
                conn, ({'_index': getattr(d.meta, 'index', d._doc_type.index),
                        '_type': d._doc_type.name,
                        '_source': d.to_dict()} for d in docs_to_index)):
            pass
Example #35
    def bulk_upsert(self, docs, namespace, timestamp):
        """Insert multiple documents into Elasticsearch."""
        def docs_to_upsert():
            doc = None
            for doc in docs:
                # Remove metadata and redundant _id
                index, doc_type = self._index_and_mapping(namespace)

                if doc['created_at']:
                    print(doc['created_at'])

                doc_id = u(doc.pop("_id"))
                document_action = {
                    '_index': index,
                    '_type': doc_type,
                    '_id': doc_id,
                    '_source': self._formatter.format_document(doc)
                }
                document_meta = {
                    '_index': self.meta_index_name,
                    '_type': self.meta_type,
                    '_id': doc_id,
                    '_source': {
                        'ns': namespace,
                        '_ts': timestamp
                    }
                }
                yield document_action
                yield document_meta
            if doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")

        try:
            kw = {}
            if self.chunk_size > 0:
                kw['chunk_size'] = self.chunk_size

            responses = streaming_bulk(client=self.elastic,
                                       actions=docs_to_upsert(),
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    LOG.error("Could not bulk-upsert document "
                              "into ElasticSearch: %r" % resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass
Example #36
 def add(self, docs):
     if not self.es.indices.exists(self.index_name):
         self.create_index()
     count = 0
     for result in streaming_bulk(
         self.es,
         docs,
         raise_on_error=True,
         index=self.index_name
     ):
         count += 1
     logger.info("Added %d docs", count)
Example #37
    def submit_elastic_package(self, package, request_size=10000):
        """
        helper function to send an update request to Elasticsearch and
        log the status of the request. Returns True if the upload succeeded.

        :param package: Iterable of bulk API update actions.
        :param request_size: Number of documents to upload per request.

        :returns: `bool` of whether the operation was successful.
        """

        inserts = 0
        updates = 0
        noops = 0
        errors = []

        try:
            for ok, response in streaming_bulk(
                    self.Elasticsearch,
                    package,
                    chunk_size=request_size,
                    request_timeout=MSC_PYGEOAPI_ES_TIMEOUT,
                    raise_on_error=False,
            ):
                if not ok:
                    errors.append(response)
                else:
                    status = response['update']['result']

                    if status == 'created':
                        inserts += 1
                    elif status == 'updated':
                        updates += 1
                    elif status == 'noop':
                        noops += 1
                    else:
                        LOGGER.error('Unhandled status code {}'.format(status))
                        errors.append(response)
        except BulkIndexError as err:
            LOGGER.error('Unable to perform bulk insert due to: {}'.format(
                err.errors))
            return False

        total = inserts + updates + noops
        LOGGER.info('Inserted package of {} documents ({} inserts, {} updates,'
                    ' {} no-ops)'.format(total, inserts, updates, noops))

        if len(errors) > 0:
            LOGGER.warning('{} errors encountered in bulk insert: {}'.format(
                len(errors), errors))
            return False

        return True
Example #38
def index_messages(indexed_messages, messages):
    num_messages = len(messages)
    successes = 0
    for ok, action in streaming_bulk(client=client,
                                     index=index_name,
                                     actions=extract_es_messages(
                                         indexed_messages, messages)):
        successes += ok
    if (successes != num_messages):
        print('Warning!: only %d/%d messages were indexed' %
              (successes, num_messages))
    print('Processed ' + str(len(messages)) + ' messages')
Example #39
def version_compatible_streaming_bulk(
    es_client, docs, index, chunk_size, raise_on_error, doc_type
):

    if is_es_version_7(es_client):
        return streaming_bulk(
            es_client,
            docs,
            index=index,
            chunk_size=chunk_size,
            raise_on_error=raise_on_error,
        )
    else:
        return streaming_bulk(
            es_client,
            docs,
            index=index,
            doc_type=doc_type,
            chunk_size=chunk_size,
            raise_on_error=raise_on_error,
        )
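The is_es_version_7 helper is project-specific and not shown; one way to sketch it, assuming it inspects the version string reported by the cluster:

def is_es_version_7(es_client):
    # Hypothetical sketch: compare the major version reported by the server.
    version_number = es_client.info()["version"]["number"]  # e.g. "7.10.2"
    return int(version_number.split(".")[0]) >= 7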
Example #40
def bulk_elasticsearch(r_queue, w_lock, dbs, db_name):
    ES_LOGGER.info("Bulk Host: %s DB: %s Start" % (dbs["db_host"], db_name))
    es = Elasticsearch(dbs["es_colony"], retry_on_timeout=True, max_retries=3, timeout=3600)
    flag = True
    bulks = []
    data_lines_number = 0
    bulk_length = 0
    while flag:
        while not r_queue.empty():
            if bulk_length == 0:
                w_lock.acquire()
            data = r_queue.get()
            data_lines_number += 1
            bulk_length += 1
            if bulk_length >= BULK_LENGTH or r_queue.empty():
                w_lock.release()
            if isinstance(data, str) and data == "False":
                try:
                    ES_LOGGER.info("Bulk Host: %s DB: %s Data: %s" % (dbs["db_host"], db_name, bulk_length))
                    streaming_bulks = helpers.streaming_bulk(es, bulks, chunk_size=len(bulks))
                    for streaming_bulk in streaming_bulks:
                        if streaming_bulk[0]:
                            pass
                    bulks = []
                except Exception as e:
                    ES_LOGGER.warning(e)
                flag = False
                break
            bulks.append({"_index": dbs["index"], "_type": dbs["doc_type"], "_source": data})
            if bulk_length >= BULK_LENGTH:
                try:
                    ES_LOGGER.info("Bulk Host: %s DB: %s Data: %s" % (dbs["db_host"], db_name, data_lines_number))
                    streaming_bulks = helpers.streaming_bulk(es, bulks, chunk_size=len(bulks))
                    for streaming_bulk in streaming_bulks:
                        if streaming_bulk[0]:
                            pass
                    bulks = []
                    bulk_length = 0
                except Exception as e:
                    ES_LOGGER.warning("Bulk Error! %s", e)
Example #41
def upload(informat, name, order, data, elastic, index, typ, sql=False,
           verbose=True, with_id=False):
    """ Uploads the data to elastic and the database
        sql      if True,  the data will be stored in the SQL data base as well
                 as ElasticSearch
                 if False, the data will only be stored in ElasticSearch
        informat can either be xml  - lmf
                               json - a single json object or a list of objects
                               bulk - a list of json objects annotated with
                                      index and type information, as accepted
                                      by ElasticSearch
    """
    try:
        # The actual parsing
        data = parse_upload(informat, name, order, data, index, typ,
                            with_id=with_id)
    except Exception:
        print('Error while reading data from %s' % name)
        raise

    ok = 0
    if sql:
        # stream entries one by one to elastic, then update sql db
        # streaming_bulk will notify us at once when an entry fails
        sql_bulk = []
        for res in helpers.streaming_bulk(elastic, data):
            # res is a tuple, res[0]==True
            ansname = 'index' if with_id else 'create'
            _id = res[1].get(ansname).get('_id')
            source = data[ok].get('_source')
            if isinstance(source, dict):
                source = json.dumps(source)
            sql_bulk.append((_id, source, 'admin',
                             'entry automatically added or reloaded', name,
                             'imported'))
            ok += 1
        db_loaded, db_error = db.update_bulk(name, sql_bulk)
        if db_error:
            raise Exception(db_error)
        ok += db_loaded
    else:
        # upload all at once to elastic
        ok, err = helpers.bulk(elastic, data)
        if err:
            msg = "Error during upload. %s documents successfully uploaded. \
                   Message: %s.\n"
            raise Exception(msg % (ok, '\n'.join(err)))
    if not ok:
        print("Warning. 0 documents uploaded\n", file=sys.stderr)
        raise Exception("No data")
    if verbose:
        print("Ok. %s documents uploaded\n" % ok)
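For illustration, the "bulk" informat described in the docstring above is a list of actions that already carry their index/type metadata; a small hypothetical payload (index, type and field names are made up) might look like:

bulk_data = [
    {"_index": "lexicon", "_type": "entry", "_id": 1,
     "_source": {"lemma": "book", "pos": "noun"}},
    {"_index": "lexicon", "_type": "entry", "_id": 2,
     "_source": {"lemma": "read", "pos": "verb"}},
]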
Example #42
    def _bulk(
        self,
        index: str,
        docs: Generator,
        chunk_size: int,
        max_chunk_bytes: int,
        queue_size: int,
        thread_count: int,
        refresh: bool,
        max_retries: int,
        initial_backoff: int,
        max_backoff: int,
        raise_on_exception: bool,
        raise_on_error: bool,
    ):
        """Bulk index, update, delete docs to Elasticsearch."""

        # when using multiple threads for poll_db we need to account for other
        # threads performing deletions
        ignore_status: Tuple[int] = (400, 404)

        if ELASTICSEARCH_STREAMING_BULK:
            for _ in helpers.streaming_bulk(
                    self.__es,
                    docs,
                    index=index,
                    chunk_size=chunk_size,
                    max_chunk_bytes=max_chunk_bytes,
                    max_retries=max_retries,
                    max_backoff=max_backoff,
                    initial_backoff=initial_backoff,
                    refresh=refresh,
                    raise_on_exception=raise_on_exception,
                    raise_on_error=raise_on_error,
            ):
                self.doc_count += 1
        else:
            # parallel bulk consumes more memory and is also more likely
            # to result in 429 errors.
            for _ in helpers.parallel_bulk(
                    self.__es,
                    docs,
                    thread_count=thread_count,
                    chunk_size=chunk_size,
                    max_chunk_bytes=max_chunk_bytes,
                    queue_size=queue_size,
                    refresh=refresh,
                    raise_on_exception=raise_on_exception,
                    raise_on_error=raise_on_error,
                    ignore_status=ignore_status,
            ):
                self.doc_count += 1
Example #43
    def handle_command(self, doc, namespace, timestamp):
        # Flush buffer before handle command
        self.commit()
        db = namespace.split(".", 1)[0]
        if doc.get("dropDatabase"):
            raise errors.OperationFailed(
                "elastic_doc_manager does not support drop database.")
            # dbs = self.command_helper.map_db(db)
            # for _db in dbs:
            #     self.elastic.indices.delete(index=_db.lower())

        if doc.get("renameCollection"):
            raise errors.OperationFailed(
                "elastic_doc_manager does not support renaming a mapping.")

        if doc.get("create"):
            db, coll = self.command_helper.map_collection(db, doc["create"])
            if db and coll:
                # Elasticsearch 7 remove type concept, need map to MongoDB table name to ES index
                index = '{db}_{tb}'.format(db=db.lower(), tb=coll)
                # self.elastic.indices.put_mapping(
                #     index=index, doc_type='_doc', body={"_source": {"enabled": True}}
                # )
                # self.elastic.indices.put_mapping(
                #     index=index, body={"_source" : {"enabled" : True}}
                # )

                # by pass table creation in mongodb, ES' index creation is in lazy module.
                warnings.warn(
                    "by pass table creation in mongodb, in ES7' index creation is as lazy module. %s on index %s."
                    % (coll, db))

        if doc.get("drop"):
            db, coll = self.command_helper.map_collection(db, doc["drop"])
            if db and coll:
                # This will delete the items in coll, but not get rid of the
                # mapping.
                warnings.warn("Deleting all documents of type %s on index %s."
                              "The mapping definition will persist and must be"
                              "removed manually." % (coll, db))
                # Elasticsearch 7 remove type concept, need map to MongoDB table name to ES index
                index = '{db}_{tb}'.format(db=db.lower(), tb=coll)
                responses = streaming_bulk(
                    self.elastic,
                    (dict(result, _op_type="delete") for result in scan(
                        self.elastic, index=index, doc_type='_doc')),
                )
                for ok, resp in responses:
                    if not ok:
                        LOG.error(
                            "Error occurred while deleting ElasticSearch docum"
                            "ent during handling of 'drop' command: %r" % resp)
Example #44
    def bulk_index(self,
                   records_uuids,
                   request_timeout=None,
                   max_chunk_bytes=None):
        """Starts bulk indexing for specified records

        Args:
            records_uuids(list[str): List of strings which are UUID's of records
                to reindex
            request_timeout(int): Maximum time after which es will throw an exception

        Returns:
            dict: dict with success count and failure list
                (with uuids of failed records)

        """
        if not request_timeout:
            request_timeout = current_app.config[
                "INDEXER_BULK_REQUEST_TIMEOUT"]
        max_chunk_bytes = max_chunk_bytes or 100 * 1024 * 1024  # default ES setting
        result = streaming_bulk(
            es,
            self.bulk_iterator(records_uuids),
            request_timeout=request_timeout,
            raise_on_error=False,
            raise_on_exception=False,
            expand_action_callback=(_es7_expand_action),
            max_retries=5,  # Retries on error 429
            initial_backoff=10,  # wait for initial_backoff * 2^retry_number,
            max_chunk_bytes=max_chunk_bytes,
        )

        failures = []
        for action_success, action_data in result:
            if not action_success:
                failures.append({
                    "status_code":
                    action_data["index"]["status"],
                    "error_type":
                    str(get_value(action_data, "index.error.type", "")),
                    "failure_reason":
                    str(get_value(action_data, "index.error.reason", "")),
                })

        number_of_failures = len(failures)

        return {
            "uuids": records_uuids,
            "success_count": len(records_uuids) - number_of_failures,
            "failures_count": number_of_failures,
            "failures": failures,
        }
Example #45
    def bulk_upsert(self, docs, namespace, timestamp):
        """Insert multiple documents into Elasticsearch."""
        def docs_to_upsert():
            if "Groups" in namespace:
                LOG.error("DEBUGG:: es bulk upsert groups _ids: %s" %
                          [x.get("_id") for x in docs])
            doc = None
            for doc in docs:
                # Remove metadata and redundant _id
                index, doc_type = self._index_and_mapping(namespace)
                doc_id = str(doc.pop("_id"))
                document_action = {
                    "_index": index,
                    "_type": doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(doc),
                }
                document_meta = {
                    "_index": self.meta_index_name,
                    "_type": self.meta_type,
                    "_id": doc_id,
                    "_source": {
                        "ns": namespace,
                        "_ts": timestamp
                    },
                }
                yield document_action
                yield document_meta
            if doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")

        try:
            kw = {}
            if self.chunk_size > 0:
                kw["chunk_size"] = self.chunk_size

            responses = streaming_bulk(client=self.elastic,
                                       actions=docs_to_upsert(),
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    LOG.error("Could not bulk-upsert document "
                              "into ElasticSearch: %r" % resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass
Example #46
def merge_events(index_alias, events: List[Dict]):
    connection = es.connection
    try:
        # Index
        payload_length = len(events)
        index_start_time = time.time()
        actions = build_actions(events)
        updated, errors = [], []
        success, failed = 0, 0
        for ok, item in streaming_bulk(connection,
                                       actions,
                                       index=index_alias,
                                       _source=True):
            if not ok:
                errors.append(item)
                failed += 1
            else:
                updated.append(item["update"]["get"]["_source"])
                success += 1
        index_spent = time.time() - index_start_time
        logger.debug(
            f"--- Indexed {payload_length} in {index_spent} seconds, "
            f"Index latency: {(index_spent / payload_length) * 1000}ms ---")
        # Finalize
        if not failed:
            fanout(updated)
            return {"success": success}, 201
        else:
            return {
                "success": success,
                "failed": failed,
                "errors": errors
            }, 400
    except es_exceptions.ConnectionError:
        return responses.search_backend_unavailable
    except es_exceptions.RequestError as e:
        logger.error(e.info)
        return f"Request error", 409
    except bulk_errors.BulkIndexError as e:
        ignorable_errors = ["max_bytes_length_exceeded_exception"]
        for error in e.errors:
            try:
                err = error["update"]["error"]["caused_by"]["type"]
                if err in ignorable_errors:
                    logger.warning(
                        f"Payload caused an error {err} and leek did not index it!"
                    )
                    return "Processed", 201
            except KeyError:
                pass
        logger.error(e.errors)
        return f"Bulk update error", 409
Example #47
    def test_transport_error_can_be_caught(self):
        failing_client = FailingBulkClient(self.client)
        docs = [
            {
                "_index": "i",
                "_type": "_doc",
                "_id": 47,
                "f": "v"
            },
            {
                "_index": "i",
                "_type": "_doc",
                "_id": 45,
                "f": "v"
            },
            {
                "_index": "i",
                "_type": "_doc",
                "_id": 42,
                "f": "v"
            },
        ]

        results = list(
            helpers.streaming_bulk(
                failing_client,
                docs,
                raise_on_exception=False,
                raise_on_error=False,
                chunk_size=1,
            ))
        self.assertEquals(3, len(results))
        self.assertEquals([True, False, True], [r[0] for r in results])

        exc = results[1][1]["index"].pop("exception")
        self.assertIsInstance(exc, TransportError)
        self.assertEquals(599, exc.status_code)
        self.assertEquals(
            {
                "index": {
                    "_index": "i",
                    "_type": "_doc",
                    "_id": 45,
                    "data": {
                        "f": "v"
                    },
                    "error": "TransportError(599, 'Error!')",
                    "status": 599,
                }
            },
            results[1][1],
        )
Example #48
def index_documents(path: str, name: str):
    """Use the streaming bulk API to index some documents"""
    # TODO: inject hostname
    es = Elasticsearch(hosts=[{'host': 'localhost', 'port': 49200}])
    for ok, result in streaming_bulk(es, file_iterable(path, name)):
        action, result = result.popitem()
        doc_id = '/%s/doc/%s' % (name, result['_id'])
        # process the information from ES whether the document has been
        # successfully indexed
        if not ok:
            print('Failed to %s document %s: %r' % (action, doc_id, result))
        else:
            print(doc_id)
Example #49
def bulk_load(movies):
    all_ok = True
    es_movies = (q.as_elasticsearch_dict() for q in movies)
    for ok, result in streaming_bulk(get_client(),
                                     es_movies,
                                     index=settings.ES_INDEX,
                                     raise_on_error=False):
        # the for loop logs any error that occurs while loading a movie
        if not ok:
            all_ok = False
            action, result = result.popitem()
            logger.error(FAILED_TO_LOAD_ERROR.format(result['_id'], result))
    return all_ok
Example #50
 def bulk_operation(cls, index=None, client=None, **options):
     for ok, result in streaming_bulk(
             client or cls.client,
             cls._bulk_stream(**options),
             index=index or cls.document._default_index(),
             raise_on_error=False,
             yield_ok=False,
             chunk_size=cls.data_bulk_limit
     ):
         if not ok:
             action, result = result.popitem()
             doc_id = '/%s/_doc/%s' % (index, result['_id'])
             logger.warning('Failed to {} document {}: {}'.format(action, doc_id, result))
Example #51
    def index(self) -> bool:
        if not self.get_available_fields().count():
            self.task.info(self.task, "There are no series to index in this catalog")
            return False

        index_ok = False
        for success, info in streaming_bulk(self.elastic, self.generate_actions()):
            if not success:
                self.task.info(self.task, 'Indexing error: {}'.format(info))
            else:
                index_ok = True

        return index_ok
Example #52
def streaming_post_to_es(client, chunk, index_name, job_id=None, doc_type="transaction_mapping"):
    success, failed = 0, 0
    try:
        for ok, item in helpers.streaming_bulk(client, chunk, index=index_name, doc_type=doc_type):
            # `ok` is a bool: pick the incremented counter when indexing succeeded
            success = [success, success + 1][ok]
            failed = [failed + 1, failed][ok]

    except Exception as e:
        print("MASSIVE FAIL!!!\n\n{}\n\n{}".format(str(e)[:5000], "*" * 80))
        raise SystemExit(1)

    printf({"msg": "Success: {}, Fails: {}".format(success, failed), "job": job_id, "f": "ES Ingest"})
    return success, failed
Example #53
def __add_meta_to_original_index(indices: List[str], index_fields: List[str], show_progress: ShowProgress, query: dict, scroll_size: int, elastic_wrapper: ElasticCore):
    index_elastic_search = ElasticSearcher(
        indices=indices,
        field_data=index_fields,
        callback_progress=show_progress,
        query=query,
        output=ElasticSearcher.OUT_RAW,
        scroll_size=scroll_size
    )
    index_actions = add_doc_uuid(generator=index_elastic_search)
    for success, info in streaming_bulk(client=elastic_wrapper.es, actions=index_actions, refresh="wait_for", chunk_size=scroll_size, max_retries=3):
        if not success:
            logging.getLogger(ERROR_LOGGER).exception(json.dumps(info))
Example #54
def es_index(es, index, gffdb, reader, doctype):
    checkindex(es, index)
    for ok, result in streaming_bulk(es,
                                     reader(gffdb),
                                     index=index,
                                     doc_type=doctype,
                                     chunk_size=chunksize):
        if not ok:
            action, result = result.popitem()
            doc_id = '/%s/commits/%s' % (args.index, result['_id'])
            print('Failed to %s document %s: %r' % (action, doc_id, result))
    es.indices.refresh(index=index)
    return
Example #55
 def index_fixture_data(self, source_filepath, doc_cls):
     added_ids = []
     for ok, result in streaming_bulk(self.connection,
                                      self.prepare_data(
                                          source_filepath, doc_cls),
                                      refresh=True):
         action, result = result.popitem()
         if not ok:
             raise Exception("Failed to {} document {}: {}".format(
                 action, result["_id"], result))
         else:
             added_ids.append(result["_id"])
     return added_ids
Example #56
def load(client, path='variants.tsv', index='variants'):
    create_variants_index(client, index)
    for ok, result in streaming_bulk(client,
                                     gen_variants(path),
                                     index=index,
                                     doc_type='variant',
                                     chunk_size=100):

        action, result = result.popitem()
        doc_id = '/%s/doc/%s' % (index, result['_id'])
        if not ok:
            raise Exception('Failed to %s document %s: %r' %
                            (action, doc_id, result))
Example #57
def create_docs(client, items):
    success, failed = 0, 0
    for ok, result in streaming_bulk(client,
                                     items,
                                     index="judgment",
                                     doc_type="doc",
                                     max_retries=5,
                                     chunk_size=250):
        if not ok:
            failed += 1
        else:
            success += 1
    print(f"Created {success} indexes, with failed {failed}.")
Example #58
    def index(self):
        """send csv to ES index"""
        self.logger.info('Setting up Elasticsearch index...')
        elastic = Elasticsearch(host=self.host, port=self.port, timeout=10000)
        try:
            self.logger.info('Creating index %s...' % self.index_name)
            elastic.indices.create(self.index_name, self.mapping)
        except RequestError:
            self.logger.info('Index already exists, skipping...')

        self.logger.info('Indexing %s...' % self.file)
        act = (self.format(choices, cid=cid) for cid, choices in self.csv_generator())
        list(streaming_bulk(elastic, actions=act))
Example #59
 def save_es_actions(self, datasets_updates):
     dataset_model = apps.get_model('datasets.Dataset')
     for dataset_id, data in datasets_updates.items():
         dataset_model.objects.filter(pk=dataset_id).update(**data)
         try:
             self.views_es_actions['datasets'].append({
                 '_op_type':
                 'update',
                 '_index':
                 settings.ELASTICSEARCH_INDEX_NAMES['datasets'],
                 '_type':
                 'doc',
                 '_id':
                 dataset_id,
                 'doc':
                 data
             })
         except KeyError:
             self.views_es_actions['datasets'] = [{
                 '_op_type':
                 'update',
                 '_index':
                 settings.ELASTICSEARCH_INDEX_NAMES['datasets'],
                 '_type':
                 'doc',
                 '_id':
                 dataset_id,
                 'doc':
                 data
             }]
     es_actions = []
     for view_actions in self.views_es_actions.values():
         es_actions.extend(view_actions)
     # streaming_bulk is lazy, so consume the generator to actually send the
     # accumulated actions to Elasticsearch
     for _ in streaming_bulk(connections.get_connection(),
                             es_actions,
                             raise_on_error=False,
                             raise_on_exception=False,
                             max_retries=2):
         pass
Example #60
    def bulk_upsert(self, docs):
        """Insert multiple documents into Elasticsearch."""
        def docs_to_upsert():
            doc = None
            for doc in docs:
                # Remove metadata and redundant _id
                index = doc.pop("ns")
                doc_id = u(doc.pop("_id"))
                timestamp = doc.pop("_ts")
                document_action = {
                    "_index": index,
                    "_type": self.doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(doc)
                }
                document_meta = {
                    "_index": self.meta_index_name,
                    "_type": self.meta_type,
                    "_id": doc_id,
                    "_source": {
                        "ns": index,
                        "_ts": timestamp
                    }
                }
                yield document_action
                yield document_meta
            if not doc:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")

        try:
            kw = {}
            if self.chunk_size > 0:
                kw['chunk_size'] = self.chunk_size

            responses = streaming_bulk(client=self.elastic,
                                       actions=docs_to_upsert(),
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    logging.error("Could not bulk-upsert document "
                                  "into ElasticSearch: %r" % resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass