Exemplo n.º 1
0
 def bulk_builder(self, changes):
     """
     Generate Elasticsearch bulk-API payload entries for an iterable of changes.

     http://www.elasticsearch.org/guide/reference/api/bulk.html
     bulk loader follows the following:
     { "index" : { "_index" : "test", "_type" : "type1", "_id" : "1" } }\n
     { "field1" : "value1" }\n

     Yields alternating action-metadata dicts and transformed document dicts.
     A change that raises is logged and skipped so one bad change does not
     abort the whole batch.
     """
     for change in changes:
         try:
             with lock_manager(self.change_trigger(change)) as t:
                 if t is not None:
                     tr = self.change_transform(t)
                     if tr is not None:
                         self.change_transport(tr)
                         # Action line first, then the document source line.
                         yield {
                             "index": {
                                 "_index": self.es_index,
                                 "_type": self.es_type,
                                 "_id": tr['_id']
                             }
                         }
                         yield tr
         except Exception as ex:  # Py3-compatible (was `except Exception, ex`)
             pillow_logging.error(
                 "Error on change: %s, %s" % (change['id'], ex)
             )
Exemplo n.º 2
0
    def process_bulk_docs(self, docs):
        """
        Transform *docs* into changes and index them into Elasticsearch in bulk.

        Oversized payloads are split into chunks of at most ~100MB before
        sending.  Returns True on success (or empty input); False if any
        payload could not be sent, which signals the reindexer to stop.
        """
        if not docs:
            return True

        pillow_logging.info("Processing batch of %s docs", len(docs))

        changes = [self._doc_to_change(doc) for doc in docs]
        error_collector = ErrorCollector()

        bulk_changes = build_bulk_payload(
            self.index_info, changes, self.doc_transform, error_collector)

        # Transform failures are collected rather than raised; report each one.
        for change, exception in error_collector.errors:
            pillow_logging.error("Error processing doc %s: %s", change.id, exception)

        max_payload_size = pow(10, 8)  # ~ 100Mb
        payloads = prepare_bulk_payloads(bulk_changes, max_payload_size)
        if len(payloads) > 1:
            pillow_logging.info("Payload split into %s parts", len(payloads))

        for payload in payloads:
            if not self._send_payload_with_retries(payload):
                # stop the reindexer if we're unable to send a payload to ES
                return False

        return True
Exemplo n.º 3
0
    def process_bulk_docs(self, docs, progress_logger):
        """
        Transform *docs* (optionally skipping deletions) and send them to
        Elasticsearch in one bulk call.

        ``progress_logger`` is accepted for interface compatibility but is
        not used here.  Returns True on success (or empty input); False if
        the bulk request failed with an unexpected error.
        """
        if not docs:
            return True

        pillow_logging.info("Processing batch of %s docs", len(docs))

        changes = [
            self._doc_to_change(doc) for doc in docs
            if self.process_deletes or not is_deletion(doc.get('doc_type'))
        ]
        error_collector = ErrorCollector()

        bulk_changes = build_bulk_payload(self.index_info, changes,
                                          self.doc_transform, error_collector)

        # Transform failures are collected rather than raised; report each one.
        for change, exception in error_collector.errors:
            pillow_logging.error("Error processing doc %s: %s (%s)", change.id,
                                 type(exception), exception)

        es_interface = ElasticsearchInterface(self.es)
        try:
            es_interface.bulk_ops(bulk_changes)
        except (ESBulkIndexError, ES2BulkIndexError, ES7BulkIndexError) as e:
            # Partial failures: log per-doc errors but still report success.
            pillow_logging.error("Bulk index errors\n%s", e.errors)
        except Exception:
            pillow_logging.exception("\tException sending payload to ES")
            return False

        return True
Exemplo n.º 4
0
def send_to_elasticsearch(path, es_getter, name, data=None, retries=MAX_RETRIES,
        except_on_failure=False, update=False, delete=False):
    """
    More fault tolerant es.put method

    Deletes, updates, or puts the document at *path*, retrying up to
    *retries* times on ConnectionError with exponential backoff
    (RETRY_INTERVAL ** attempt).  On final failure either raises
    PillowtopIndexingError (when except_on_failure) or logs the error
    and leaves ``res`` as an empty dict.
    """
    data = data if data is not None else {}
    current_tries = 0
    while current_tries < retries:
        try:
            if delete:
                res = es_getter().delete(path=path)
            elif update:
                # Let ES retry internally on concurrent-update conflicts.
                params = {'retry_on_conflict': 2}
                res = es_getter().post("%s/_update" % path, data={"doc": data}, params=params)
            else:
                res = es_getter().put(path, data=data)
            break
        except ConnectionError as ex:  # Py3-compatible (was `except ConnectionError, ex`)
            current_tries += 1
            pillow_logging.error("[%s] put_robust error %s attempt %d/%d" % (
                name, ex, current_tries, retries))
            # Exponential backoff before the next attempt.
            time.sleep(math.pow(RETRY_INTERVAL, current_tries))

            if current_tries == retries:
                message = "[%s] Max retry error on %s" % (name, path)
                if except_on_failure:
                    raise PillowtopIndexingError(message)
                else:
                    pillow_logging.error(message)
                res = {}
Exemplo n.º 5
0
    def change_transport(self, doc_dict):
        """
        Save the document to ElasticSearch

        In non-bulk mode, sends the doc (as an update when it already exists
        and updates are allowed).  Any exception is logged with a traceback
        and swallowed, returning None so the pillow keeps processing.
        """
        try:
            if not self.bulk:
                doc_exists_val = doc_exists(self, doc_dict)

                # Updates may overwrite existing docs; otherwise only create new ones.
                if self.allow_updates:
                    can_put = True
                else:
                    can_put = not doc_exists_val

                if can_put and not self.bulk:
                    self.send_robust(doc_dict, update=doc_exists_val)
        except Exception as ex:  # Py3-compatible (was `except Exception, ex`)
            tb = traceback.format_exc()
            pillow_logging.error(
                "PillowTop [%(pillow_name)s]: Aliased Elastic Pillow transport change data doc_id: %(doc_id)s to elasticsearch error: %(error)s\ntraceback: %(tb)s\n" %
                {
                    "pillow_name": self.get_name(),
                    "doc_id": doc_dict['_id'],
                    "error": ex,
                    "tb": tb
                }
            )
            return None
Exemplo n.º 6
0
    def change_transport(self, doc_dict):
        """
        Override the elastic transport to go to the index + the type being a string between the
        domain and case type

        In non-bulk mode, sends the doc to its typed path (as an update when
        it already exists and updates are allowed) and returns the send
        result.  Any exception is logged with a traceback and swallowed,
        returning None.
        """
        try:
            if not self.bulk:
                doc_path = self.get_doc_path_typed(doc_dict)

                doc_exists = self.doc_exists(doc_dict)

                # Updates may overwrite existing docs; otherwise only create new ones.
                if self.allow_updates:
                    can_put = True
                else:
                    can_put = not doc_exists

                if can_put and not self.bulk:
                    res = self.send_robust(doc_path, data=doc_dict, update=doc_exists)
                    return res
        except Exception as ex:  # Py3-compatible (was `except Exception, ex`)
            tb = traceback.format_exc()
            pillow_logging.error(
                "PillowTop [%(pillow_name)s]: Aliased Elastic Pillow transport change data doc_id: %(doc_id)s to elasticsearch error: %(error)s\ntraceback: %(tb)s\n" %
                {
                    "pillow_name": self.get_name(),
                    "doc_id": doc_dict['_id'],
                    "error": ex,
                    "tb": tb
                }
            )
            return None
Exemplo n.º 7
0
    def process_bulk_docs(self, docs):
        """
        Transform *docs* into changes and index them into Elasticsearch in bulk.

        Oversized payloads are split into chunks of at most MAX_PAYLOAD_SIZE
        before sending.  Returns True on success (or empty input); False if
        any payload could not be sent, which signals the reindexer to stop.
        """
        if not docs:
            return True

        pillow_logging.info("Processing batch of %s docs", len(docs))

        changes = [self._doc_to_change(doc) for doc in docs]
        error_collector = ErrorCollector()

        bulk_changes = build_bulk_payload(
            self.index_info, changes, self.doc_transform, error_collector)

        # Transform failures are collected rather than raised; report each one.
        for change, exception in error_collector.errors:
            pillow_logging.error("Error processing doc %s: %s", change.id, exception)

        payloads = prepare_bulk_payloads(bulk_changes, MAX_PAYLOAD_SIZE)
        if len(payloads) > 1:
            pillow_logging.info("Payload split into %s parts", len(payloads))

        for payload in payloads:
            if not self._send_payload_with_retries(payload):
                # stop the reindexer if we're unable to send a payload to ES
                return False

        return True
Exemplo n.º 8
0
    def process_bulk_docs(self, docs, progress_logger):
        """
        Transform *docs* into changes and send them to Elasticsearch in one
        bulk call.

        Deletion changes are dropped unless ``self.process_deletes`` is set.
        Returns True on success (or empty input); False when the bulk
        request fails with an unexpected error.
        """
        if not docs:
            return True

        pillow_logging.info("Processing batch of %s docs", len(docs))
        # de-dupe the is_deletion check
        changes = [
            change
            for change in (self._doc_to_change(doc) for doc in docs)
            if self.process_deletes or not change.deleted
        ]
        error_collector = ErrorCollector()

        bulk_changes = build_bulk_payload(changes, self.doc_transform,
                                          error_collector)

        for change, exception in error_collector.errors:
            pillow_logging.error("Error processing doc %s: %s (%s)", change.id,
                                 type(exception), exception)

        try:
            ElasticsearchInterface(self.es).bulk_ops(
                self.index_info.alias, self.index_info.type, bulk_changes)
        except BulkIndexError as e:
            pillow_logging.error("Bulk index errors\n%s", e.errors)
        except Exception as exc:
            pillow_logging.exception(
                "Error sending bulk payload to Elasticsearch: %s", exc)
            return False

        return True
Exemplo n.º 9
0
 def change_trigger(self, changes_dict):
     """
     Handle a change-feed entry before transform.

     If the change is a deletion, best-effort delete the doc from
     Elasticsearch and return None so downstream processing is skipped.
     Non-deletion changes fall through (implicitly returning None here;
     presumably a base-class override point — confirm against caller).
     """
     doc_id = changes_dict['id']  # renamed from `id` to avoid shadowing the builtin
     if changes_dict.get('deleted', False):
         try:
             if doc_exists(self, doc_id):
                 self.get_es_new().delete(self.es_index, self.es_type, doc_id)
         except Exception as ex:  # deletion is best-effort; never abort the feed
             pillow_logging.error(
                 "ElasticPillow: error deleting route %s - ignoring: %s" % (
                     self.get_doc_path(changes_dict['id']),
                     ex,
                 )
             )
         return None
Exemplo n.º 10
0
def send_to_elasticsearch(index,
                          doc_type,
                          doc_id,
                          es_getter,
                          name,
                          data=None,
                          retries=MAX_RETRIES,
                          except_on_failure=False,
                          update=False,
                          delete=False):
    """
    More fault tolerant es.put method

    Deletes, updates (via index), or creates the doc at index/doc_type/doc_id,
    retrying up to *retries* times on ConnectionError with exponential backoff
    (RETRY_INTERVAL ** attempt).  RequestError is not retried: it is logged,
    or raised as PillowtopIndexingError when except_on_failure is set.
    ConflictError and NotFoundError are silently ignored.
    """
    data = data if data is not None else {}
    current_tries = 0
    while current_tries < retries:
        try:
            if delete:
                es_getter().delete(index, doc_type, doc_id)
            elif update:
                # Let ES retry internally on concurrent-update conflicts.
                params = {'retry_on_conflict': 2}
                es_getter().index(index,
                                  doc_type,
                                  body=data,
                                  id=doc_id,
                                  params=params)
            else:
                es_getter().create(index, doc_type, body=data, id=doc_id)
            break
        except ConnectionError as ex:
            current_tries += 1
            pillow_logging.error("[%s] put_robust error %s attempt %d/%d" %
                                 (name, ex, current_tries, retries))

            if current_tries == retries:
                message = "[%s] Max retry error on %s/%s/%s" % (
                    name, index, doc_type, doc_id)
                if except_on_failure:
                    raise PillowtopIndexingError(message)
                else:
                    pillow_logging.error(message)

            # Exponential backoff before the next attempt.
            time.sleep(math.pow(RETRY_INTERVAL, current_tries))
        except RequestError as ex:
            # Malformed request: retrying won't help, so log/raise and stop.
            error_message = "Pillowtop put_robust error [%s]:\n%s\n\tpath: %s/%s/%s\n\t%s" % (
                name, ex.error
                or "No error message", index, doc_type, doc_id, data.keys())

            if except_on_failure:
                raise PillowtopIndexingError(error_message)
            else:
                pillow_logging.error(error_message)
            break
        except ConflictError:
            break  # ignore the error if a doc already exists when trying to create it in the index
        except NotFoundError:
            break
Exemplo n.º 11
0
    def process_bulk_docs(self, docs):
        """
        Transform *docs* into changes and send them to Elasticsearch in a
        single bulk call.

        Returns True on success (or empty input); False if the bulk request
        raised, which signals the caller to stop.
        """
        if not docs:
            return True

        pillow_logging.info("Processing batch of %s docs", len(docs))

        changes = [self._doc_to_change(doc) for doc in docs]
        error_collector = ErrorCollector()

        bulk_changes = build_bulk_payload(self.index_info, changes,
                                          self.doc_transform, error_collector)

        # Transform failures are collected rather than raised; report each one.
        for change, exception in error_collector.errors:
            pillow_logging.error("Error processing doc %s: %s (%s)", change.id,
                                 type(exception), exception)

        es_interface = ElasticsearchInterface(self.es)
        try:
            es_interface.bulk_ops(bulk_changes)
        except Exception:
            pillow_logging.exception("\tException sending payload to ES")
            return False

        return True
Exemplo n.º 12
0
def send_to_elasticsearch(index, doc_type, doc_id, es_getter, name, data=None,
                          retries=MAX_RETRIES, propagate_failure=settings.UNIT_TESTING,
                          update=False, delete=False, es_merge_update=False):
    """
    More fault tolerant es.put method
    kwargs:
        es_merge_update: Set this to True to use Elasticsearch.update instead of Elasticsearch.index
            which merges existing ES doc and current update. If this is set to False, the doc will be replaced

    Deletes, updates, or creates index/doc_type/doc_id via
    ElasticsearchInterface, retrying up to *retries* times on
    ConnectionError with exponential backoff (RETRY_INTERVAL ** attempt).
    RequestError is not retried: it is logged, or raised as
    PillowtopIndexingError when propagate_failure is set.
    ConflictError and NotFoundError are silently ignored.
    """
    data = data if data is not None else {}
    current_tries = 0
    es_interface = ElasticsearchInterface(es_getter())
    # Keep tests fast: only a single attempt under unit testing.
    retries = 1 if settings.UNIT_TESTING else retries
    while current_tries < retries:
        try:
            if delete:
                es_interface.delete_doc(index, doc_type, doc_id)
            elif update:
                # Let ES retry internally on concurrent-update conflicts.
                params = {'retry_on_conflict': 2}
                if es_merge_update:
                    es_interface.update_doc_fields(index, doc_type, doc_id, fields=data, params=params)
                else:
                    es_interface.update_doc(index, doc_type, doc_id, doc=data, params=params)
            else:
                es_interface.create_doc(index, doc_type, doc_id, doc=data)
            break
        except ConnectionError as ex:
            current_tries += 1
            pillow_logging.error("[{}] put_robust error {} attempt {}/{}".format(
                name, ex, current_tries, retries))

            if current_tries == retries:
                message = "[{}] Max retry error on {}/{}/{}:\n\n{}".format(
                    name, index, doc_type, doc_id, traceback.format_exc())
                if propagate_failure:
                    raise PillowtopIndexingError(message)
                else:
                    pillow_logging.error(message)

            # Exponential backoff before the next attempt.
            time.sleep(math.pow(RETRY_INTERVAL, current_tries))
        except RequestError:
            # Malformed request: retrying won't help, so log/raise and stop.
            error_message = (
                "Pillowtop put_robust error [{}]:\n\n{}\n\tpath: {}/{}/{}\n\t{}".format(
                    name, traceback.format_exc(), index, doc_type, doc_id, list(data))
            )

            if propagate_failure:
                raise PillowtopIndexingError(error_message)
            else:
                pillow_logging.error(error_message)
            break
        except ConflictError:
            break  # ignore the error if a doc already exists when trying to create it in the index
        except NotFoundError:
            break
Exemplo n.º 13
0
def send_to_elasticsearch(index, doc_type, doc_id, es_getter, name, data=None, retries=MAX_RETRIES,
                          except_on_failure=False, update=False, delete=False):
    """
    More fault tolerant es.put method

    Deletes, updates, or creates index/doc_type/doc_id, retrying up to
    *retries* times on ConnectionError with exponential backoff
    (RETRY_INTERVAL ** attempt).  RequestError is not retried: it is
    logged, or raised as PillowtopIndexingError when except_on_failure
    is set.
    """
    data = data if data is not None else {}
    current_tries = 0
    while current_tries < retries:
        try:
            if delete:
                es_getter().delete(index, doc_type, doc_id)
            elif update:
                # Let ES retry internally on concurrent-update conflicts.
                params = {'retry_on_conflict': 2}
                es_getter().update(index, doc_type, doc_id, body={"doc": data}, params=params)
            else:
                es_getter().create(index, doc_type, body=data, id=doc_id)
            break
        except ConnectionError as ex:  # Py3-compatible (was `except ConnectionError, ex`)
            current_tries += 1
            pillow_logging.error("[%s] put_robust error %s attempt %d/%d" % (
                name, ex, current_tries, retries))

            if current_tries == retries:
                message = "[%s] Max retry error on %s/%s/%s" % (name, index, doc_type, doc_id)
                if except_on_failure:
                    raise PillowtopIndexingError(message)
                else:
                    pillow_logging.error(message)

            # Exponential backoff before the next attempt.
            time.sleep(math.pow(RETRY_INTERVAL, current_tries))
        except RequestError as ex:
            # Malformed request: retrying won't help, so log/raise and stop.
            error_message = "Pillowtop put_robust error [%s]:\n%s\n\tpath: %s/%s/%s\n\t%s" % (
                name,
                ex.error or "No error message",
                index, doc_type, doc_id,
                data.keys())

            if except_on_failure:
                raise PillowtopIndexingError(error_message)
            else:
                pillow_logging.error(error_message)
            break
Exemplo n.º 14
0
                    raise PillowtopIndexingError(message)
                else:
                    pillow_logging.error(message)
                res = {}

    if res.get('status', 0) == 400:
        error_message = "Pillowtop put_robust error [%s]:\n%s\n\tpath: %s\n\t%s" % (
            name,
            res.get('error', "No error message"),
            path,
            data.keys())

        if except_on_failure:
            raise PillowtopIndexingError(error_message)
        else:
            pillow_logging.error(error_message)
    return res


class AliasedElasticPillow(BasicPillow):
    """
    This pillow class defines it as being alias-able. That is, when you query it, you use an
    Alias to access it.

    This could be for varying reasons -  to make an index by certain metadata into its own index
    for performance/separation reasons

    Or, for our initial use case, needing to version/update the index mappings on the fly with
    minimal disruption.