Example #1
class KafkaProcessor(PillowProcessor):
    """
    Processor that pushes changes to Kafka
    """

    def __init__(self, kafka, data_source_type, data_source_name):
        self._kafka = kafka
        self._producer = ChangeProducer(self._kafka)
        self._data_source_type = data_source_type
        self._data_source_name = data_source_name

    def process_change(self, pillow_instance, change):
        try:
            document = change.get_document()
            doc_meta = get_doc_meta_object_from_document(document)
            change_meta = change_meta_from_doc_meta_and_document(
                doc_meta=doc_meta,
                document=document,
                data_source_type=self._data_source_type,
                data_source_name=self._data_source_name,
                doc_id=change.id,
            )
        except MissingMetaInformationError:
            pass
        else:
            # change.deleted is used for hard deletions whereas change_meta.is_deletion is for soft deletions.
            # from the consumer's perspective both should be counted as deletions so just "or" them
            # note: it is strange and hard to reproduce that the couch changes feed is providing a "doc"
            # along with a hard deletion, but it is doing that in the wild so we might as well support it.
            change_meta.is_deletion = change_meta.is_deletion or change.deleted
            self._producer.send_change(get_topic(doc_meta), change_meta)
Example #2
class KafkaProcessor(PillowProcessor):
    """Generic processor for CouchDB changes to put those changes in a kafka topic

    Reads from:
      - CouchDB change feed

    Writes to:
      - Specified kafka topic
    """
    def __init__(self, data_source_type, data_source_name, default_topic):
        self._producer = ChangeProducer()
        self._data_source_type = data_source_type
        self._data_source_name = data_source_name
        self._default_topic = default_topic

    def process_change(self, change):
        populate_change_metadata(change, self._data_source_type,
                                 self._data_source_name)
        if change.metadata:
            change_meta = change.metadata
            # change.deleted is used for hard deletions whereas change_meta.is_deletion is for soft deletions.
            # from the consumer's perspective both should be counted as deletions so just "or" them
            # note: it is strange and hard to reproduce that the couch changes feed is providing a "doc"
            # along with a hard deletion, but it is doing that in the wild so we might as well support it.
            change_meta.is_deletion = change_meta.is_deletion or change.deleted
            if change_meta.is_deletion:
                # If a change has been hard deleted, set a default topic because we may
                # not be able to retrieve its correct doc type
                topic = get_topic_for_doc_type(change_meta.document_type,
                                               self._data_source_type,
                                               self._default_topic)
            else:
                topic = get_topic_for_doc_type(change_meta.document_type,
                                               self._data_source_type)
            self._producer.send_change(topic, change_meta)
Example #3
class KafkaProcessor(PillowProcessor):
    """
    Processor that pushes changes to Kafka
    """

    def __init__(self, kafka, data_source_type, data_source_name):
        self._kafka = kafka
        self._producer = ChangeProducer(self._kafka)
        self._data_source_type = data_source_type
        self._data_source_name = data_source_name

    def process_change(self, pillow_instance, change):
        try:
            document = change.get_document()
            doc_meta = get_doc_meta_object_from_document(document)
            change_meta = change_meta_from_doc_meta_and_document(
                doc_meta=doc_meta,
                document=document,
                data_source_type=self._data_source_type,
                data_source_name=self._data_source_name,
                doc_id=change.id,
            )
        except MissingMetaInformationError:
            pass
        else:
            # change.deleted is used for hard deletions whereas change_meta.is_deletion is for soft deletions.
            # from the consumer's perspective both should be counted as deletions so just "or" them
            # note: it is strange and hard to reproduce that the couch changes feed is providing a "doc"
            # along with a hard deletion, but it is doing that in the wild so we might as well support it.
            change_meta.is_deletion = change_meta.is_deletion or change.deleted
            self._producer.send_change(get_topic(doc_meta), change_meta)
Example #4
    def handle(self, pillow, **options):
        self.pool = Pool(10)
        self.pillow = pillow
        self.count = 0
        self.start = time.time()
        self.producer = ChangeProducer(auto_flush=False)

        for errors in self.get_next_errors():
            self.pool.spawn(self._process_errors, errors)
Example #5
class Command(BaseCommand):
    def add_arguments(self, parser):
        parser.add_argument('pillow_name')

    def handle(self, pillow_name, **options):
        self.pool = Pool(10)
        self.pillow_name = pillow_name

        try:
            pillow = get_pillow_by_name(pillow_name)
        except PillowNotFoundError:
            raise CommandError(f"Unknown pillow: {pillow_name}")

        if not isinstance(pillow.get_change_feed(), KafkaChangeFeed):
            raise CommandError(f"Only Kafka pillows are supported")

        self.count = 0
        self.start = time.time()
        self.producer = ChangeProducer(auto_flush=False)

        for errors in self.get_next_errors():
            self.pool.spawn(self._process_errors, errors)

    def get_next_errors(self):
        num_retrieved = 1

        while num_retrieved > 0:
            pillow_errors = (PillowError.objects.filter(
                pillow=self.pillow_name).order_by('date_next_attempt'))[:1000]

            num_retrieved = len(pillow_errors)
            yield pillow_errors

            while not self.pool.wait_available(timeout=10):
                time.sleep(1)

        while not self.pool.join(timeout=10):
            print('Waiting for tasks to complete')

    def _process_errors(self, errors):
        for error in errors:
            _process_kafka_change(self.producer, error)

        self.producer.flush()

        self._delete_errors(errors)
        self.count += 1000
        duration = time.time() - self.start
        print('Processed {} in {}s: {} per s'.format(
            self.count, duration,
            self.count / duration if duration else self.count))
        print(datetime.utcnow())

    def _delete_errors(self, errors):
        doc_ids = [error.doc_id for error in errors]
        PillowError.objects.filter(doc_id__in=doc_ids).delete()
Example #6
    def test_error_asynchronous(self):
        kafka_producer = ChangeProducer(auto_flush=False)
        future = Future()
        kafka_producer.producer.send = Mock(return_value=future)

        meta = ChangeMeta(
            document_id=uuid.uuid4().hex, data_source_type='dummy-type', data_source_name='dummy-name'
        )

        with capture_log_output(KAFKA_AUDIT_LOGGER) as logs:
            kafka_producer.send_change(topics.CASE, meta)
            future.failure(Exception())

        self._check_logs(logs, meta.document_id, [CHANGE_PRE_SEND, CHANGE_ERROR])
Example #7
class Command(BaseCommand):
    def add_arguments(self, parser):
        parser.add_argument('pillow')

    def handle(self, pillow, **options):
        self.pool = Pool(10)
        self.pillow = pillow
        self.count = 0
        self.start = time.time()
        self.producer = ChangeProducer(auto_flush=False)

        for errors in self.get_next_errors():
            self.pool.spawn(self._process_errors, errors)

    def get_next_errors(self):
        num_retrieved = 1

        while num_retrieved > 0:
            pillow_errors = (PillowError.objects.filter(
                pillow=self.pillow).order_by('date_next_attempt'))[:1000]

            num_retrieved = len(pillow_errors)
            yield pillow_errors

            while not self.pool.wait_available(timeout=10):
                time.sleep(1)

        while not self.pool.join(timeout=10):
            print('Waiting for tasks to complete')

    def _process_errors(self, errors):
        for error in errors:
            if error.change_object.metadata:
                self.producer.send_change(
                    error.change_object.metadata.data_source_name,
                    error.change_object.metadata)
        self.producer.flush()

        self._delete_errors(errors)
        self.count += 1000
        duration = time.time() - self.start
        print('Processed {} in {}s: {} per s'.format(
            self.count, duration,
            self.count / duration if duration else self.count))
        print(datetime.utcnow())

    def _delete_errors(self, errors):
        doc_ids = [error.doc_id for error in errors]
        PillowError.objects.filter(doc_id__in=doc_ids).delete()
Example #8
    def test_error_synchronous(self):
        kafka_producer = ChangeProducer()
        future = Future()
        future.get = Mock(side_effect=Exception())
        kafka_producer.producer.send = Mock(return_value=future)

        meta = ChangeMeta(
            document_id=uuid.uuid4().hex, data_source_type='dummy-type', data_source_name='dummy-name'
        )

        with capture_log_output(KAFKA_AUDIT_LOGGER) as logs:
            with self.assertRaises(Exception):
                kafka_producer.send_change(topics.CASE, meta)

        self._check_logs(logs, meta.document_id, [CHANGE_PRE_SEND, CHANGE_ERROR])
Example #9
    def handle(self, pillow_name, **options):
        self.pool = Pool(10)
        self.pillow_name = pillow_name

        try:
            pillow = get_pillow_by_name(pillow_name)
        except PillowNotFoundError:
            raise CommandError(f"Unknown pillow: {pillow_name}")

        if not isinstance(pillow.get_change_feed(), KafkaChangeFeed):
            raise CommandError(f"Only Kafka pillows are supported")

        self.count = 0
        self.start = time.time()
        self.producer = ChangeProducer(auto_flush=False)

        for errors in self.get_next_errors():
            self.pool.spawn(self._process_errors, errors)
Example #10
class KafkaProcessor(PillowProcessor):
    """
    Processor that pushes changes to Kafka
    """

    def __init__(self, data_source_type, data_source_name):
        self._producer = ChangeProducer()
        self._data_source_type = data_source_type
        self._data_source_name = data_source_name

    def process_change(self, change):
        populate_change_metadata(change, self._data_source_type, self._data_source_name)
        if change.metadata:
            change_meta = change.metadata
            topic = get_topic_for_doc_type(change_meta.document_type, self._data_source_type)
            # change.deleted is used for hard deletions whereas change_meta.is_deletion is for soft deletions.
            # from the consumer's perspective both should be counted as deletions so just "or" them
            # note: it is strange and hard to reproduce that the couch changes feed is providing a "doc"
            # along with a hard deletion, but it is doing that in the wild so we might as well support it.
            change_meta.is_deletion = change_meta.is_deletion or change.deleted
            self._producer.send_change(topic, change_meta)
Example #11
    def _test_success(self, auto_flush):
        kafka_producer = ChangeProducer(auto_flush=auto_flush)
        with capture_log_output(KAFKA_AUDIT_LOGGER) as logs:
            meta = ChangeMeta(document_id=uuid.uuid4().hex, data_source_type='dummy-type',
                              data_source_name='dummy-name')
            kafka_producer.send_change(topics.CASE, meta)
            if not auto_flush:
                kafka_producer.flush()
        self._check_logs(logs, meta.document_id, [CHANGE_PRE_SEND, CHANGE_SENT])
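Taken together, the test snippets above exercise the producer's basic contract: send_change(topic, change_meta) queues a message, and with auto_flush=False nothing is delivered until flush() is called. The following is a minimal illustrative sketch (not taken from any of the examples above) of batching several changes and flushing once; the document ids and data-source strings are placeholders.

producer = ChangeProducer(auto_flush=False)
for doc_id in ('doc-1', 'doc-2'):  # placeholder document ids
    meta = ChangeMeta(
        document_id=doc_id,
        data_source_type='dummy-type',
        data_source_name='dummy-name',
    )
    producer.send_change(topics.CASE, meta)
producer.flush()  # deliver the whole batch in one go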
Example #12
class KafkaProcessor(PillowProcessor):
    """
    Processor that pushes changes to Kafka
    """
    def __init__(self, kafka, data_source_type, data_source_name):
        self._kafka = kafka
        self._producer = ChangeProducer(self._kafka)
        self._data_source_type = data_source_type
        self._data_source_name = data_source_name

    def process_change(self, pillow_instance, change, do_set_checkpoint=False):
        document_type = _get_document_type(change.document)
        if document_type:
            assert change.document is not None
            change_meta = ChangeMeta(
                document_id=change.id,
                data_source_type=self._data_source_type,
                data_source_name=self._data_source_name,
                document_type=document_type,
                document_subtype=_get_document_subtype(change.document),
                domain=change.document.get('domain', None),
                is_deletion=change.deleted,
            )
            self._producer.send_change(get_topic(document_type), change_meta)
Example #13
    def __init__(self, data_source_type, data_source_name):
        self._producer = ChangeProducer()
        self._data_source_type = data_source_type
        self._data_source_name = data_source_name
Example #14
    def __init__(self, kafka, data_source_type, data_source_name):
        self._kafka = kafka
        self._producer = ChangeProducer(self._kafka)
        self._data_source_type = data_source_type
        self._data_source_name = data_source_name
Example #15
    def __init__(self, data_source_type, data_source_name, default_topic):
        self._producer = ChangeProducer()
        self._data_source_type = data_source_type
        self._data_source_name = data_source_name
        self._default_topic = default_topic
Example #16
from time import sleep

from django import db
from django.core.management import BaseCommand

import pytz
from psycopg2._psycopg import InterfaceError

from dimagi.utils.logging import notify_exception
from pillow_retry.api import process_pillow_retry
from pillow_retry.models import PillowError

from corehq.apps.change_feed.producer import ChangeProducer
from corehq.sql_db.util import handle_connection_failure

BATCH_SIZE = 10000

producer = ChangeProducer(auto_flush=False)


class PillowRetryEnqueuingOperation(BaseCommand):
    help = "Runs the Pillow Retry Queue"

    def handle(self, **options):
        while True:
            try:
                num_processed = self.process_queue()
            except Exception:
                num_processed = 0
                notify_exception(None,
                                 message="Could not fetch due survey actions")
            sleep_time = 10 if num_processed < BATCH_SIZE else 0
            sleep(sleep_time)
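The excerpt above ends before process_queue is defined. Purely as a hedged sketch of what one batch pass could look like, reusing the module-level producer and the imported process_pillow_retry helper (whose call signature is an assumption, not shown in the excerpt) and the PillowError query pattern from the earlier examples:

    def process_queue(self):
        # Sketch only: pull a batch of stored errors ordered by their next
        # attempt time, hand each one to the retry helper, then flush the
        # shared producer once for the whole batch.
        errors = PillowError.objects.order_by('date_next_attempt')[:BATCH_SIZE]
        for error in errors:
            process_pillow_retry(error, producer)  # assumed call signature
        producer.flush()
        return len(errors)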
Example #17
    def __init__(self, kafka, data_source_type, data_source_name):
        self._kafka = kafka
        self._producer = ChangeProducer(self._kafka)
        self._data_source_type = data_source_type
        self._data_source_name = data_source_name