class KafkaProcessor(PillowProcessor):
    """
    Processor that pushes changes to Kafka
    """

    def __init__(self, kafka, data_source_type, data_source_name):
        self._kafka = kafka
        self._producer = ChangeProducer(self._kafka)
        self._data_source_type = data_source_type
        self._data_source_name = data_source_name

    def process_change(self, pillow_instance, change):
        try:
            document = change.get_document()
            doc_meta = get_doc_meta_object_from_document(document)
            change_meta = change_meta_from_doc_meta_and_document(
                doc_meta=doc_meta,
                document=document,
                data_source_type=self._data_source_type,
                data_source_name=self._data_source_name,
                doc_id=change.id,
            )
        except MissingMetaInformationError:
            pass
        else:
            # change.deleted is used for hard deletions whereas change_meta.is_deletion
            # is for soft deletions. From the consumer's perspective both should be
            # counted as deletions, so just "or" them.
            # Note: it is strange and hard to reproduce that the couch changes feed is
            # providing a "doc" along with a hard deletion, but it is doing that in the
            # wild so we might as well support it.
            change_meta.is_deletion = change_meta.is_deletion or change.deleted
            self._producer.send_change(get_topic(doc_meta), change_meta)
class KafkaProcessor(PillowProcessor):
    """Generic processor that publishes CouchDB changes to a Kafka topic

    Reads from:
      - CouchDB change feed

    Writes to:
      - Specified Kafka topic
    """

    def __init__(self, data_source_type, data_source_name, default_topic):
        self._producer = ChangeProducer()
        self._data_source_type = data_source_type
        self._data_source_name = data_source_name
        self._default_topic = default_topic

    def process_change(self, change):
        populate_change_metadata(change, self._data_source_type, self._data_source_name)
        if change.metadata:
            change_meta = change.metadata
            # change.deleted is used for hard deletions whereas change_meta.is_deletion
            # is for soft deletions. From the consumer's perspective both should be
            # counted as deletions, so just "or" them.
            # Note: it is strange and hard to reproduce that the couch changes feed is
            # providing a "doc" along with a hard deletion, but it is doing that in the
            # wild so we might as well support it.
            change_meta.is_deletion = change_meta.is_deletion or change.deleted
            if change_meta.is_deletion:
                # If a change has been hard deleted, pass a default topic because we
                # may not be able to retrieve its correct doc type.
                topic = get_topic_for_doc_type(
                    change_meta.document_type, self._data_source_type, self._default_topic)
            else:
                topic = get_topic_for_doc_type(change_meta.document_type, self._data_source_type)
            self._producer.send_change(topic, change_meta)
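The hard-deletion branch above relies on get_topic_for_doc_type accepting a fallback topic. A minimal, self-contained sketch of that contract; the mapping, error type, and function body below are illustrative stand-ins, not corehq's actual implementation in corehq.apps.change_feed.topics:

# Illustrative sketch only: the doc-type-to-topic mapping is an assumption.
DOC_TYPE_TO_TOPIC = {
    'CommCareCase': 'case',
    'XFormInstance': 'form',
}


def get_topic_for_doc_type_sketch(document_type, data_source_type, default_topic=None):
    # data_source_type is kept for signature parity but unused in this sketch.
    topic = DOC_TYPE_TO_TOPIC.get(document_type)
    if topic is not None:
        return topic
    if default_topic is not None:
        # A hard deletion may arrive without a recoverable doc type, so the
        # processor supplies its configured default topic for this case.
        return default_topic
    raise ValueError(f'Unknown doc type: {document_type}')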
def handle(self, pillow, **options):
    self.pool = Pool(10)
    self.pillow = pillow
    self.count = 0
    self.start = time.time()
    self.producer = ChangeProducer(auto_flush=False)
    for errors in self.get_next_errors():
        self.pool.spawn(self._process_errors, errors)
class Command(BaseCommand):

    def add_arguments(self, parser):
        parser.add_argument('pillow_name')

    def handle(self, pillow_name, **options):
        self.pool = Pool(10)
        self.pillow_name = pillow_name
        try:
            pillow = get_pillow_by_name(pillow_name)
        except PillowNotFoundError:
            raise CommandError(f"Unknown pillow: {pillow_name}")
        if not isinstance(pillow.get_change_feed(), KafkaChangeFeed):
            raise CommandError("Only Kafka pillows are supported")

        self.count = 0
        self.start = time.time()
        self.producer = ChangeProducer(auto_flush=False)
        for errors in self.get_next_errors():
            self.pool.spawn(self._process_errors, errors)

    def get_next_errors(self):
        num_retrieved = 1
        while num_retrieved > 0:
            pillow_errors = (
                PillowError.objects
                .filter(pillow=self.pillow_name)
                .order_by('date_next_attempt')
            )[:1000]
            num_retrieved = len(pillow_errors)
            yield pillow_errors
            # Block until a worker slot frees up before fetching the next batch.
            while not self.pool.wait_available(timeout=10):
                time.sleep(1)
        while not self.pool.join(timeout=10):
            print('Waiting for tasks to complete')

    def _process_errors(self, errors):
        for error in errors:
            _process_kafka_change(self.producer, error)
        self.producer.flush()
        self._delete_errors(errors)
        self.count += len(errors)
        duration = time.time() - self.start
        print('Processed {} in {}s: {} per s'.format(
            self.count, duration, self.count / duration if duration else self.count))
        print(datetime.utcnow())

    def _delete_errors(self, errors):
        doc_ids = [error.doc_id for error in errors]
        PillowError.objects.filter(doc_id__in=doc_ids).delete()
def test_error_asynchronous(self):
    kafka_producer = ChangeProducer(auto_flush=False)
    future = Future()
    kafka_producer.producer.send = Mock(return_value=future)
    meta = ChangeMeta(
        document_id=uuid.uuid4().hex,
        data_source_type='dummy-type',
        data_source_name='dummy-name',
    )
    with capture_log_output(KAFKA_AUDIT_LOGGER) as logs:
        kafka_producer.send_change(topics.CASE, meta)
        future.failure(Exception())
    self._check_logs(logs, meta.document_id, [CHANGE_PRE_SEND, CHANGE_ERROR])
class Command(BaseCommand):

    def add_arguments(self, parser):
        parser.add_argument('pillow')

    def handle(self, pillow, **options):
        self.pool = Pool(10)
        self.pillow = pillow
        self.count = 0
        self.start = time.time()
        self.producer = ChangeProducer(auto_flush=False)
        for errors in self.get_next_errors():
            self.pool.spawn(self._process_errors, errors)

    def get_next_errors(self):
        num_retrieved = 1
        while num_retrieved > 0:
            pillow_errors = (
                PillowError.objects
                .filter(pillow=self.pillow)
                .order_by('date_next_attempt')
            )[:1000]
            num_retrieved = len(pillow_errors)
            yield pillow_errors
            # Block until a worker slot frees up before fetching the next batch.
            while not self.pool.wait_available(timeout=10):
                time.sleep(1)
        while not self.pool.join(timeout=10):
            print('Waiting for tasks to complete')

    def _process_errors(self, errors):
        for error in errors:
            if error.change_object.metadata:
                self.producer.send_change(
                    error.change_object.metadata.data_source_name,
                    error.change_object.metadata)
        self.producer.flush()
        self._delete_errors(errors)
        self.count += len(errors)
        duration = time.time() - self.start
        print('Processed {} in {}s: {} per s'.format(
            self.count, duration, self.count / duration if duration else self.count))
        print(datetime.utcnow())

    def _delete_errors(self, errors):
        doc_ids = [error.doc_id for error in errors]
        PillowError.objects.filter(doc_id__in=doc_ids).delete()
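The management commands above share a bounded-concurrency pattern: a gevent Pool of 10 workers, with the batch generator blocking until a slot frees before fetching the next 1000 errors from the database. A standalone sketch of that backpressure loop, with generic names not tied to PillowError:

import time

from gevent.pool import Pool


def process_in_batches(batches, worker, concurrency=10):
    # At most `concurrency` batches are in flight at once.
    pool = Pool(concurrency)
    for batch in batches:
        pool.spawn(worker, batch)
        # wait_available() returns a falsy value on timeout, so keep waiting
        # until the pool can accept another greenlet without blocking spawn().
        while not pool.wait_available(timeout=10):
            time.sleep(1)
    pool.join()  # drain in-flight workers before returning

The point of waiting before the next fetch is to avoid pulling rows out of the database faster than the workers can drain them.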
def test_error_synchronous(self):
    kafka_producer = ChangeProducer()
    future = Future()
    future.get = Mock(side_effect=Exception())
    kafka_producer.producer.send = Mock(return_value=future)
    meta = ChangeMeta(
        document_id=uuid.uuid4().hex,
        data_source_type='dummy-type',
        data_source_name='dummy-name',
    )
    with capture_log_output(KAFKA_AUDIT_LOGGER) as logs:
        with self.assertRaises(Exception):
            kafka_producer.send_change(topics.CASE, meta)
    self._check_logs(logs, meta.document_id, [CHANGE_PRE_SEND, CHANGE_ERROR])
def handle(self, pillow_name, **options):
    self.pool = Pool(10)
    self.pillow_name = pillow_name
    try:
        pillow = get_pillow_by_name(pillow_name)
    except PillowNotFoundError:
        raise CommandError(f"Unknown pillow: {pillow_name}")
    if not isinstance(pillow.get_change_feed(), KafkaChangeFeed):
        raise CommandError("Only Kafka pillows are supported")

    self.count = 0
    self.start = time.time()
    self.producer = ChangeProducer(auto_flush=False)
    for errors in self.get_next_errors():
        self.pool.spawn(self._process_errors, errors)
class KafkaProcessor(PillowProcessor):
    """
    Processor that pushes changes to Kafka
    """

    def __init__(self, data_source_type, data_source_name):
        self._producer = ChangeProducer()
        self._data_source_type = data_source_type
        self._data_source_name = data_source_name

    def process_change(self, change):
        populate_change_metadata(change, self._data_source_type, self._data_source_name)
        if change.metadata:
            change_meta = change.metadata
            topic = get_topic_for_doc_type(change_meta.document_type, self._data_source_type)
            # change.deleted is used for hard deletions whereas change_meta.is_deletion
            # is for soft deletions. From the consumer's perspective both should be
            # counted as deletions, so just "or" them.
            # Note: it is strange and hard to reproduce that the couch changes feed is
            # providing a "doc" along with a hard deletion, but it is doing that in the
            # wild so we might as well support it.
            change_meta.is_deletion = change_meta.is_deletion or change.deleted
            self._producer.send_change(topic, change_meta)
def _test_success(self, auto_flush):
    kafka_producer = ChangeProducer(auto_flush=auto_flush)
    with capture_log_output(KAFKA_AUDIT_LOGGER) as logs:
        meta = ChangeMeta(
            document_id=uuid.uuid4().hex,
            data_source_type='dummy-type',
            data_source_name='dummy-name',
        )
        kafka_producer.send_change(topics.CASE, meta)
        if not auto_flush:
            kafka_producer.flush()
    self._check_logs(logs, meta.document_id, [CHANGE_PRE_SEND, CHANGE_SENT])
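Each of the tests above ends with self._check_logs(...), whose definition is not shown. A sketch of what such a helper might look like, assuming the object yielded by capture_log_output exposes a get_output() method returning the captured log text; both the method body and that assumption are illustrative, not the actual test-suite code:

def _check_logs(self, logs, document_id, expected_events):
    # Assumed helper: verifies each expected audit event was logged for the
    # given document. get_output() is an assumption about capture_log_output.
    output = logs.get_output()
    for event in expected_events:
        self.assertIn(event, output)
    self.assertIn(document_id, output)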
class KafkaProcessor(PillowProcessor):
    """
    Processor that pushes changes to Kafka
    """

    def __init__(self, kafka, data_source_type, data_source_name):
        self._kafka = kafka
        self._producer = ChangeProducer(self._kafka)
        self._data_source_type = data_source_type
        self._data_source_name = data_source_name

    def process_change(self, pillow_instance, change, do_set_checkpoint=False):
        document_type = _get_document_type(change.document)
        if document_type:
            assert change.document is not None
            change_meta = ChangeMeta(
                document_id=change.id,
                data_source_type=self._data_source_type,
                data_source_name=self._data_source_name,
                document_type=document_type,
                document_subtype=_get_document_subtype(change.document),
                domain=change.document.get('domain', None),
                is_deletion=change.deleted,
            )
            self._producer.send_change(get_topic(document_type), change_meta)
def __init__(self, data_source_type, data_source_name):
    self._producer = ChangeProducer()
    self._data_source_type = data_source_type
    self._data_source_name = data_source_name
def __init__(self, kafka, data_source_type, data_source_name):
    self._kafka = kafka
    self._producer = ChangeProducer(self._kafka)
    self._data_source_type = data_source_type
    self._data_source_name = data_source_name
def __init__(self, data_source_type, data_source_name, default_topic):
    self._producer = ChangeProducer()
    self._data_source_type = data_source_type
    self._data_source_name = data_source_name
    self._default_topic = default_topic
from time import sleep

from django import db
from django.core.management import BaseCommand

import pytz
from psycopg2._psycopg import InterfaceError

from dimagi.utils.logging import notify_exception
from pillow_retry.api import process_pillow_retry
from pillow_retry.models import PillowError

from corehq.apps.change_feed.producer import ChangeProducer
from corehq.sql_db.util import handle_connection_failure

BATCH_SIZE = 10000

producer = ChangeProducer(auto_flush=False)


class PillowRetryEnqueuingOperation(BaseCommand):
    help = "Runs the Pillow Retry Queue"

    def handle(self, **options):
        while True:
            try:
                num_processed = self.process_queue()
            except Exception:
                num_processed = 0
                notify_exception(None, message="Could not fetch due survey actions")
            # Back off when less than a full batch was available.
            sleep_time = 10 if num_processed < BATCH_SIZE else 0
            sleep(sleep_time)
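The module-level producer is constructed with auto_flush=False, so the producer does not flush after every send; a worker is expected to flush once per batch, as _process_errors does in the commands above. A minimal sketch of that buffered-send contract, using only send_change and flush as seen elsewhere in this section; the function name and the shape of `changes` are illustrative, and the body of process_queue itself is not shown here:

def send_batch_sketch(changes):
    # `changes` is assumed to be an iterable of (topic, change_meta) pairs.
    for topic, change_meta in changes:
        producer.send_change(topic, change_meta)  # queued, not yet confirmed
    # flush() blocks until all outstanding sends have been delivered, so a
    # crash before this point can lose buffered changes for the batch.
    producer.flush()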