def create_buffer_table(client: ClickhouseHelper, db, table, table_map) -> bool:
    logging = get_basic_utilities().get(LOGGER)
    buffer_table = f'{db}.{table}_Buffer'
    if client.is_table_exists(buffer_table):
        return True
    # noinspection PyBroadException
    try:
        # noinspection SqlDialectInspection
        table_schema_query = '''
            SELECT create_table_query, engine_full
            FROM system.tables
            WHERE database = %(db)s AND name = %(table)s
        '''
        result = client.execute(query=table_schema_query,
                                params={'db': db, 'table': table})
        schema, engine_details = result[0]
        # strip the source table's engine clause (the DDL then ends with
        # "ENGINE = ") and point the statement at the buffer table instead
        schema = schema.replace(engine_details, '')
        schema = schema.replace(f'{db}.{table}', buffer_table)
        buffer = table_map['buffer']
        buffer_schema = (f"{schema} Buffer({db}, {table}, "
                         f"{buffer['num_layers']}, {buffer['min_time']}, "
                         f"{buffer['max_time']}, {buffer['min_rows']}, "
                         f"{buffer['max_rows']}, {buffer['min_bytes']}, "
                         f"{buffer['max_bytes']})")
        client.execute(query=buffer_schema)
        return True
    except Exception as e:
        logging.error(f'failed to create {buffer_table}: {e}', exc_info=True)
        return False
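# Illustrative result of create_buffer_table (hypothetical table name and
# buffer settings; real values come from the collection's schema mapper):
#
#   CREATE TABLE mydb.Events_Buffer (...) ENGINE =
#       Buffer(mydb, Events, 16, 10, 100, 10000, 1000000, 10000000, 100000000)
#
# Per ClickHouse Buffer-engine semantics, rows are flushed to mydb.Events when
# all of the min_* thresholds are met or any max_* threshold is exceeded.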
def on_consumer_failure(consumer_task: Task, e, trace):
    config, mail_client = get_basic_utilities().get_utils((CONFIG, SMTP_CLIENT))
    logging = get_logger()
    alert_config = config['alert']
    logging.error(f'{consumer_task.name} consumer failed: error {e}: {trace}')
    mail_client.send(alert_config["sender"], alert_config["receivers"],
                     f'CH-Sync: {consumer_task.name} consumer failed')
def get_logger():
    config, logging = get_basic_utilities().get_utils((CONFIG, LOGGER))
    if 'logs_path' in config['logs']:
        logs_path = str(
            pathlib.Path(config['logs']['logs_path']).joinpath('producer'))
        return prepare_logger(logs_path, os.getenv('env'))
    return logging
def on_consumer_terminate(consumer_task: Task):
    config, mail_client = get_basic_utilities().get_utils((CONFIG, SMTP_CLIENT))
    logging = get_logger()
    alert_config = config['alert']
    logging.error(f'{consumer_task.name} consumer terminated')
    mail_client.send(alert_config["sender"], alert_config["receivers"],
                     f'CH-Sync: {consumer_task.name} consumer terminated')
def main():
    utils = get_basic_utilities()
    config = utils.get(CONFIG)
    logging = get_logger()
    logging.info('starting the consumers')
    redis_config = config['redis']
    # start the supported consumers
    supported_consumers = get_supported_consumers()
    max_read_fails_allowed = config['consumer']['max_read_fails_allowed']
    min_up_time = config['consumer']['min_up_time']
    consumer_tasks = []
    for consumer in supported_consumers:
        task = Task(func=data_consumer, args=(consumer,), kwargs={},
                    name=consumer,
                    err_call_back=on_consumer_failure,
                    term_call_back=on_consumer_terminate,
                    max_restarts=max_read_fails_allowed,
                    min_up_time=min_up_time,
                    restart_delay=config['consumer']['restart_delay'],
                    redis=get_redis_client(redis_config['host'],
                                           redis_config['port'],
                                           redis_config['db']))
        task.start()
        consumer_tasks.append(task)
    # handle abnormal termination
    signal.signal(signal.SIGINT, lambda *x: exit_gracefully(consumer_tasks))
    signal.signal(signal.SIGTERM, lambda *x: exit_gracefully(consumer_tasks))
    # wait for all tasks to finish
    tasks_not_active = threading.Event()
    check_tasks_completed(consumer_tasks, tasks_not_active)
    tasks_not_active.wait()
    logging.info('consumers finished')
def handle_consumers(consumers, option):
    console = Console()
    utils = get_basic_utilities()
    config, logging = utils.get_utils((CONFIG, LOGGER))
    redis_config = config['redis']
    redis_helper = get_singleton_redis_client(redis_config['host'],
                                              redis_config['port'],
                                              redis_config['db'])
    task_manager = TaskManager(redis_helper)
    operation = options_map[option]
    active_consumers = []
    for consumer in consumers:
        status = task_manager.get_task_info(consumer)
        if not status or status == Status.COMPLETE.name:
            console.print(f'[bold red] {consumer} not running[/bold red]')
        else:
            active_consumers.append(consumer)
    if len(active_consumers) < 1:
        return
    if option == 'INFO':
        task_manager.display_info(active_consumers)
    if option == 'STATUS':
        task_manager.display_status(active_consumers)
    if option in (Status.ACTIVE.name, Status.INACTIVE.name,
                  Status.RESTARTING.name):
        for consumer in active_consumers:
            # noinspection PyArgumentList
            _ = operation(task_manager, consumer)
        task_manager.display_status(active_consumers)
    return
def delete_topics(topics, time_out=10):
    config = get_basic_utilities().get(CONFIG)
    kafka_config = config['kafka']
    server = f"{kafka_config['host']}:{kafka_config['port']}"
    admin_client = KafkaAdminClient(bootstrap_servers=server)
    consumer = KafkaConsumer(bootstrap_servers=server)
    active_topics = [topic for topic in consumer.topics() if topic in topics]
    admin_client.delete_topics(topics=active_topics)
    stop = threading.Event()
    all_deleted = False

    def is_deleted():
        # poll the broker until every requested topic is gone, then signal
        nonlocal all_deleted
        while not stop.is_set():
            current_topics = consumer.topics()
            for topic in active_topics:
                if topic in current_topics:
                    stop.wait(timeout=1)
                    break
            else:
                stop.set()
                all_deleted = True

    thread = threading.Thread(name='delete-topics', target=is_deleted)
    thread.start()
    stop.wait(timeout=time_out)
    stop.set()
    thread.join()
    return all_deleted
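# Illustrative usage (hypothetical topic names): waits up to 30 seconds for the
# broker to confirm the deletions before reporting success.
#
#   if not delete_topics(['users', 'orders'], time_out=30):
#       raise RuntimeError('kafka topics still present after timeout')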
def loader(load_all, exclude, collections, store_tick, batch_size):
    click.confirm('clickhouse table will be re-created with new data',
                  abort=True)
    utils = get_basic_utilities()
    logging = utils.get(LOGGER)
    if not load_all and not collections:
        logging.error('input not provided')
        return False
    enabled_consumers = get_supported_consumers()
    if load_all:
        all_collections = enabled_consumers
        collections = (all_collections if exclude is None else
                       [col for col in all_collections
                        if col not in exclude.split(',')])
    else:
        collections = collections.split(',')
    for collection in collections:
        # noinspection PyBroadException
        try:
            load_collection_data(collection, store_tick, batch_size)
        except Exception:
            logging.error(f'clickhouse {collection} loader failed: '
                          f'{traceback.format_exc()}')
            return False
    return True
def get_supported_consumers():
    utils = get_basic_utilities()
    config = utils.get(CONFIG)
    collections = config['producer']['sync']
    exclude = config['consumer']['exclude']
    return [collection for collection in collections
            if collection not in exclude]
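# Expected config shape (illustrative values; the keys match the lookups above):
#
#   producer:
#     sync: [users, orders, events]
#   consumer:
#     exclude: [events]
#
# With this config, get_supported_consumers() returns ['users', 'orders'].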
def create_test_table(table):
    config = get_basic_utilities().get(CONFIG)
    ch_client = get_ch_client_with_dict_config(config['clickhouse'])
    table_map = get_table_map(table)
    assert table_map is not None
    if ch_client.is_table_exists(table_map['clickhouse']):
        return True
    clickhouse_table = table_map['table_create']
    ch_client.execute(query=clickhouse_table)
    return True
def main():
    utils = get_basic_utilities()
    config, alert = utils.get_utils((CONFIG, SMTP_CLIENT))
    logging = get_logger()
    alert_config = config['alert']
    logging.info('log-producer started')
    # noinspection PyBroadException
    try:
        producer()
    except Exception:
        logging.error(f'producer failed: {traceback.format_exc()}')
        alert.send(alert_config["sender"], alert_config["receivers"],
                   'CH-Sync: Producer failed')
def cleanup():
    yield None
    config = get_basic_utilities().get(CONFIG)
    arango_client = get_singleton_arango_client(config['arango'])
    collection = 'test'
    collection_exists = collection in arango_client.db.collections
    if collection_exists:
        arango_client.db.collections[collection].delete()
        del arango_client.db.collections[collection]
    clickhouse: ClickhouseHelper = get_singleton_ch_client(
        config['clickhouse'])
    clickhouse.drop_table_if_exists(collection)
    clickhouse.drop_table_if_exists(f'{collection}_Buffer')
def get_table_map(table):
    config = get_basic_utilities().get(CONFIG)
    table_config = load_schema_mapper(f'{table}.yaml')
    schema = {
        'arango': table,
        'clickhouse': table_config['table_name'],
        'clickhouse_db': config['clickhouse']['database'],
        'table_create': table_config['table'],
        'schema': table_config['schema']
    }
    if 'buffer' in table_config:
        schema['buffer'] = table_config['buffer']
    if 'topic_config' in table_config:
        schema['topic_config'] = table_config['topic_config']
    return schema
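# Illustrative <table>.yaml mapper for a hypothetical 'events' collection.
# table_name, table and schema are required; buffer and topic_config are
# optional (the buffer keys are the ones create_buffer_table reads):
#
#   table_name: Events
#   table: |
#     CREATE TABLE IF NOT EXISTS mydb.Events (...) ENGINE = MergeTree ...
#   schema:
#     primary_key: _key
#     ...
#   buffer:
#     num_layers: 16
#     min_time: 10
#     max_time: 100
#     min_rows: 10000
#     max_rows: 1000000
#     min_bytes: 10000000
#     max_bytes: 100000000
#   topic_config:
#     retention.ms: '86400000'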
def validate_table_names(_, __, value):
    config = get_basic_utilities().get(CONFIG)
    allowed_tables = sorted(config['producer']['sync'])
    if value.strip() == '':
        tables = []
    else:
        tables = [table.strip() for table in value.split(',')]
    not_allowed = []
    for table in tables:
        if table not in allowed_tables:
            not_allowed.append(table)
    if len(not_allowed) > 0:
        raise click.BadParameter(
            'tables {} are not allowed.\nAllowed tables:\n{}'.format(
                ', '.join(not_allowed), ',\n'.join(allowed_tables)))
    return tables
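# Illustrative wiring (hypothetical command; shows how the click callback is
# attached so the comma-separated option arrives as a validated list):
#
#   @click.command()
#   @click.option('--tables', default='', callback=validate_table_names)
#   def sync(tables):
#       ...  # tables is now a list of allowed collection names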
def create_topic(table):
    config, logging = get_basic_utilities().get_utils((CONFIG, LOGGER))
    table_map = get_table_map_by_arango_collection(table)
    if not table_map:
        return False
    kafka_config = config['kafka']
    admin_client = KafkaAdminClient(
        bootstrap_servers=f"{kafka_config['host']}:{kafka_config['port']}",
    )
    # create kafka topic
    custom_topic_configs = table_map.get('topic_config', {})
    topic_config = {
        'name': table,
        'num_partitions': 1,
        'replication_factor': 1,
        'topic_configs': custom_topic_configs
    }
    new_topic = NewTopic(**topic_config)
    admin_client.create_topics([new_topic])
    logging.info(f'{table} topic created')
    return True
def get_supported_producers():
    utils = get_basic_utilities()
    config = utils.get(CONFIG)
    return config['producer']['sync']
def test_get_basic_utilities(basic_utilities):
    get_basic_utilities()
    assert True
def data_consumer(consumer_name, stop_event: threading.Event):
    config = get_basic_utilities().get(CONFIG)
    logging = get_logger()
    logging.info(f'{consumer_name} started')
    # initialize necessary config
    kafka_config, consumer_config = config['kafka'], config['consumer']
    # initialize redis
    redis_config = config['redis']
    redis_helper = get_singleton_redis_client(redis_config['host'],
                                              redis_config['port'],
                                              redis_config['db'])
    initial_tick = get_initial_tick_of_consumer(redis_helper, consumer_name)
    # initialize clickhouse
    ch_client = get_ch_client_with_dict_config(config['clickhouse'])
    table_map = get_table_map_by_arango_collection(consumer_name)
    if table_map is None:
        logging.error('table map is not available')
        return False
    use_buffer = 'buffer' in table_map
    ch_table = (f"{table_map['clickhouse']}_Buffer" if use_buffer
                else table_map['clickhouse'])
    primary_key = table_map['schema']['primary_key']
    # create buffer table if not present
    if use_buffer:
        created = create_buffer_table(ch_client, table_map['clickhouse_db'],
                                      table_map['clickhouse'], table_map)
        if not created:
            logging.error('failed to create buffer table')
            return False
    # initialize kafka consumer
    consumer = custom_connect_consumer(kafka_config['host'],
                                       kafka_config['port'],
                                       consumer_name, consumer_name)
    idle, max_records, time_out = (consumer_config['idle'],
                                   consumer_config['kafka_max_records'],
                                   consumer_config['kafka_poll_time_out'])
    # noinspection PyBroadException
    try:
        while not stop_event.is_set():
            logging.info(f'{consumer_name}: polling for messages')
            msg_pack = consumer.poll(timeout_ms=time_out,
                                     max_records=max_records)
            for topic, messages in msg_pack.items():
                documents = [{'offset': m.offset, 'doc': m.value}
                             for m in messages]
                documents = pre_process_documents(initial_tick, documents)
                # set to None to skip initial tick validation from now on
                if len(documents) > 0:
                    initial_tick = None
                # transform the documents
                documents, errors = transform_documents(table_map['schema'],
                                                        documents)
                log_error_documents(errors)
                # insert the documents
                processed_count = bulk_insert_documents(ch_client, ch_table,
                                                        documents)
                # log the documents
                for document in documents:
                    logging.info(f'{consumer_name}: processed: '
                                 f'{document[primary_key]}, '
                                 f'ver: {document["_ver"]}')
                logging.info(f'{consumer_name}: processed '
                             f'{processed_count} docs')
                # update offset in kafka
                consumer.commit()
            # idle the process
            is_messages_consumed = all_messages_consumed(consumer)
            if is_messages_consumed:
                logging.info(f'{consumer_name} process idle')
                stop_event.wait(timeout=idle)
    except Exception:
        consumer.close()
        raise
    logging.info(f'{consumer_name} exited gracefully')
def test_get_basic_utilities_singleton(basic_utilities):
    new_basic_utilities = get_basic_utilities()
    assert new_basic_utilities is basic_utilities
def basic_utilities():
    return get_basic_utilities()
def producer():
    utils = get_basic_utilities()
    config, alert = utils.get_utils((CONFIG, SMTP_CLIENT))
    logging = get_logger()
    producer_config = config['producer']
    arango_wal_client = get_wal_client({**config['arango'], **config['wal']})
    redis_helper = get_singleton_redis_client(config['redis']['host'],
                                              config['redis']['port'],
                                              config['redis']['db'])
    last_tick_file = open('last-tick.txt', 'w')
    arango_collections = get_arango_collections(config['arango'])
    collections_id_dict = {
        collection: meta.globallyUniqueId
        for collection, meta in arango_collections.items()
        if collection in producer_config['sync']
    }
    id_to_collection_dict = get_id_collection_map(collections_id_dict)
    logging.info(f'listening collections: {list(collections_id_dict.keys())}')
    # set last-tick as first tick during only the first start
    init_tick = set_tick_if_not_set(arango_wal_client, redis_helper)
    if init_tick:
        logging.info(f'stored initial tick: {init_tick}')
    _ = config['producer']['reader_batch']
    writer_timeout = config['producer']['writer_timeout']
    log_writer = get_log_writer(config['kafka']['host'],
                                config['kafka']['port'],
                                key_encode, json_encode)()
    exit_event = threading.Event()
    _ = Terminate(exit_event)
    while not exit_event.is_set():
        last_tick = get_last_processed_tick(redis_helper)
        logging.info(f'last processed tick: {last_tick}')
        logs_collector = collect_logs(arango_wal_client, last_tick, None,
                                      collections_id_dict)
        logs_generator = LogGenerator(logs_collector)
        for docs in logs_generator:
            tick_start = (docs['content'][0]['tick']
                          if len(docs['content']) > 0 else None)
            if not docs['from_present']:
                logging.error(f'ticks lost: asked for {last_tick} '
                              f'but got {tick_start}')
            # store in kafka
            log_writer.bulk_write(
                prepare_kafka_documents(id_to_collection_dict, docs,
                                        writer_timeout))
            log_writer.flush()
            # update tick only if valid
            if int(docs['last_included']) > 0:
                if updated_last_processed_tick(redis_helper,
                                               docs['last_included']):
                    update_file_last_tick(last_tick_file,
                                          docs['last_included'])
            # if is_processed is set to False, the data batch will be
            # processed again; always setting False would lead to an
            # infinite loop
            logs_generator.is_processed(True)
            logging.info(
                f'processed {f"{tick_start}-" if tick_start else ""}'
                f'{docs["last_included"]}: '
                f'overall {len(docs["content"])} docs')
            # handle termination call
            if exit_event.is_set():
                break
        logging.info('sleeping')
        exit_event.wait(timeout=config['producer']['idle'])
    logging.info('producer terminated gracefully')
def get_redis_client():
    config = get_basic_utilities().get(CONFIG)
    redis_helper = get_singleton_redis_client(config['redis']['host'],
                                              config['redis']['port'],
                                              config['redis']['db'])
    return redis_helper
def synchronizer(tables, clear):
    config, logging = get_basic_utilities().get_utils((CONFIG, LOGGER))
    redis_config = config['redis']
    redis_helper = get_singleton_redis_client(redis_config['host'],
                                              redis_config['port'],
                                              redis_config['db'])
    try:
        generate_config_file()
    except Exception as e:
        logging.error(f'unable to generate pm2 config file: {e}',
                      exc_info=True)
        return False
    pm2_config_path = get_config_path()
    producer_process = PM2('arango-producer', pm2_config_path)
    consumer_process = PM2('clickhouse-consumer', pm2_config_path)
    task_manager = TaskManager(redis_helper)
    # clear redis cache db if specified
    if clear:
        redis_helper.client.flushdb()
        logging.info('redis cache cleared')
    else:
        # delete consumer specific keys
        for table in tables:
            for key in redis_helper.client.keys(f'{table}*'):
                redis_helper.client.delete(key)
    # stop the producer process
    if not producer_process.stop():
        logging.error('unable to stop producer')
        return False
    # stop the consumer processes
    for table in tables:
        consumer_active = task_manager.ping(table)
        if consumer_active:
            result = task_manager.stop_task(table)
            if result == Status.INACTIVE.name:
                logging.info(f'stopped the consumer {table}')
            else:
                logging.error(f'unable to stop consumer {table}')
                return False
        else:
            logging.info(f'consumer {table} not active')
    # delete topics
    all_deleted = delete_topics(tables)
    if not all_deleted:
        logging.error('unable to delete all kafka topics')
        return False
    # create topics
    for table in tables:
        created = create_topic(table)
        if not created:
            logging.error(f'unable to create topic: {table}')
            return False
    # start producer process
    if not producer_process.start():
        logging.error('unable to start producer')
        return False
    # sync existing collection data
    for table in tables:
        is_data_loaded = load_collection_data(collection=table,
                                              store_tick=True,
                                              batch_size=100000)
        if is_data_loaded:
            logging.info('existing data loaded to clickhouse')
        else:
            logging.error(f'failed to load {table} data')
            return False
        # start the consumer
        if task_manager.ping(table):
            result = task_manager.start_task(table)
            if result == Status.ACTIVE.name:
                logging.info(f'{table} consumer process started')
            else:
                logging.error('unable to start consumer, restarting using pm2')
                if consumer_process.restart():
                    logging.info('pm2 consumer restarted')
                else:
                    logging.error('unable to restart pm2 consumers')
                    return False
    return True
def load_collection_data(collection, store_tick, batch_size):
    basic_utils = get_basic_utilities()
    config, logging, mail_client = basic_utils.get_utils(
        (CONFIG, LOGGER, SMTP_CLIENT))
    arango_client = get_singleton_arango_client(config['arango'])
    clickhouse: ClickhouseHelper = get_singleton_ch_client(
        config['clickhouse'])
    clickhouse_client: Client = clickhouse.client
    clickhouse_table_map = get_table_map_by_arango_collection(collection)
    clickhouse_table, clickhouse_db, clickhouse_table_schema = (
        clickhouse_table_map['clickhouse'],
        clickhouse_table_map['clickhouse_db'],
        clickhouse_table_map['schema'])
    # prepare the tables
    clickhouse_temp_table = f'{clickhouse_table}Temp'
    clickhouse.drop_table_if_exists(f'{clickhouse_db}.{clickhouse_temp_table}')
    temp_table, table_created = create_temporary_table(
        clickhouse_client, clickhouse_table_map['table_create'],
        f'{clickhouse_db}.{clickhouse_table}',
        f'{clickhouse_db}.{clickhouse_temp_table}')
    logging.info(f'temporary table created for {clickhouse_table}')
    # store current tick for the table in redis
    if store_tick:
        wal_client = get_wal_client({**config['arango'], **config['wal']})
        last_tick = wal_client.get_last_tick()
        redis_config = config['redis']
        redis_helper = get_singleton_redis_client(redis_config['host'],
                                                  redis_config['port'],
                                                  redis_config['db'])
        redis_helper.client.set(f'{collection}:last-tick', last_tick['tick'])
        logging.info(f'stored current wal tick: {last_tick}')
    logging.info('collect documents from arango')
    processed_documents = 0
    errors = 0
    for documents in get_all_documents(db_client=arango_client,
                                       col_name=collection,
                                       batch_size=batch_size):
        logging.info(f'documents collected from arango: {len(documents)} docs')
        # map the documents from arango to clickhouse documents
        for i in range(len(documents)):
            try:
                documents[i] = convert_to_ch_dict_using_schema(
                    clickhouse_table_schema, documents[i])
            except (TypeError, ValueError, KeyError):
                logging.document(f'doc: {documents[i]}')
                logging.document(f'error: {traceback.format_exc()}')
                documents[i] = None
                errors += 1
        # filter invalid documents
        documents = [doc for doc in documents if doc is not None]
        if len(documents) > 0:
            total_insertion = clickhouse.bulk_dict_doc_insert(
                documents, temp_table, list(documents[0].keys()), batch_size)
            logging.info(f'populated data on clickhouse: '
                         f'{total_insertion} docs')
            processed_documents += total_insertion
            logging.info(f'overall processed documents: '
                         f'{processed_documents} docs')
    logging.info('data populated on temporary table')
    clickhouse.drop_table_if_exists(f'{clickhouse_db}.{clickhouse_table}')
    logging.info(f'dropped table {clickhouse_table}')
    clickhouse.rename_table(temp_table, f'{clickhouse_db}.{clickhouse_table}')
    logging.info('table populated successfully')
    logging.info(f'incompatible documents: {errors}')
    # prepare buffer table
    if 'buffer' in clickhouse_table_map:
        clickhouse_buffer = f'{clickhouse_table}_Buffer'
        clickhouse.drop_table_if_exists(
            f'{clickhouse_db}.{clickhouse_buffer}')
        logging.info(f'dropped table {clickhouse_buffer}')
        create_buffer_table(clickhouse, clickhouse_db, clickhouse_table,
                            clickhouse_table_map)
        logging.info('buffer table created successfully')
    return True
def get_test_table():
    config = get_basic_utilities().get(CONFIG)
    arango_client = get_singleton_arango_client(config['arango'])
    if 'test' in arango_client.db.collections:
        return arango_client.db.collections['test']
    return arango_client.db.createCollection(name='test')