def cleanup():
    """Fixture-style teardown generator.

    Yields control to the test body first; everything after the yield runs
    as teardown: the 'test' arango collection is deleted (if present) and
    its clickhouse table plus the matching buffer table are dropped.
    """
    yield None
    cfg = get_basic_utilities().get(CONFIG)
    arango = get_singleton_arango_client(cfg['arango'])
    col_name = 'test'
    if col_name in arango.db.collections:
        arango.db.collections[col_name].delete()
        del arango.db.collections[col_name]
    ch_helper: ClickhouseHelper = get_singleton_ch_client(cfg['clickhouse'])
    ch_helper.drop_table_if_exists(col_name)
    ch_helper.drop_table_if_exists(f'{col_name}_Buffer')
def get_test_table():
    """Return the 'test' arango collection, creating it when absent."""
    cfg = get_basic_utilities().get(CONFIG)
    client = get_singleton_arango_client(cfg['arango'])
    collections = client.db.collections
    if 'test' not in collections:
        return client.db.createCollection(name='test')
    return collections['test']
def test_arango_connection(basic_utilities):
    """Smoke test: building an arango client from config must not raise."""
    arango_cfg = basic_utilities.get(CONFIG)['arango']
    get_singleton_arango_client(arango_cfg)
    return True
def test_get_singleton_arango_client(basic_utilities):
    """Repeated factory calls must hand back the very same client object."""
    arango_cfg = basic_utilities.get(CONFIG)['arango']
    first = get_singleton_arango_client(arango_cfg)
    second = get_singleton_arango_client(arango_cfg)
    return first is second
def get_arango_collections(arango_config):
    """Return the collections mapping of the singleton arango client."""
    return get_singleton_arango_client(arango_config).db.collections
def load_collection_data(collection, store_tick, batch_size):
    """Bulk-load an arango collection into its clickhouse table.

    Builds the data in a temporary clickhouse table, then swaps it in for
    the live table (drop + rename) so the live table stays queryable for
    the whole load. Rebuilds the buffer table when the mapping defines one.

    Args:
        collection: arango collection name to export.
        store_tick: when truthy, persist the current WAL tick in redis so an
            incremental follower can resume from this snapshot point.
        batch_size: number of documents fetched/inserted per round trip.

    Returns:
        True on success.
    """
    basic_utils = get_basic_utilities()
    config, logging, mail_client = basic_utils.get_utils((CONFIG, LOGGER, SMTP_CLIENT))
    arango_client = get_singleton_arango_client(config['arango'])
    clickhouse: ClickhouseHelper = get_singleton_ch_client(config['clickhouse'])
    clickhouse_client: Client = clickhouse.client
    clickhouse_table_map = get_table_map_by_arango_collection(collection)
    clickhouse_table, clickhouse_db, clickhouse_table_schema = (
        clickhouse_table_map['clickhouse'],
        clickhouse_table_map['clickhouse_db'],
        clickhouse_table_map['schema'])

    # prepare the tables: stage everything in a temp table first
    clickhouse_temp_table = f'{clickhouse_table}Temp'
    clickhouse.drop_table_if_exists(f'{clickhouse_db}.{clickhouse_temp_table}')
    temp_table, table_created = create_temporary_table(
        clickhouse_client, clickhouse_table_map['table_create'],
        f'{clickhouse_db}.{clickhouse_table}',
        f'{clickhouse_db}.{clickhouse_temp_table}')
    logging.info(f'temporary table created for {clickhouse_table}')

    # store current tick for the table in redis
    if store_tick:
        wal_client = get_wal_client({**config['arango'], **config['wal']})
        last_tick = wal_client.get_last_tick()
        redis_config = config['redis']
        redis_helper = get_singleton_redis_client(
            redis_config['host'], redis_config['port'], redis_config['db'])
        redis_helper.client.set(f'{collection}:last-tick', last_tick['tick'])
        logging.info(f'stored current wal tick: {last_tick}')

    logging.info('collect documents from arango')
    processed_documents = 0
    errors = 0
    for documents in get_all_documents(db_client=arango_client,
                                       col_name=collection,
                                       batch_size=batch_size):
        logging.info(f'documents collected from arango: {len(documents)} docs')
        # map arango documents to clickhouse rows; incompatible documents
        # are logged and skipped instead of aborting the whole load
        converted = []
        for doc in documents:
            try:
                converted.append(
                    convert_to_ch_dict_using_schema(clickhouse_table_schema, doc))
            except (TypeError, ValueError, KeyError):
                # BUG FIX: original called the non-existent logging.document(),
                # which would raise AttributeError on the first bad document
                logging.error(f'doc: {doc}')
                logging.error(f'error: \n{traceback.format_exc()}')
                errors += 1
        if converted:
            total_insertion = clickhouse.bulk_dict_doc_insert(
                converted, temp_table, list(converted[0].keys()), batch_size)
            logging.info(f'populated data on clickhouse: {total_insertion} docs')
            processed_documents += total_insertion
            logging.info(f'overall processed documents: {processed_documents} docs')
    logging.info('data populated on temporary table')

    # swap: drop the live table and rename the temp table into its place
    clickhouse.drop_table_if_exists(f'{clickhouse_db}.{clickhouse_table}')
    logging.info(f'dropped table {clickhouse_table}')
    clickhouse.rename_table(temp_table, f'{clickhouse_db}.{clickhouse_table}')
    logging.info('table populated successfully')
    logging.info(f'Incompatible documents: {errors}')

    # prepare buffer table
    if 'buffer' in clickhouse_table_map:
        clickhouse_buffer = f'{clickhouse_table}_Buffer'
        clickhouse.drop_table_if_exists(f'{clickhouse_db}.{clickhouse_buffer}')
        logging.info(f'dropped table {clickhouse_buffer}')
        create_buffer_table(clickhouse, clickhouse_db, clickhouse_table,
                            clickhouse_table_map)
        logging.info('buffer table created successfully')
    return True