import multiprocessing
import os
import traceback

# Assumed module-level helpers, not defined here: setup_logs, check_fname,
# load_plugin, invoke_cmd_worker, WorkerResults.


def aggregate_raw_data(
        data_dir, results_dir, plugin_dir, plugin,
        start_date=None, end_date=None, events_limit=0,
        worker_num=3 * multiprocessing.cpu_count() // 4):
    """
    Workers-aggregator subpipeline:
    0. Load worker and aggregator classes from the specified plugin
    1. Run workers in parallel (based on server stats files)
    2. Accumulate results with the aggregator
    3. Run the aggregator's post-processing
    """
    setup_logs()
    logger = multiprocessing.get_logger()
    pool = multiprocessing.Pool(worker_num)
    try:
        files = [
            os.path.join(data_dir, fname)
            for fname in os.listdir(data_dir)
            if check_fname(fname, start_date, end_date)
        ]
        items = (
            (plugin_dir, plugin, fpath, events_limit)
            for fpath in files
        )
        aggregator = load_plugin(
            plugin, plugin_dir=plugin_dir
        ).DataAggregator(results_dir)
        logger.info('Aggregator: aggregate')
        # pool.imap preserves input order, so the enumerate() index maps
        # each result back to its source file for error reporting.
        for i, results in enumerate(pool.imap(invoke_cmd_worker, items)):
            try:
                aggregator.aggregate(WorkerResults.loads_object(results))
            except Exception:
                logger.error(
                    'Aggregator: processing of %s failed: %s',
                    files[i], traceback.format_exc()
                )
        logger.info('Aggregator: post_aggregate')
        aggregator.post_aggregate(pool)
        logger.info('Aggregator: done')
    finally:
        pool.terminate()
        pool.join()
    return aggregator
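# The plugin contract used above is not shown in this module. Below is a
# minimal sketch of what load_plugin() is assumed to return: a module
# exposing a DataAggregator class with aggregate() and post_aggregate()
# hooks. All names and the dict-shaped worker payload are illustrative
# assumptions, not the pipeline's actual API.
class DataAggregator(object):
    """Accumulates per-file worker results, then writes totals out."""

    def __init__(self, results_dir):
        self.results_dir = results_dir
        self.totals = {}

    def aggregate(self, worker_results):
        # Merge one worker's per-key counters into the running totals.
        for key, value in worker_results.items():
            self.totals[key] = self.totals.get(key, 0) + value

    def post_aggregate(self, pool):
        # Final, typically IO-bound step; `pool` is available for
        # parallelising writes if needed.
        with open(os.path.join(self.results_dir, 'totals.txt'), 'w') as out:
            for key, value in sorted(self.totals.items()):
                out.write('%s\t%d\n' % (key, value))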
def aggregate_raw_data(
        data_dir, results_dir, plugin_dir, plugin,
        start_date=None, end_date=None, events_limit=0,
        worker_num=3 * multiprocessing.cpu_count() // 4):
    """
    Workers-aggregator subpipeline:
    0. Load worker and aggregator classes from the specified plugin
    1. Run workers in parallel (based on server stats files)
    2. Accumulate results with the aggregator
    3. Run the aggregator's post-processing
    """
    setup_logs()
    logger = multiprocessing.get_logger()
    pool = multiprocessing.Pool(worker_num)
    try:
        items = (
            (plugin_dir, plugin, os.path.join(data_dir, fname), events_limit)
            for fname in os.listdir(data_dir)
            if check_fname(fname, start_date, end_date)
        )
        aggregator = load_plugin(
            plugin, plugin_dir=plugin_dir
        ).DataAggregator(results_dir)
        logger.info('Aggregator: aggregate')
        # imap_unordered yields each result as soon as its worker finishes,
        # trading result ordering (and per-file error attribution) for
        # throughput.
        for results in pool.imap_unordered(invoke_cmd_worker, items):
            aggregator.aggregate(WorkerResults.loads_object(results))
        logger.info('Aggregator: post_aggregate')
        aggregator.post_aggregate(pool)
        logger.info('Aggregator: done')
    finally:
        pool.terminate()
        pool.join()
    return aggregator
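# This version switches to imap_unordered and drops the per-file error
# message: results no longer arrive in submission order, so an enumerate()
# index cannot be mapped back to the input file. A self-contained
# demonstration of the ordering difference (the names here are
# illustrative, not part of the pipeline):
def _square(x):
    return x * x

if __name__ == '__main__':
    with multiprocessing.Pool(4) as pool:
        # imap yields results in input order; imap_unordered yields each
        # result as soon as its worker finishes, in arbitrary order.
        ordered = list(pool.imap(_square, range(8)))
        unordered = sorted(pool.imap_unordered(_square, range(8)))
        assert ordered == unordered  # same values, different arrival order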
def aggregate_raw_data(data_dir, results_dir, plugin_dir, plugin,
                       start_date=None, end_date=None, events_limit=0,
                       worker_num=DEFAULT_WORKER_NUM):
    """Workers-aggregator subpipeline.

    0. Load worker and aggregator classes from the specified plugin
    1. Run workers in parallel (based on server stats files)
    2. Accumulate results with the aggregator
    3. Run the aggregator's post-processing
    """
    setup_logs()
    logger = multiprocessing.get_logger()
    files = [
        os.path.join(data_dir, fname)
        for fname in sorted(os.listdir(data_dir))
        if check_fname(fname, start_date, end_date)
    ]
    tasks = [(plugin_dir, plugin, fpath, events_limit) for fpath in files]
    aggregator = load_plugin(
        plugin, plugin_dir=plugin_dir
    ).DataAggregator(results_dir)
    logger.info('Aggregator: start workers')
    # Create the pools before the main process accumulates more memory, and
    # let workers live forever (the default, maxtasksperchild=None) to avoid
    # spontaneous re-forking mid-run.
    worker_pool = multiprocessing.Pool(worker_num)
    # To be safe against resource leaks, use separate pools for the
    # work-aggregate phase and for post-aggregation. Most post-aggregation
    # tasks are heavily disk-IO bound, so fewer workers suffice there.
    post_aggregator_pool = multiprocessing.Pool(worker_num // 2)
    try:
        engine = worker_pool.imap_unordered
        batch_size = 2 * worker_num
        # Ceiling division: process a trailing partial batch without adding
        # an empty batch when len(tasks) divides evenly.
        batch_number = (len(tasks) + batch_size - 1) // batch_size
        for batch_no in range(batch_number):
            batch_start = batch_no * batch_size
            batch_tasks = tasks[batch_start:batch_start + batch_size]
            logger.info('Aggregator: batch %d is being aggregated: %s',
                        batch_no, batch_tasks)
            # In this version invoke_cmd_worker returns (file_name, results),
            # so failures can be attributed despite unordered completion.
            for file_name, results in engine(invoke_cmd_worker, batch_tasks):
                try:
                    results = WorkerResults.loads_object(results)
                    logger.info('Aggregator: task %s is being aggregated',
                                file_name)
                    aggregator.aggregate(results)
                    logger.info('Aggregator: task %s done', file_name)
                except Exception as e:
                    logger.exception('Aggregator: task %s failed:\n%s',
                                     file_name, e)
    finally:
        worker_pool.terminate()
        worker_pool.join()
    logger.info('Aggregator: post_aggregate')
    try:
        aggregator.post_aggregate(pool=post_aggregator_pool)
    finally:
        post_aggregator_pool.terminate()
        post_aggregator_pool.join()
    logger.info('Aggregator: done')
    return aggregator
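# A hedged usage sketch of the final version. The paths, plugin name, and
# string-typed date bounds are assumptions for illustration; check_fname()
# defines the actual date-filtering contract.
if __name__ == '__main__':
    aggregator = aggregate_raw_data(
        data_dir='/var/lib/stats/raw',          # per-server stats files
        results_dir='/var/lib/stats/results',   # where the aggregator writes
        plugin_dir='./plugins',
        plugin='events_per_host',               # hypothetical plugin name
        start_date='2016-01-01',
        end_date='2016-01-31',
        events_limit=0,
    )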