def generated_chunked_parallelized_results(
    self,
    partially_bound_function,
    tasks,
    n_processes,
    chunksize=1,
):
    # multiprocessing.Pool with maxtasksperchild=1 recycles each worker
    # process after a single batch, bounding per-worker memory growth
    with Pool(n_processes, maxtasksperchild=1) as pool:
        for result in pool.map(
            partially_bound_function,
            [list(task_batch) for task_batch in Batch(tasks, chunksize)],
        ):
            yield result
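# `Batch` is referenced above but not defined in this excerpt. A minimal
# sketch of the contract it is assumed to satisfy (lazily yielding chunks
# of at most `size` items); the project's actual implementation may differ.
from itertools import islice


def batch_sketch(iterable, size):
    """Yield successive lists of at most `size` items from `iterable`."""
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, size))
        if not chunk:
            return
        yield chunk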
def process_query_tasks(self, query_tasks):
    """Run queries by table

    Will run preparation (e.g. create table) and finalize (e.g. create
    index) tasks in the main process, but delegate inserts to rq Jobs in
    batches of 25

    Args:
        query_tasks (dict) - keys should be table names and values should
            be dicts. Each inner dict should have up to three keys, each
            with a list of queries: 'prepare' (setting up the table),
            'inserts' (insert commands to populate the table), 'finalize'
            (finishing table setup after all inserts have run)

            Example: {
                'table_one': {
                    'prepare': ['create table table_one (col1 varchar)'],
                    'inserts': [
                        'insert into table_one values (\'a\')',
                        'insert into table_one values (\'b\')',
                    ],
                    'finalize': ['create index on table_one (col1)'],
                }
            }
    """
    for table_name, tasks in query_tasks.items():
        logger.spam(f"Processing features for {table_name}")
        self.feature_generator.run_commands(tasks.get("prepare", []))
        insert_batches = [
            list(task_batch)
            for task_batch in Batch(tasks.get("inserts", []), 25)
        ]
        jobs = [
            self.queue.enqueue(
                self.feature_generator.run_commands,
                insert_batch,
                job_timeout=DEFAULT_TIMEOUT,
                result_ttl=DEFAULT_TIMEOUT,
                ttl=DEFAULT_TIMEOUT,
            )
            for insert_batch in insert_batches
        ]
        self.wait_for(jobs)
        self.feature_generator.run_commands(tasks.get("finalize", []))
        logger.debug(f"{table_name} completed")
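# `self.wait_for(jobs)` is not shown in this excerpt. A minimal sketch,
# assuming rq's Job API (job.is_finished, job.is_failed); the polling
# interval and the choice to raise on failure are assumptions, not the
# project's confirmed behavior.
import time


def wait_for_sketch(jobs, poll_seconds=5):
    """Block until every rq job finishes; raise if any job failed."""
    while not all(job.is_finished or job.is_failed for job in jobs):
        time.sleep(poll_seconds)
    failed = [job for job in jobs if job.is_failed]
    if failed:
        raise RuntimeError(f"{len(failed)} insert job(s) failed")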
def process_query_tasks(self, query_tasks): logging.info("Processing query tasks with %s processes", self.n_db_processes) for table_name, tasks in query_tasks.items(): logging.info("Processing features for %s", table_name) self.feature_generator.run_commands(tasks.get("prepare", [])) partial_insert = partial(insert_into_table, feature_generator=self.feature_generator) insert_batches = [ list(task_batch) for task_batch in Batch(tasks.get("inserts", []), 25) ] parallelize(partial_insert, insert_batches, n_processes=self.n_db_processes) self.feature_generator.run_commands(tasks.get("finalize", [])) logging.info("%s completed", table_name)
def generated_chunked_parallelized_results(
    self, partially_bound_function, tasks, n_processes, chunksize=1
):
    # pebble's ProcessPool (max_tasks=1 recycles each worker after one
    # batch); future.result() returns an iterator over per-batch results
    with ProcessPool(n_processes, max_tasks=1) as pool:
        future = pool.map(
            partially_bound_function,
            [list(task_batch) for task_batch in Batch(tasks, chunksize)],
        )
        iterator = future.result()
        while True:
            try:
                yield next(iterator)
            except StopIteration:
                break
            except Exception:
                # a failed or killed child raises here; log it and keep
                # consuming the remaining batches
                logging.exception('Child failure')
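# Design note: unlike multiprocessing.Pool.map in the first variant, which
# blocks until every batch completes, pebble's MapFuture iterator re-raises
# each child's exception in the parent, so a single failed batch is logged
# without aborting the remaining ones. Hypothetical usage sketch;
# `run_batch` and `all_tasks` are illustrative names, not part of this code:
def consume_results_sketch(experiment, run_batch, all_tasks):
    """Drain the generator, collecting one result per surviving batch."""
    return list(
        experiment.generated_chunked_parallelized_results(
            run_batch, all_tasks, n_processes=4, chunksize=10
        )
    )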