def bootstrap_within_guides(s, n_cells=100, n_reps=10000, statistic=np.mean, n_jobs=1, tqdm=False):
    rng = np.random.default_rng()
    guide_values = {k: g.values for k, g in s.groupby('sgRNA')}
    guides = list(guide_values)

    if tqdm:
        reps = tqdm_auto(range(n_reps))
    else:
        reps = range(n_reps)

    def bootstrap(guide_values, guides, n_cells, statistic):
        rep_guide = rng.choice(guides)
        vals = guide_values[rep_guide]
        return statistic(vals[rng.integers(len(vals), size=n_cells)])

    if n_jobs != 1:
        bootstrapped = Parallel(n_jobs=n_jobs)(
            delayed(bootstrap)(guide_values, guides, n_cells, statistic) for _ in reps)
    else:
        bootstrapped = [
            bootstrap(guide_values, guides, n_cells, statistic) for _ in reps
        ]

    return np.array(bootstrapped)

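# Usage sketch (hypothetical data, not from the source): `bootstrap_within_guides`
# expects a pandas Series of per-cell values whose index carries an 'sgRNA' level,
# so that `s.groupby('sgRNA')` groups cells by guide before resampling.
import numpy as np
import pandas as pd

demo_rng = np.random.default_rng(0)
demo = pd.Series(
    demo_rng.normal(size=300),
    index=pd.Index(np.repeat(['sgRNA_A', 'sgRNA_B', 'sgRNA_C'], 100), name='sgRNA'),
)
null_means = bootstrap_within_guides(demo, n_cells=50, n_reps=1000)
print(null_means.mean(), null_means.std())
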
def format_save_pdf_documents(self, output_folder: dataiku.Folder, output_df: pd.DataFrame) -> Tuple[int, int]:
    """Open PDF documents in a `dataiku.Folder`, draw text bounding polygons and save them to another folder"""
    df_iterator = (index_series_pair[1].to_dict() for index_series_pair in output_df.iterrows())
    len_iterator = len(output_df.index)
    api_results = []
    start = perf_counter()
    logging.info(f"Formatting and saving {len_iterator} PDF page(s) to output folder...")
    with ThreadPoolExecutor(max_workers=self.parallel_workers) as pool:
        futures = [
            pool.submit(
                self.format_save_pdf_document,
                output_folder=output_folder,
                pdf_path=row[self.doc_handler.SPLITTED_PATH_COLUMN],
                response=safe_json_loads(row[self.api_column_names.response]),
            )
            for row in df_iterator
        ]
        for future in tqdm_auto(as_completed(futures), total=len_iterator):
            api_results.append(future.result())
    num_success = sum(api_results)
    num_error = len(api_results) - num_success
    logging.info(
        f"Formatting and saving {len_iterator} PDF page(s) to output folder: "
        f"{num_success} succeeded, {num_error} failed in {(perf_counter() - start):.2f} seconds."
    )
    return (num_success, num_error)

def tqdm(*args, disable=TqdmDisableOption.default, **kwargs):
    from tqdm.notebook import tqdm as tqdm_notebook  # pylint: disable=import-outside-toplevel
    from tqdm.auto import tqdm as tqdm_auto  # pylint: disable=import-outside-toplevel

    # To tqdm_notebook, None means do not display. To standard tqdm, None means
    # display only when connected to a TTY.
    if disable == TqdmDisableOption.default:
        disable = False if tqdm_auto == tqdm_notebook else None
    return tqdm_auto(*args, disable=disable, **kwargs)

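# Usage sketch for the wrapper above (assumption: `TqdmDisableOption` is an
# enum-like object defined elsewhere in the codebase with a `default` member;
# it is not shown in this snippet). `desc` and `disable` are standard tqdm kwargs.
for _ in tqdm(range(100), desc="demo"):
    pass  # with the default sentinel, the bar auto-selects notebook vs. console rendering

for _ in tqdm(range(100), disable=True):
    pass  # an explicit value bypasses the default heuristic entirely
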
def split_all_documents(
    self,
    path_df: pd.DataFrame,
    input_folder: dataiku.Folder,
    output_folder: dataiku.Folder,
    path_column: AnyStr = PATH_COLUMN,
) -> pd.DataFrame:
    """Split several PDF or TIFF document files into multiple pages and save them as files in another folder

    Args:
        path_df: DataFrame with one column named `path_column` with all the PDF/TIFF file paths
        input_folder: `dataiku.Folder` where the input PDF/TIFF files are stored
        output_folder: `dataiku.Folder` where files will be saved
        path_column: Name of the path column in the input dataframe

    Returns:
        DataFrame with two columns:
            1. `path_column`: Paths of the input documents
            2. `self.SPLITTED_PATH_COLUMN`: Paths of the split files

    """
    start = perf_counter()
    logging.info(f"Splitting {len(path_df.index)} document(s) and saving each page to output folder...")
    results = []
    with ThreadPoolExecutor(max_workers=self.parallel_workers) as pool:
        futures = [
            pool.submit(self.split_document, input_folder=input_folder, output_folder=output_folder, input_path=input_path)
            for input_path in path_df[path_column]
        ]
        for future in tqdm_auto(as_completed(futures), total=len(path_df.index)):
            results.append(future.result())
    num_success = sum([result[self.OUTPUT_PATH_LIST_KEY][0] != "" for result in results])
    num_error = len(results) - num_success
    num_pages = sum([len(result[self.OUTPUT_PATH_LIST_KEY]) for result in results]) - num_error
    if num_pages == 0:
        raise DocumentSplitError("Could not split any document")
    logging.info(
        f"Splitting {len(path_df.index)} document(s) and saving each page to output folder: "
        f"{num_success} document(s) succeeded generating {num_pages} page(s), "
        f"{num_error} document(s) failed in {(perf_counter() - start):.2f} seconds."
    )
    output_df = pd.DataFrame([
        OrderedDict([(path_column, result[self.INPUT_PATH_KEY]), (self.SPLITTED_PATH_COLUMN, output_path)])
        for result in results
        for output_path in result[self.OUTPUT_PATH_LIST_KEY]
    ])
    return output_df

def merge_all_documents(
    self,
    path_df: pd.DataFrame,
    input_folder: dataiku.Folder,
    output_folder: dataiku.Folder,
    path_column: AnyStr = PATH_COLUMN,
) -> pd.DataFrame:
    """Merge several PDF or TIFF documents after splitting by `self.split_all_documents`

    Bring balance to the force.

    Args:
        path_df: DataFrame with two columns - cf. output of `self.split_all_documents`
            1. `path_column`: Paths of the input documents
            2. `self.SPLITTED_PATH_COLUMN`: Paths of the split files
        input_folder: `dataiku.Folder` where the input PDF/TIFF files are stored
        output_folder: `dataiku.Folder` where the merged PDF/TIFF file will be saved
        path_column: Name of the path column in the input dataframe

    """
    output_df_list = path_df.groupby(path_column)[self.SPLITTED_PATH_COLUMN].apply(list).reset_index()
    start = perf_counter()
    logging.info(f"Merging and saving {len(path_df.index)} page(s) of {len(output_df_list.index)} document(s)...")
    results = []
    with ThreadPoolExecutor(max_workers=self.parallel_workers) as pool:
        futures = [
            pool.submit(
                self.merge_document,
                input_folder=input_folder,
                output_folder=output_folder,
                input_path_list=row[1],
                output_path=row[0],
            )
            for row in output_df_list.itertuples(index=False)
        ]
        for future in tqdm_auto(as_completed(futures), total=len(output_df_list.index)):
            results.append(future.result())
    num_success = sum([1 if output_path != "" else 0 for output_path in results])
    num_error = len(results) - num_success
    logging.info(
        f"Merging and saving {len(path_df.index)} page(s) of {len(output_df_list.index)} document(s): "
        f"{num_success} document(s) succeeded, {num_error} failed in {(perf_counter() - start):.2f} seconds."
    )
    page_numbers = path_df[self.SPLITTED_PATH_COLUMN].astype(str).apply(self.extract_page_number_from_path)
    path_df.insert(loc=1, column=self.PAGE_NUMBER_COLUMN, value=page_numbers)
    del path_df[self.SPLITTED_PATH_COLUMN]
    return path_df

def format_save_images(
    self,
    output_folder: dataiku.Folder,
    output_df: pd.DataFrame = None,
    path_column: AnyStr = PATH_COLUMN,
    verbose: bool = True,
) -> Tuple[int, int]:
    """Generic method to apply `self.format_save_image` to all images using an `output_df` with API responses

    Do not override this method!
    """
    if output_df is None:
        output_df = self.output_df
    df_iterator = (index_series_pair[1].to_dict() for index_series_pair in output_df.iterrows())
    len_iterator = len(output_df.index)
    if verbose:
        logging.info(f"Formatting and saving {len_iterator} image(s) to output folder...")
    start = perf_counter()
    api_results = []
    with ThreadPoolExecutor(max_workers=self.parallel_workers) as pool:
        futures = [
            pool.submit(
                self.format_save_image,
                output_folder=output_folder,
                image_path=row[path_column],
                response=safe_json_loads(row[self.api_column_names.response]),
            )
            for row in df_iterator
        ]
        for future in tqdm_auto(as_completed(futures), total=len_iterator):
            api_results.append(future.result())
    num_success = sum(api_results)
    num_error = len(api_results) - num_success
    if verbose:
        logging.info(
            f"Formatting and saving {len_iterator} image(s) to output folder: "
            f"{num_success} image(s) succeeded, {num_error} failed in {(perf_counter() - start):.2f} seconds."
        )
    return (num_success, num_error)

def _update_state(self, task_id, task_state):
    if task_id not in self.progresses:
        total = task_state.total_block_count
        self.progresses[task_id] = tqdm_auto(total=total, desc=task_id + " ▶", unit='blocks', leave=True)
    self.progresses[task_id].set_postfix({
        '⧗': task_state.pending_count,
        '▶': task_state.processing_count,
        '✔': task_state.completed_count,
        '✗': task_state.failed_count,
        '∅': task_state.orphaned_count
    })
    completed = task_state.completed_count
    delta = completed - self.progresses[task_id].n
    self.progresses[task_id].update(delta)

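# Usage sketch with hypothetical stand-ins (the reporter object and the TaskState
# fields below are assumptions; the field names mirror exactly what `_update_state`
# reads above). `_update_state` is a method extracted from a larger class, so here
# it is called as a plain function with an explicit `self`.
from dataclasses import dataclass
from types import SimpleNamespace
from tqdm.auto import tqdm as tqdm_auto

@dataclass
class FakeTaskState:
    total_block_count: int = 10
    pending_count: int = 0
    processing_count: int = 0
    completed_count: int = 0
    failed_count: int = 0
    orphaned_count: int = 0

reporter = SimpleNamespace(progresses={})
for done in range(0, 11, 2):
    _update_state(reporter, "task-42", FakeTaskState(completed_count=done, pending_count=10 - done))
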
def make_progress_bar(*args, **kwargs):
    """Create an iterable wrapped in a progress bar if available.

    Returns a tqdm-based progress bar when tqdm is installed,
    otherwise falls back to the plain iterable passed in.

    Returns
    -------
    iterable or tqdm
        tqdm-based progress bar or the original iterable
    """
    pbar = args[0]
    try:
        from tqdm.auto import tqdm as tqdm_auto
        pbar = tqdm_auto(*args, **kwargs)
    except Exception as e:
        logging.warning(
            "No prerequisites (tqdm) installed for interactive progress bar, "
            "continuing without one. See the output in the console "
            "or check installation instructions. "
            f"{e}")
    return pbar

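# Usage sketch: wrap any iterable; if tqdm is not installed, the original
# iterable is returned unchanged and the loop still runs, just without a bar.
for record in make_progress_bar(range(1000), desc="processing records"):
    pass  # do the actual work here
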
def bootstrap_cells(s, n_cells=100, n_reps=10000, statistic=np.mean, n_jobs=1, tqdm=False):
    rng = np.random.default_rng()
    vals = s.values

    def bootstrap(vals, n_cells, statistic):
        return statistic(vals[rng.integers(len(vals), size=n_cells)])

    if tqdm:
        reps = tqdm_auto(range(n_reps))
    else:
        reps = range(n_reps)

    if n_jobs != 1:
        bootstrapped = Parallel(n_jobs=n_jobs)(
            delayed(bootstrap)(vals, n_cells, statistic) for _ in reps)
    else:
        bootstrapped = [bootstrap(vals, n_cells, statistic) for _ in reps]

    return np.array(bootstrapped)

def format_save_images(self, output_folder: dataiku.Folder):
    partition = output_folder.writePartition if output_folder.writePartition else ""
    output_folder.clear_partition(partition)
    df_iterator = (i[1].to_dict() for i in self.output_df.iterrows())
    len_iterator = len(self.output_df.index)
    logging.info("Saving bounding boxes to output folder...")
    api_results = []
    with ThreadPoolExecutor(max_workers=self.parallel_workers) as pool:
        futures = [
            pool.submit(
                self.format_save_image,
                output_folder=output_folder,
                image_path=row[IMAGE_PATH_COLUMN],
                response=safe_json_loads(row[self.api_column_names.response]),
            )
            for row in df_iterator
        ]
        for f in tqdm_auto(as_completed(futures), total=len_iterator):
            api_results.append(f.result())
    num_success = sum(api_results)
    num_error = len(api_results) - num_success
    logging.info(
        "Saving bounding boxes to output folder: {} images succeeded, {} failed"
        .format(num_success, num_error))

def progress_bar_factory(num_samples, num_chains):
    """Factory that builds a tqdm progress bar decorator for a `lax.fori_loop` body,
    updating one bar per chain via host callbacks
    """
    if num_samples > 20:
        print_rate = int(num_samples / 20)
    else:
        print_rate = 1
    remainder = num_samples % print_rate

    tqdm_bars = {}
    finished_chains = []
    for chain in range(num_chains):
        tqdm_bars[chain] = tqdm_auto(range(num_samples), position=chain)
        tqdm_bars[chain].set_description("Compiling.. ", refresh=True)

    def _update_tqdm(arg, transform, device):
        chain = int(str(device)[4:])
        tqdm_bars[chain].set_description(f"Running chain {chain}", refresh=False)
        tqdm_bars[chain].update(arg)

    def _close_tqdm(arg, transform, device):
        chain = int(str(device)[4:])
        tqdm_bars[chain].update(arg)
        finished_chains.append(chain)
        if len(finished_chains) == num_chains:
            for chain in range(num_chains):
                tqdm_bars[chain].close()

    def _update_progress_bar(iter_num):
        """Updates the tqdm progress bar of a JAX loop only if the iteration number
        is a multiple of the print_rate

        Usage: carry = progress_bar((iter_num, print_rate), carry)
        """
        _ = lax.cond(
            iter_num == 1,
            lambda _: host_callback.id_tap(_update_tqdm, 0, result=iter_num, tap_with_device=True),
            lambda _: iter_num,
            operand=None,
        )
        _ = lax.cond(
            iter_num % print_rate == 0,
            lambda _: host_callback.id_tap(_update_tqdm, print_rate, result=iter_num, tap_with_device=True),
            lambda _: iter_num,
            operand=None,
        )
        _ = lax.cond(
            iter_num == num_samples,
            lambda _: host_callback.id_tap(_close_tqdm, remainder, result=iter_num, tap_with_device=True),
            lambda _: iter_num,
            operand=None,
        )

    def progress_bar_fori_loop(func):
        """Decorator that adds a progress bar to `body_fun` used in `lax.fori_loop`.
        Note that `body_fun` must be looping over a tuple whose first element is
        `np.arange(num_samples)`. This means that `iter_num` is the current iteration number
        """
        def wrapper_progress_bar(i, vals):
            result = func(i, vals)
            _update_progress_bar(i + 1)
            return result
        return wrapper_progress_bar

    return progress_bar_fori_loop

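# Usage sketch (hypothetical loop body): decorate the function passed to
# `lax.fori_loop` so the chain's bar advances every `print_rate` iterations.
# Assumes a JAX version where `jax.experimental.host_callback.id_tap` (used by
# the factory above) is still available; it is deprecated in recent releases,
# and the device-string parsing above is also version dependent.
import jax.numpy as jnp
from jax import lax

num_samples, num_chains = 1000, 1
progress_fori_loop = progress_bar_factory(num_samples, num_chains)

@progress_fori_loop
def body_fun(i, carry):
    return carry + 1.0  # stand-in for one sampling step

final_carry = lax.fori_loop(0, num_samples, body_fun, jnp.zeros(()))
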
def api_parallelizer(input_df: pd.DataFrame,
                     api_call_function: Callable,
                     api_exceptions: Union[Exception, Tuple[Exception]],
                     column_prefix: AnyStr,
                     parallel_workers: int = DEFAULT_PARALLEL_WORKERS,
                     api_support_batch: bool = DEFAULT_API_SUPPORT_BATCH,
                     batch_size: int = DEFAULT_BATCH_SIZE,
                     error_handling: ErrorHandlingEnum = ErrorHandlingEnum.LOG,
                     verbose: bool = DEFAULT_VERBOSE,
                     **api_call_function_kwargs) -> pd.DataFrame:
    """Apply an API call function in parallel to a pandas.DataFrame.

    The DataFrame is passed to the function as row dictionaries.
    Parallelism works by:
    - (default) sending multiple concurrent threads
    - if the API supports it, sending batches of rows
    """
    df_iterator = (i[1].to_dict() for i in input_df.iterrows())
    len_iterator = len(input_df.index)
    log_msg = "Calling remote API endpoint with {} rows...".format(len_iterator)
    if api_support_batch:
        log_msg += ", chunked by {}".format(batch_size)
        df_iterator = chunked(df_iterator, batch_size)
        len_iterator = math.ceil(len_iterator / batch_size)
    logging.info(log_msg)
    api_column_names = build_unique_column_names(input_df.columns, column_prefix)
    pool_kwargs = api_call_function_kwargs.copy()
    more_kwargs = [
        "api_call_function",
        "error_handling",
        "api_exceptions",
        "api_column_names",
    ]
    for k in more_kwargs:
        pool_kwargs[k] = locals()[k]
    for k in ["fn", "row", "batch"]:  # Reserved pool keyword arguments
        pool_kwargs.pop(k, None)
    api_results = []
    with ThreadPoolExecutor(max_workers=parallel_workers) as pool:
        if api_support_batch:
            futures = [
                pool.submit(api_call_batch, batch=batch, **pool_kwargs)
                for batch in df_iterator
            ]
        else:
            futures = [
                pool.submit(api_call_single_row, row=row, **pool_kwargs)
                for row in df_iterator
            ]
        for f in tqdm_auto(as_completed(futures), total=len_iterator):
            api_results.append(f.result())
    if api_support_batch:
        api_results = flatten(api_results)
    output_df = convert_api_results_to_df(input_df, api_results, api_column_names, error_handling, verbose)
    num_api_error = sum(output_df[api_column_names.response] == "")
    num_api_success = len(input_df.index) - num_api_error
    logging.info(
        "Remote API call results: {} rows succeeded, {} rows failed.".format(
            num_api_success, num_api_error))
    return output_df

def tqdm(*args, disable=TQDM_DEFAULT_DISABLE, **kwargs):
    return tqdm_auto(*args, disable=disable, **kwargs)

def parallelizer(
    input_df: pd.DataFrame,
    function: Callable,
    exceptions: Union[Exception, Tuple[Exception]],
    column_prefix: AnyStr,
    parallel_workers: int = DEFAULT_PARALLEL_WORKERS,
    batch_support: bool = DEFAULT_BATCH_SUPPORT,
    batch_size: int = DEFAULT_BATCH_SIZE,
    error_handling: ErrorHandling = ErrorHandling.LOG,
    verbose: bool = DEFAULT_VERBOSE,
    **function_kwargs,
) -> pd.DataFrame:
    """Apply a function to a pandas.DataFrame with parallelization, batching, error handling and progress tracking

    The DataFrame is iterated on and passed to the function as dictionaries, row-by-row or by batches of rows.
    This iterative process is accelerated by the use of concurrent threads and is tracked with a progress bar.
    Errors are caught if they match the `exceptions` parameter and automatically logged.
    Once the whole DataFrame has been iterated on, results and errors are added as additional columns.

    Args:
        input_df: Input dataframe which will be iterated on
        function: Function taking a dict as input and returning a dict
            If `batch_support` is True, the function works on lists of dict
            For instance, a function to call an API or do some enrichment
        exceptions: Tuple of Exception classes to catch
        column_prefix: Column prefix to add to the output columns for the `function` responses and errors
        parallel_workers: Number of concurrent threads
        batch_support: If True, send batches of rows to the `function`
            Else (default) send rows as dict to the function
        batch_size: Number of rows to include in each batch
            Taken into account if `batch_support` is True
        error_handling: If ErrorHandling.LOG (default), log the error message as a warning
            and return the row with error keys.
            Else fail if there is any error.
        verbose: If True, log additional information on errors
            Else (default) log the error message and the error type
        **function_kwargs: Arbitrary keyword arguments passed to the `function`

    Returns:
        Input dataframe with additional columns:
        - response from the `function`
        - error message if any
        - error type if any

    """
    df_iterator = (index_series_pair[1].to_dict() for index_series_pair in input_df.iterrows())
    len_iterator = len(input_df.index)
    start = perf_counter()
    if batch_support:
        logging.info(
            f"Applying function {function.__name__} in parallel to {len_iterator} row(s)"
            + f" using batch size of {batch_size}...")
        df_iterator = chunked(df_iterator, batch_size)
        len_iterator = math.ceil(len_iterator / batch_size)
    else:
        logging.info(
            f"Applying function {function.__name__} in parallel to {len_iterator} row(s)...")
    column_names = build_unique_column_names(input_df.columns, column_prefix)
    pool_kwargs = {
        **{
            "function": function,
            "error_handling": error_handling,
            "exceptions": exceptions,
            "column_names": column_names,
        },
        **function_kwargs.copy(),
    }
    for kwarg in ["fn", "row", "batch"]:  # Reserved pool keyword arguments
        pool_kwargs.pop(kwarg, None)
    if not batch_support and "batch_response_parser" in pool_kwargs.keys():
        pool_kwargs.pop("batch_response_parser", None)
    results = []
    with ThreadPoolExecutor(max_workers=parallel_workers) as pool:
        if batch_support:
            futures = [
                pool.submit(apply_function_to_batch, batch=batch, **pool_kwargs)
                for batch in df_iterator
            ]
        else:
            futures = [
                pool.submit(apply_function_to_row, row=row, **pool_kwargs)
                for row in df_iterator
            ]
        for future in tqdm_auto(as_completed(futures), total=len_iterator):
            results.append(future.result())
    if batch_support:
        results = flatten(results)
    output_df = convert_results_to_df(input_df, results, column_names, error_handling, verbose)
    num_error = sum(output_df[column_names.response] == "")
    num_success = len(input_df.index) - num_error
    logging.info(
        f"Applying function in parallel: {num_success} row(s) succeeded, {num_error} failed "
        f"in {(perf_counter() - start):.2f} seconds.")
    return output_df

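# Usage sketch (hypothetical enrichment function and toy data): `parallelizer`
# needs a per-row callable, the exceptions to trap, and a column prefix. The exact
# shape of the per-row return value is dictated by `apply_function_to_row` and
# `convert_results_to_df`, which are assumed to come from the same module as the
# function above.
import pandas as pd

def lookup_length(row: dict) -> dict:
    # toy stand-in for an API call: return the length of the text field
    return {"length": len(row["text"])}

demo_df = pd.DataFrame({"text": ["alpha", "beta", "gamma"]})
enriched_df = parallelizer(
    input_df=demo_df,
    function=lookup_length,
    exceptions=(KeyError, ValueError),
    column_prefix="demo",
    parallel_workers=2,
)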