def __init__(
    self,
    input_df: pd.DataFrame,
    column_prefix: AnyStr = "api",
    error_handling: ErrorHandlingEnum = ErrorHandlingEnum.LOG,
):
    self.input_df = input_df
    self.column_prefix = column_prefix
    self.error_handling = error_handling
    self.api_column_names = build_unique_column_names(input_df, column_prefix)
    self.column_description_dict = {
        v: API_COLUMN_NAMES_DESCRIPTION_DICT[k] for k, v in self.api_column_names._asdict().items()
    }
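# The helpers used above are not shown in these snippets. Below is a minimal
# sketch of what `build_unique_column_names` and `API_COLUMN_NAMES_DESCRIPTION_DICT`
# might look like, assuming the function returns a namedtuple of output column
# names (response, error message, error type) deduplicated against the existing
# columns; the field names and description strings are assumptions, except for
# `response`, which the parallelizer code references explicitly.
from collections import namedtuple
from typing import AnyStr

ApiColumnNameTuple = namedtuple("ApiColumnNameTuple", ["response", "error_message", "error_type"])

API_COLUMN_NAMES_DESCRIPTION_DICT = {
    "response": "Raw response from the API",
    "error_message": "Error message, if any",
    "error_type": "Error type, if any",
}


def build_unique_column_names(existing_names, column_prefix: AnyStr) -> ApiColumnNameTuple:
    """Hypothetical helper: prefix each output column and avoid clashes with existing columns."""
    unique_names = []
    for field in ApiColumnNameTuple._fields:
        candidate = "{}_{}".format(column_prefix, field)
        while candidate in existing_names:  # works for a DataFrame, .keys() or .columns
            candidate += "_"
        unique_names.append(candidate)
    return ApiColumnNameTuple(*unique_names)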
def __init__(
    self,
    input_df: pd.DataFrame,
    input_folder: dataiku.Folder = None,
    column_prefix: AnyStr = "api",
    error_handling: ErrorHandling = ErrorHandling.LOG,
    parallel_workers: int = DEFAULT_PARALLEL_WORKERS,
    **kwargs,
):
    store_attr()
    self.output_df = None  # initialization before calling format_df
    self.api_column_names = build_unique_column_names(input_df.keys(), column_prefix)
    self.column_description_dict = {
        column_name: API_COLUMN_NAMES_DESCRIPTION_DICT[key]
        for key, column_name in self.api_column_names._asdict().items()
    }
    self.column_description_dict[PATH_COLUMN] = "Path of the file relative to the input folder"
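# `store_attr()` in the __init__ above comes from the fastcore library and
# assigns every constructor argument onto `self` automatically. `PATH_COLUMN`
# is a module-level constant that is not shown in these snippets; its value
# below is an assumption:
from fastcore.basics import store_attr  # assigns constructor args to self

PATH_COLUMN = "path"  # assumed name of the column holding the file path in the input folder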
def __init__(
    self,
    input_df: pd.DataFrame,
    input_folder: dataiku.Folder = None,
    column_prefix: AnyStr = "api",
    error_handling: ErrorHandlingEnum = ErrorHandlingEnum.LOG,
    parallel_workers: int = DEFAULT_PARALLEL_WORKERS,
):
    self.input_df = input_df
    self.input_folder = input_folder
    self.output_df = None  # initialization before calling format_df
    self.column_prefix = column_prefix
    self.error_handling = error_handling
    self.parallel_workers = parallel_workers
    self.api_column_names = build_unique_column_names(input_df.keys(), column_prefix)
    self.column_description_dict = {
        v: API_COLUMN_NAMES_DESCRIPTION_DICT[k]
        for k, v in self.api_column_names._asdict().items()
    }
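# `ErrorHandling` (called `ErrorHandlingEnum` in the older snippets above) and
# the DEFAULT_* constants are defined elsewhere in the plugin library and are
# not shown here. Below is a minimal sketch consistent with how they are used;
# the member names and default values are assumptions:
from enum import Enum


class ErrorHandling(Enum):
    LOG = "log"    # log the error and keep the row, filling the error columns
    FAIL = "fail"  # re-raise the exception and stop the whole job


ErrorHandlingEnum = ErrorHandling  # older snippets use this name

DEFAULT_PARALLEL_WORKERS = 4       # assumed default
DEFAULT_BATCH_SIZE = 20            # assumed default
DEFAULT_BATCH_SUPPORT = False      # assumed default
DEFAULT_API_SUPPORT_BATCH = False  # assumed default (name used by api_parallelizer)
DEFAULT_VERBOSE = False            # assumed default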
def api_parallelizer(input_df: pd.DataFrame,
                     api_call_function: Callable,
                     api_exceptions: Union[Exception, Tuple[Exception]],
                     column_prefix: AnyStr,
                     parallel_workers: int = DEFAULT_PARALLEL_WORKERS,
                     api_support_batch: bool = DEFAULT_API_SUPPORT_BATCH,
                     batch_size: int = DEFAULT_BATCH_SIZE,
                     error_handling: ErrorHandlingEnum = ErrorHandlingEnum.LOG,
                     verbose: bool = DEFAULT_VERBOSE,
                     **api_call_function_kwargs) -> pd.DataFrame:
    """
    Apply an API call function in parallel to a pandas.DataFrame.
    The DataFrame is passed to the function as row dictionaries.
    Parallelism works by:
    - (default) sending individual rows to multiple concurrent threads
    - if the API supports it, sending batches of rows
    """
    df_iterator = (i[1].to_dict() for i in input_df.iterrows())
    len_iterator = len(input_df.index)
    log_msg = "Calling remote API endpoint with {} rows...".format(
        len_iterator)
    if api_support_batch:
        log_msg += ", chunked by {}".format(batch_size)
        df_iterator = chunked(df_iterator, batch_size)
        len_iterator = math.ceil(len_iterator / batch_size)
    logging.info(log_msg)
    api_column_names = build_unique_column_names(input_df.columns,
                                                 column_prefix)
    pool_kwargs = api_call_function_kwargs.copy()
    more_kwargs = [
        "api_call_function",
        "error_handling",
        "api_exceptions",
        "api_column_names",
    ]
    for k in more_kwargs:
        pool_kwargs[k] = locals()[k]
    for k in ["fn", "row", "batch"]:  # Reserved pool keyword arguments
        pool_kwargs.pop(k, None)
    api_results = []
    with ThreadPoolExecutor(max_workers=parallel_workers) as pool:
        if api_support_batch:
            futures = [
                pool.submit(api_call_batch, batch=batch, **pool_kwargs)
                for batch in df_iterator
            ]
        else:
            futures = [
                pool.submit(api_call_single_row, row=row, **pool_kwargs)
                for row in df_iterator
            ]
        for f in tqdm_auto(as_completed(futures), total=len_iterator):
            api_results.append(f.result())
    if api_support_batch:
        api_results = flatten(api_results)
    output_df = convert_api_results_to_df(input_df, api_results,
                                          api_column_names, error_handling,
                                          verbose)
    num_api_error = sum(output_df[api_column_names.response] == "")
    num_api_success = len(input_df.index) - num_api_error
    logging.info(
        "Remote API call results: {} rows succeeded, {} rows failed.".format(
            num_api_success, num_api_error))
    return output_df
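# Hypothetical usage of `api_parallelizer`. The per-row helper
# `api_call_single_row` is not shown above, so the exact contract of
# `api_call_function` (receiving the row dict as a `row` keyword argument and
# returning the raw response) is an assumption, as are the endpoint and the
# choice of `requests` exceptions:
import pandas as pd
import requests

session = requests.Session()
input_df = pd.DataFrame({"text": ["hello", "world"]})


def call_my_api(row: dict, **kwargs) -> str:
    """Toy per-row function: return the raw response body as a string."""
    response = session.get("https://httpbin.org/get", params={"text": row["text"]})
    response.raise_for_status()
    return response.text


output_df = api_parallelizer(
    input_df=input_df,
    api_call_function=call_my_api,
    api_exceptions=(requests.exceptions.RequestException,),
    column_prefix="api",
    error_handling=ErrorHandlingEnum.LOG,
)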
def parallelizer(
    input_df: pd.DataFrame,
    function: Callable,
    exceptions: Union[Exception, Tuple[Exception]],
    column_prefix: AnyStr,
    parallel_workers: int = DEFAULT_PARALLEL_WORKERS,
    batch_support: bool = DEFAULT_BATCH_SUPPORT,
    batch_size: int = DEFAULT_BATCH_SIZE,
    error_handling: ErrorHandling = ErrorHandling.LOG,
    verbose: bool = DEFAULT_VERBOSE,
    **function_kwargs,
) -> pd.DataFrame:
    """Apply a function to a pandas.DataFrame with parallelization, batching, error handling and progress tracking

    The DataFrame is iterated on and passed to the function as dictionaries, row-by-row or by batches of rows.
    This iterative process is accelerated by the use of concurrent threads and is tracked with a progress bar.
    Errors are caught if they match the `exceptions` parameter and automatically logged.
    Once the whole DataFrame has been iterated on, results and errors are added as additional columns.

    Args:
        input_df: Input dataframe which will be iterated on
        function: Function taking a dict as input and returning a dict
            If `batch_support` is True, the function works on a list of dicts
            For instance, a function to call an API or do some enrichment
        exceptions: Tuple of Exception classes to catch
        column_prefix: Column prefix to add to the output columns for the `function` responses and errors
        parallel_workers: Number of concurrent threads
        batch_support: If True, send batches of rows to the `function`
            Else (default) send rows as dicts to the `function`
        batch_size: Number of rows to include in each batch
            Taken into account if `batch_support` is True
        error_handling: If ErrorHandling.LOG (default), log the error message as a warning
            and return the row with error keys.
            Else fail if there is any error.
        verbose: If True, log additional information on errors
            Else (default) log the error message and the error type
        **function_kwargs: Arbitrary keyword arguments passed to the `function`

    Returns:
        Input dataframe with additional columns:
        - response from the `function`
        - error message if any
        - error type if any

    """
    df_iterator = (index_series_pair[1].to_dict()
                   for index_series_pair in input_df.iterrows())
    len_iterator = len(input_df.index)
    start = perf_counter()
    if batch_support:
        logging.info(
            f"Applying function {function.__name__} in parallel to {len_iterator} row(s)"
            + f" using batch size of {batch_size}...")
        df_iterator = chunked(df_iterator, batch_size)
        len_iterator = math.ceil(len_iterator / batch_size)
    else:
        logging.info(
            f"Applying function {function.__name__} in parallel to {len_iterator} row(s)..."
        )
    column_names = build_unique_column_names(input_df.columns, column_prefix)
    pool_kwargs = {
        **{
            "function": function,
            "error_handling": error_handling,
            "exceptions": exceptions,
            "column_names": column_names,
        },
        **function_kwargs.copy(),
    }
    for kwarg in ["fn", "row", "batch"]:  # Reserved pool keyword arguments
        pool_kwargs.pop(kwarg, None)
    if not batch_support and "batch_response_parser" in pool_kwargs.keys():
        pool_kwargs.pop("batch_response_parser", None)
    results = []
    with ThreadPoolExecutor(max_workers=parallel_workers) as pool:
        if batch_support:
            futures = [
                pool.submit(apply_function_to_batch,
                            batch=batch,
                            **pool_kwargs) for batch in df_iterator
            ]
        else:
            futures = [
                pool.submit(apply_function_to_row, row=row, **pool_kwargs)
                for row in df_iterator
            ]
        for future in tqdm_auto(as_completed(futures), total=len_iterator):
            results.append(future.result())
    if batch_support:
        results = flatten(results)
    output_df = convert_results_to_df(input_df, results, column_names,
                                      error_handling, verbose)
    num_error = sum(output_df[column_names.response] == "")
    num_success = len(input_df.index) - num_error
    logging.info((
        f"Applying function in parallel: {num_success} row(s) succeeded, {num_error} failed "
        f"in {(perf_counter() - start):.2f} seconds."))
    return output_df
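# Hypothetical usage of `parallelizer` with a purely local "enrichment" function
# (no real API). As above, the exact way `apply_function_to_row` forwards the
# row dict and `**function_kwargs` to `function` is an assumption:
import json

import pandas as pd

input_df = pd.DataFrame({"text": ["hello", "wide world", ""]})


def count_characters(row: dict, fail_on_empty: bool = False, **kwargs) -> str:
    """Toy function: return a JSON string so it fits into the single response column."""
    if fail_on_empty and not row["text"]:
        raise ValueError("Empty text")
    return json.dumps({"length": len(row["text"])})


output_df = parallelizer(
    input_df=input_df,
    function=count_characters,
    exceptions=(ValueError,),
    column_prefix="char_count",
    error_handling=ErrorHandling.LOG,
    fail_on_empty=True,  # forwarded to `count_characters` via **function_kwargs
)
# Expected additional columns (exact names come from build_unique_column_names):
# char_count_response, char_count_error_message, char_count_error_type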