Example #1
def _read_dfs_from_multiple_paths(
    read_func: Callable[..., pd.DataFrame],
    paths: List[str],
    version_ids: Optional[Dict[str, str]],
    use_threads: Union[bool, int],
    kwargs: Dict[str, Any],
) -> List[pd.DataFrame]:
    cpus = ensure_cpu_count(use_threads)
    if cpus < 2:
        return [
            read_func(
                path,
                version_id=version_ids.get(path) if version_ids else None,
                **kwargs) for path in paths
        ]

    with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
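        # boto3.Session objects are not thread-safe, so the session in kwargs is
        # reduced to primitives before being shared with the worker threads.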
        kwargs["boto3_session"] = boto3_to_primitives(kwargs["boto3_session"])
        partial_read_func = partial(read_func, **kwargs)
        versions = [
            version_ids.get(p) if isinstance(version_ids, dict) else None
            for p in paths
        ]
        return list(df
                    for df in executor.map(partial_read_func, paths, versions))
Example #2
def _read_schemas_from_files(
    paths: List[str],
    sampling: float,
    use_threads: bool,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
) -> Tuple[Dict[str, str], ...]:
    paths = _utils.list_sampling(lst=paths, sampling=sampling)
    schemas: Tuple[Dict[str, str], ...] = tuple()
    n_paths: int = len(paths)
    if use_threads is False or n_paths == 1:
        schemas = tuple(
            _read_parquet_metadata_file(
                path=p,
                boto3_session=boto3_session,
                s3_additional_kwargs=s3_additional_kwargs,
                use_threads=use_threads) for p in paths)
    elif n_paths > 1:
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
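        # The session is handed to the workers as primitives; a single
        # boto3.Session should not be shared across threads.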
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=cpus) as executor:
            schemas = tuple(
                executor.map(
                    _read_parquet_metadata_file,
                    paths,
                    itertools.repeat(
                        _utils.boto3_to_primitives(
                            boto3_session=boto3_session)),  # Boto3.Session
                    itertools.repeat(s3_additional_kwargs),
                    itertools.repeat(use_threads),
                ))
    _logger.debug("schemas: %s", schemas)
    return schemas
Example #3
def _read_parquet(
    path: str,
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    safe: bool,
    boto3_session: boto3.Session,
    dataset: bool,
    path_root: Optional[str],
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: bool,
) -> pd.DataFrame:
    if use_threads is False:
        table: pa.Table = _read_parquet_file(
            path=path,
            columns=columns,
            categories=categories,
            boto3_session=boto3_session,
            s3_additional_kwargs=s3_additional_kwargs,
            use_threads=use_threads,
        )
    else:
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
        num_row_groups: int = _count_row_groups(
            path=path,
            categories=categories,
            boto3_session=boto3_session,
            s3_additional_kwargs=s3_additional_kwargs,
            use_threads=use_threads,
        )
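        # With threading enabled, each Parquet row group is read in its own thread
        # and the resulting Arrow tables are concatenated afterwards.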
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=cpus) as executor:
            tables: Tuple[pa.Table, ...] = tuple(
                executor.map(
                    _read_parquet_row_group,
                    range(num_row_groups),
                    itertools.repeat(path),
                    itertools.repeat(columns),
                    itertools.repeat(categories),
                    itertools.repeat(
                        _utils.boto3_to_primitives(
                            boto3_session=boto3_session)),
                    itertools.repeat(s3_additional_kwargs),
                    itertools.repeat(use_threads),
                ))
            table = pa.lib.concat_tables(tables, promote=False)
    _logger.debug("Converting PyArrow Table to Pandas DataFrame...")
    return _arrowtable2df(
        table=table,
        categories=categories,
        safe=safe,
        use_threads=use_threads,
        dataset=dataset,
        path=path,
        path_root=path_root,
    )
Example #4
def _read_dfs_from_multiple_paths(
    read_func: Callable[..., pd.DataFrame],
    paths: List[str],
    use_threads: Union[bool, int],
    kwargs: Dict[str, Any],
) -> List[pd.DataFrame]:
    cpus = ensure_cpu_count(use_threads)
    if cpus < 2:
        return [read_func(path, **kwargs) for path in paths]

    with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
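        # Swap the shared session in kwargs for its primitive components before
        # fanning the reads out across threads.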
        kwargs["boto3_session"] = boto3_to_primitives(kwargs["boto3_session"])
        partial_read_func = partial(read_func, **kwargs)
        return list(df for df in executor.map(partial_read_func, paths))
Example #5
 def upload(
     self,
     bucket: str,
     key: str,
     part: int,
     upload_id: str,
     data: bytes,
     boto3_session: boto3.Session,
     boto3_kwargs: Dict[str, Any],
 ) -> None:
     """Upload Part."""
     if self._exec is not None:
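         # Block until a worker thread is available before submitting another
         # part upload.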
         _utils.block_waiting_available_thread(seq=self._futures,
                                               max_workers=self._cpus)
         future = self._exec.submit(
             _UploadProxy._caller,
             bucket=bucket,
             key=key,
             part=part,
             upload_id=upload_id,
             data=data,
             boto3_primitives=_utils.boto3_to_primitives(
                 boto3_session=boto3_session),
             boto3_kwargs=boto3_kwargs,
         )
         self._futures.append(future)
     else:
         self._results.append(
             self._caller(
                 bucket=bucket,
                 key=key,
                 part=part,
                 upload_id=upload_id,
                 data=data,
                 boto3_primitives=_utils.boto3_to_primitives(
                     boto3_session=boto3_session),
                 boto3_kwargs=boto3_kwargs,
             ))
Example #6
 def write(self, func: Callable[..., List[str]], boto3_session: boto3.Session, **func_kwargs: Any) -> None:
     """Write File."""
     if self._exec is not None:
         _utils.block_waiting_available_thread(seq=self._futures, max_workers=self._cpus)
         _logger.debug("Submitting: %s", func)
         future = self._exec.submit(
             _WriteProxy._caller,
             func=func,
             boto3_primitives=_utils.boto3_to_primitives(boto3_session=boto3_session),
             func_kwargs=func_kwargs,
         )
         self._futures.append(future)
     else:
         self._results += func(boto3_session=boto3_session, **func_kwargs)
Example #7
 def write(self, func: Callable, boto3_session: boto3.Session,
           **func_kwargs) -> None:
     """Write File."""
     if self._exec is not None:
         _logger.debug("Submitting: %s", func)
         future = self._exec.submit(
             fn=_WriteProxy._caller,
             func=func,
             boto3_primitives=_utils.boto3_to_primitives(
                 boto3_session=boto3_session),
             func_kwargs=func_kwargs,
         )
         self._futures.append(future)
     else:
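          # No executor configured: run the write synchronously with the original
          # session and collect the result directly.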
         self._results.append(
             func(boto3_session=boto3_session, **func_kwargs))
Example #8
def _read_schemas_from_files(
    paths: List[str],
    sampling: float,
    use_threads: Union[bool, int],
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
    version_ids: Optional[Dict[str, str]] = None,
    ignore_null: bool = False,
    pyarrow_additional_kwargs: Optional[Dict[str, Any]] = None,
) -> Tuple[Dict[str, str], ...]:

    paths = _utils.list_sampling(lst=paths, sampling=sampling)
    schemas: Tuple[Optional[Dict[str, str]], ...] = tuple()
    n_paths: int = len(paths)
    cpus: int = _utils.ensure_cpu_count(use_threads)
    if cpus == 1 or n_paths == 1:
        schemas = tuple(
            _read_parquet_metadata_file(
                path=p,
                boto3_session=boto3_session,
                s3_additional_kwargs=s3_additional_kwargs,
                use_threads=use_threads,
                version_id=version_ids.get(p) if isinstance(version_ids, dict) else None,
                ignore_null=ignore_null,
                pyarrow_additional_kwargs=pyarrow_additional_kwargs,
            )
            for p in paths
        )
    elif n_paths > 1:
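        # Resolve the optional S3 version id for each path up front so the list
        # can be zipped with paths by executor.map below.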
        versions = [version_ids.get(p) if isinstance(version_ids, dict) else None for p in paths]
        with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
            schemas = tuple(
                executor.map(
                    _read_parquet_metadata_file,
                    paths,
                    itertools.repeat(_utils.boto3_to_primitives(boto3_session=boto3_session)),  # Boto3.Session
                    itertools.repeat(s3_additional_kwargs),
                    itertools.repeat(use_threads),
                    versions,
                    itertools.repeat(ignore_null),
                    itertools.repeat(pyarrow_additional_kwargs),
                )
            )
    schemas = cast(Tuple[Dict[str, str], ...], tuple(x for x in schemas if x is not None))
    _logger.debug("schemas: %s", schemas)
    return schemas
Example #9
def _wait_objects(
    waiter_name: str,
    paths: List[str],
    delay: Optional[Union[int, float]] = None,
    max_attempts: Optional[int] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    delay = 5 if delay is None else delay
    max_attempts = 20 if max_attempts is None else max_attempts
    _delay: int = int(delay) if isinstance(delay, float) else delay
    if len(paths) < 1:
        return None
    _paths: List[Tuple[str, str]] = [_utils.parse_path(path=p) for p in paths]
    if len(_paths) == 1:
        _wait_object(
            path=_paths[0],
            waiter_name=waiter_name,
            delay=_delay,
            max_attempts=max_attempts,
            boto3_session=boto3_session,
        )
    elif use_threads is False:
        for path in _paths:
            _wait_object(path=path,
                         waiter_name=waiter_name,
                         delay=_delay,
                         max_attempts=max_attempts,
                         boto3_session=boto3_session)
    else:
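        # Multiple paths with threading enabled: run one waiter per object,
        # sharing the session as primitives.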
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=cpus) as executor:
            list(
                executor.map(
                    _wait_object_concurrent,
                    _paths,
                    itertools.repeat(waiter_name),
                    itertools.repeat(_delay),
                    itertools.repeat(max_attempts),
                    itertools.repeat(
                        _utils.boto3_to_primitives(
                            boto3_session=boto3_session)),
                ))
    return None
Example #10
def _read_concurrent(
    func: Callable[..., pd.DataFrame],
    paths: List[str],
    ignore_index: Optional[bool],
    boto3_session: boto3.Session,
    **func_kwargs: Any,
) -> pd.DataFrame:
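    # Read each path in its own thread and union the resulting DataFrames.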
    cpus: int = _utils.ensure_cpu_count(use_threads=True)
    with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
        return _union(
            dfs=list(
                executor.map(
                    _caller,
                    paths,
                    itertools.repeat(func),
                    itertools.repeat(_utils.boto3_to_primitives(boto3_session=boto3_session)),
                    itertools.repeat(func_kwargs),
                )
            ),
            ignore_index=ignore_index,
        )
Example #11
 def _fetch_range_proxy(self, start: int, end: int) -> bytes:
     _logger.debug("Fetching: s3://%s/%s - Range: %s-%s", self._bucket,
                   self._key, start, end)
     boto3_primitives: _utils.Boto3PrimitivesType = _utils.boto3_to_primitives(
         boto3_session=self._boto3_session)
     boto3_kwargs: Dict[str, Any] = get_botocore_valid_kwargs(
         function_name="get_object",
         s3_additional_kwargs=self._s3_additional_kwargs)
     cpus: int = _utils.ensure_cpu_count(use_threads=self._use_threads)
     range_size: int = end - start
     if cpus < 2 or range_size < (2 * _MIN_PARALLEL_READ_BLOCK):
         return _fetch_range(
             range_values=(start, end),
             bucket=self._bucket,
             key=self._key,
             boto3_primitives=boto3_primitives,
             boto3_kwargs=boto3_kwargs,
         )[1]
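      # Split the requested byte range into evenly sized chunks so they can be
      # fetched in parallel below.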
     sizes: Tuple[int, ...] = _utils.get_even_chunks_sizes(
         total_size=range_size,
         chunk_size=_MIN_PARALLEL_READ_BLOCK,
         upper_bound=False)
     ranges: List[Tuple[int, int]] = []
     chunk_start: int = start
     for size in sizes:
         ranges.append((chunk_start, chunk_start + size))
         chunk_start += size
     with concurrent.futures.ThreadPoolExecutor(
             max_workers=cpus) as executor:
         return self._merge_range(ranges=list(
             executor.map(
                 _fetch_range,
                 ranges,
                 itertools.repeat(self._bucket),
                 itertools.repeat(self._key),
                 itertools.repeat(boto3_primitives),
                 itertools.repeat(boto3_kwargs),
             )), )
Example #12
def write(
    df: pd.DataFrame,
    database: str,
    table: str,
    time_col: str,
    measure_col: str,
    dimensions_cols: List[str],
    num_threads: int = 32,
    boto3_session: Optional[boto3.Session] = None,
) -> List[Dict[str, str]]:
    """Store a Pandas DataFrame into a Amazon Timestream table.

    Parameters
    ----------
    df: pandas.DataFrame
        Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
    database : str
        Amazon Timestream database name.
    table : str
        Amazon Timestream table name.
    time_col : str
        DataFrame column name to be used as time. MUST be a timestamp column.
    measure_col : str
        DataFrame column name to be used as measure.
    dimensions_cols : List[str]
        List of DataFrame column names to be used as dimensions.
    num_threads : int
        Number of threads to be used for concurrent writing.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 Session will be used if boto3_session receives None.

    Returns
    -------
    List[Dict[str, str]]
        Rejected records.

    Examples
    --------
    Store a Pandas DataFrame into an Amazon Timestream table.

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> from datetime import datetime
    >>> df = pd.DataFrame(
    ...     {
    ...         "time": [datetime.now(), datetime.now(), datetime.now()],
    ...         "dim0": ["foo", "boo", "bar"],
    ...         "dim1": [1, 2, 3],
    ...         "measure": [1.0, 1.1, 1.2],
    ...     }
    ... )
    >>> rejected_records = wr.timestream.write(
    ...     df=df,
    ...     database="sampleDB",
    ...     table="sampleTable",
    ...     time_col="time",
    ...     measure_col="measure",
    ...     dimensions_cols=["dim0", "dim1"],
    ... )
    >>> assert len(rejected_records) == 0

    """
    measure_type: str = _data_types.timestream_type_from_pandas(
        df[[measure_col]])
    _logger.debug("measure_type: %s", measure_type)
    cols_names: List[str] = [time_col, measure_col] + dimensions_cols
    _logger.debug("cols_names: %s", cols_names)
    batches: List[List[Any]] = _utils.chunkify(lst=_df2list(df=df[cols_names]),
                                               max_length=100)
    _logger.debug("len(batches): %s", len(batches))
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=num_threads) as executor:
        res: List[List[Any]] = list(
            executor.map(
                _write_batch,
                itertools.repeat(database),
                itertools.repeat(table),
                itertools.repeat(cols_names),
                itertools.repeat(measure_type),
                batches,
                itertools.repeat(
                    _utils.boto3_to_primitives(boto3_session=boto3_session)),
            ))
        return [item for sublist in res for item in sublist]
Example #13
def describe_objects(
    path: Union[str, List[str]],
    use_threads: bool = True,
    last_modified_begin: Optional[datetime.datetime] = None,
    last_modified_end: Optional[datetime.datetime] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> Dict[str, Dict[str, Any]]:
    """Describe Amazon S3 objects from a received S3 prefix or list of S3 objects paths.

    Fetch attributes like ContentLength, DeleteMarker, LastModified, ContentType, etc.
    The full list of attributes can be explored under the boto3 head_object documentation:
    https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.head_object

    This function accepts Unix shell-style wildcards in the path argument.
    * (matches everything), ? (matches any single character),
    [seq] (matches any character in seq), [!seq] (matches any character not in seq).

    Note
    ----
    If `use_threads=True`, the number of threads that will be spawned
    will be obtained from os.cpu_count().

    Note
    ----
    The filter by last_modified_begin and last_modified_end is applied only after listing all S3 files.

    Parameters
    ----------
    path : Union[str, List[str]]
        S3 prefix (accepts Unix shell-style wildcards)
        (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]).
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled, os.cpu_count() will be used as the max number of threads.
    last_modified_begin : datetime, optional
        Filter the S3 files by the last modified date of the object.
        The filter is applied only after listing all S3 files.
    last_modified_end : datetime, optional
        Filter the S3 files by the last modified date of the object.
        The filter is applied only after listing all S3 files.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Dict[str, Dict[str, Any]]
        Return a dictionary of objects returned from head_object where the key is the object path.
        The response object can be explored here:
        https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.head_object

    Examples
    --------
    >>> import awswrangler as wr
    >>> descs0 = wr.s3.describe_objects(['s3://bucket/key0', 's3://bucket/key1'])  # Describe both objects
    >>> descs1 = wr.s3.describe_objects('s3://bucket/prefix')  # Describe all objects under the prefix

    """
    paths: List[str] = _path2list(
        path=path,
        boto3_session=boto3_session,
        last_modified_begin=last_modified_begin,
        last_modified_end=last_modified_end,
    )
    if len(paths) < 1:
        return {}
    resp_list: List[Tuple[str, Dict[str, Any]]]
    if len(paths) == 1:
        resp_list = [
            _describe_object(path=paths[0], boto3_session=boto3_session)
        ]
    elif use_threads is False:
        resp_list = [
            _describe_object(path=p, boto3_session=boto3_session)
            for p in paths
        ]
    else:
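        # Multiple paths with threading enabled: issue the head_object calls
        # concurrently.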
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=cpus) as executor:
            resp_list = list(
                executor.map(
                    _describe_object_concurrent,
                    paths,
                    itertools.repeat(
                        _utils.boto3_to_primitives(
                            boto3_session=boto3_session)),
                ))
    desc_dict: Dict[str, Dict[str, Any]] = dict(resp_list)
    return desc_dict
Example #14
def delete_objects(
    path: Union[str, List[str]],
    use_threads: bool = True,
    last_modified_begin: Optional[datetime.datetime] = None,
    last_modified_end: Optional[datetime.datetime] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Delete Amazon S3 objects from a received S3 prefix or list of S3 objects paths.

    This function accepts Unix shell-style wildcards in the path argument.
    * (matches everything), ? (matches any single character),
    [seq] (matches any character in seq), [!seq] (matches any character not in seq).

    Note
    ----
    If `use_threads=True`, the number of threads that will be spawned
    will be obtained from os.cpu_count().

    Note
    ----
    The filter by last_modified_begin and last_modified_end is applied only after listing all S3 files.

    Parameters
    ----------
    path : Union[str, List[str]]
        S3 prefix (accepts Unix shell-style wildcards)
        (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]).
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled, os.cpu_count() will be used as the max number of threads.
    last_modified_begin : datetime, optional
        Filter the S3 files by the last modified date of the object.
        The filter is applied only after listing all S3 files.
    last_modified_end : datetime, optional
        Filter the S3 files by the last modified date of the object.
        The filter is applied only after listing all S3 files.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.s3.delete_objects(['s3://bucket/key0', 's3://bucket/key1'])  # Delete both objects
    >>> wr.s3.delete_objects('s3://bucket/prefix')  # Delete all objects under the received prefix

    """
    paths: List[str] = _path2list(
        path=path,
        boto3_session=boto3_session,
        last_modified_begin=last_modified_begin,
        last_modified_end=last_modified_end,
    )
    if len(paths) < 1:
        return
    buckets: Dict[str, List[str]] = _split_paths_by_bucket(paths=paths)
    for bucket, keys in buckets.items():
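        # The S3 DeleteObjects API accepts at most 1,000 keys per request, hence
        # the chunking.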
        chunks: List[List[str]] = _utils.chunkify(lst=keys, max_length=1_000)
        if len(chunks) == 1:
            _delete_objects(bucket=bucket,
                            keys=chunks[0],
                            boto3_session=boto3_session)
        elif use_threads is False:
            for chunk in chunks:
                _delete_objects(bucket=bucket,
                                keys=chunk,
                                boto3_session=boto3_session)
        else:
            cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
            with concurrent.futures.ThreadPoolExecutor(
                    max_workers=cpus) as executor:
                list(
                    executor.map(
                        _delete_objects_concurrent,
                        itertools.repeat(bucket),
                        chunks,
                        itertools.repeat(
                            _utils.boto3_to_primitives(
                                boto3_session=boto3_session)),
                    ))
Example #15
def _read_text(
    parser_func: Callable,
    path: Union[str, List[str]],
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, str]] = None,
    chunksize: Optional[int] = None,
    dataset: bool = False,
    **pandas_kwargs,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    if "iterator" in pandas_kwargs:
        raise exceptions.InvalidArgument(
            "Please, use chunksize instead of iterator.")
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if (dataset is True) and (not isinstance(path, str)):  # pragma: no cover
        raise exceptions.InvalidArgument(
            "The path argument must be a string Amazon S3 prefix if dataset=True."
        )
    if dataset is True:
        path_root: str = str(path)
    else:
        path_root = ""
    paths: List[str] = path2list(path=path, boto3_session=session)
    _logger.debug("paths:\n%s", paths)
    if chunksize is not None:
        dfs: Iterator[pd.DataFrame] = _read_text_chunksize(
            parser_func=parser_func,
            paths=paths,
            boto3_session=session,
            chunksize=chunksize,
            pandas_args=pandas_kwargs,
            s3_additional_kwargs=s3_additional_kwargs,
            dataset=dataset,
            path_root=path_root,
        )
        return dfs
    if use_threads is False:
        df: pd.DataFrame = pd.concat(
            objs=[
                _read_text_full(
                    parser_func=parser_func,
                    path=p,
                    boto3_session=session,
                    pandas_args=pandas_kwargs,
                    s3_additional_kwargs=s3_additional_kwargs,
                    dataset=dataset,
                    path_root=path_root,
                ) for p in paths
            ],
            ignore_index=True,
            sort=False,
        )
    else:
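        # Threaded path: read every file concurrently and concatenate the results.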
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=cpus) as executor:
            df = pd.concat(
                objs=executor.map(
                    _read_text_full,
                    itertools.repeat(parser_func),
                    itertools.repeat(path_root),
                    paths,
                    itertools.repeat(
                        _utils.boto3_to_primitives(
                            boto3_session=session)),  # Boto3.Session
                    itertools.repeat(pandas_kwargs),
                    itertools.repeat(s3_additional_kwargs),
                    itertools.repeat(dataset),
                ),
                ignore_index=True,
                sort=False,
            )
    return df