def _paginate_stream(
    args: Dict[str, Any], path: str, use_threads: Union[bool, int], boto3_session: Optional[boto3.Session]
) -> pd.DataFrame:
    obj_size: int = size_objects(  # type: ignore
        path=[path],
        use_threads=False,
        boto3_session=boto3_session,
    ).get(path)
    if obj_size is None:
        raise exceptions.InvalidArgumentValue(f"S3 object w/o defined size: {path}")
    scan_ranges = _gen_scan_range(obj_size=obj_size)

    if use_threads is False:
        stream_records = list(
            _select_object_content(
                args=args,
                boto3_session=boto3_session,
                scan_range=scan_range,
            )
            for scan_range in scan_ranges
        )
    else:
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
        with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
            stream_records = list(
                executor.map(
                    _select_object_content,
                    itertools.repeat(args),
                    itertools.repeat(boto3_session),
                    scan_ranges,
                )
            )
    return pd.DataFrame([item for sublist in stream_records for item in sublist])  # Flatten list of lists
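
# Usage sketch (hedged): `_paginate_stream` is an internal helper, so the exact
# shape of `args` is set by its public caller. The values below are hypothetical
# and simply mirror boto3's `select_object_content` keyword arguments.
#
#   args = {
#       "Bucket": "my-bucket",                      # hypothetical bucket/key
#       "Key": "data/file.csv",
#       "Expression": "SELECT * FROM s3object",
#       "ExpressionType": "SQL",
#       "InputSerialization": {"CSV": {"FileHeaderInfo": "Use"}},
#       "OutputSerialization": {"JSON": {}},
#   }
#   df = _paginate_stream(
#       args=args,
#       path="s3://my-bucket/data/file.csv",
#       use_threads=True,       # one worker thread per CPU
#       boto3_session=None,     # fall back to the default boto3 session
#   )
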
def _paginate_stream(
    args: Dict[str, Any], path: str, use_threads: Union[bool, int], boto3_session: Optional[boto3.Session]
) -> pd.DataFrame:
    obj_size: int = size_objects(  # type: ignore
        path=[path],
        use_threads=False,
        boto3_session=boto3_session,
    ).get(path)
    if obj_size is None:
        raise exceptions.InvalidArgumentValue(f"S3 object w/o defined size: {path}")
    dfs: List[pd.DataFrame] = []
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)

    if use_threads is False:
        # Sequential path: one DataFrame per scan range.
        dfs = list(
            _select_object_content(
                args=args,
                client_s3=client_s3,
                scan_range=scan_range,
            )
            for scan_range in _gen_scan_range(obj_size=obj_size)
        )
    else:
        # Parallel path: fan the scan ranges out across one worker per CPU.
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
        with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
            dfs = list(
                executor.map(
                    _select_object_content,
                    itertools.repeat(args),
                    itertools.repeat(client_s3),
                    _gen_scan_range(obj_size=obj_size),
                )
            )
    return pd.concat(dfs, ignore_index=True)
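
# Note on the fan-out above (illustrative sketch, not part of the library): in
# `executor.map` the iterables are zipped together, so `itertools.repeat(...)`
# pins the fixed arguments while only the scan range varies per task.
#
#   import concurrent.futures
#   import itertools
#
#   def work(config: str, client: str, chunk: int) -> str:
#       return f"{config}:{client}:{chunk}"
#
#   with concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
#       out = list(pool.map(work, itertools.repeat("cfg"), itertools.repeat("s3"), range(3)))
#   # out == ["cfg:s3:0", "cfg:s3:1", "cfg:s3:2"]
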
def __init__(
    self,
    path: str,
    s3_block_size: int,
    mode: str,
    use_threads: Union[bool, int],
    s3_additional_kwargs: Optional[Dict[str, str]],
    boto3_session: Optional[boto3.Session],
    newline: Optional[str],
    encoding: Optional[str],
) -> None:
    super().__init__()
    self._use_threads = use_threads
    self._newline: str = "\n" if newline is None else newline
    self._encoding: str = "utf-8" if encoding is None else encoding
    self._bucket, self._key = _utils.parse_path(path=path)
    self._boto3_session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if mode not in {"rb", "wb", "r", "w"}:
        raise NotImplementedError("File mode must be {'rb', 'wb', 'r', 'w'}, not %s" % mode)
    self._mode: str = "rb" if mode is None else mode
    self._one_shot_download: bool = False
    if 0 < s3_block_size < 3:
        raise exceptions.InvalidArgumentValue(
            "s3_block_size MUST be > 2 to define a valid size or "
            "< 1 to avoid blocks and always execute one-shot downloads."
        )
    if s3_block_size <= 0:
        _logger.debug("s3_block_size of %d, enabling one_shot_download.", s3_block_size)
        self._one_shot_download = True
    self._s3_block_size: int = s3_block_size
    self._s3_half_block_size: int = s3_block_size // 2
    self._s3_additional_kwargs: Dict[str, str] = {} if s3_additional_kwargs is None else s3_additional_kwargs
    self._client: boto3.client = _utils.client(service_name="s3", session=self._boto3_session)
    self._loc: int = 0

    if self.readable() is True:
        # Read mode: cache window state plus the object size (required for ranged GETs).
        self._cache: bytes = b""
        self._start: int = 0
        self._end: int = 0
        size: Optional[int] = size_objects(
            path=[path],
            use_threads=False,
            boto3_session=self._boto3_session,
            s3_additional_kwargs=self._s3_additional_kwargs,
        )[path]
        if size is None:
            raise exceptions.InvalidArgumentValue(f"S3 object w/o defined size: {path}")
        self._size: int = size
        _logger.debug("self._size: %s", self._size)
        _logger.debug("self._s3_block_size: %s", self._s3_block_size)
    elif self.writable() is True:
        # Write mode: buffer locally and upload through multipart upload (MPU) parts.
        self._mpu: Dict[str, Any] = {}
        self._buffer: io.BytesIO = io.BytesIO()
        self._parts_count: int = 0
        self._size = 0
        self._upload_proxy: _UploadProxy = _UploadProxy(use_threads=self._use_threads)
    else:
        raise RuntimeError(f"Invalid mode: {self._mode}")
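
# s3_block_size semantics (summary of the validation above; the consumer of
# `_s3_half_block_size` is not shown in this excerpt, so that part is a hedged
# assumption):
#
#   s3_block_size <= 0      -> no block cache; the whole object is fetched in a
#                              single one-shot download
#   s3_block_size in (1, 2) -> exceptions.InvalidArgumentValue (a block must be
#                              larger than 2 bytes)
#   s3_block_size >= 3      -> ranged GETs cached in blocks of `s3_block_size`
#                              bytes, with `_s3_half_block_size` presumably used
#                              to position the cached block around `self._loc`
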