Example No. 1
def _read_text_file(
    path: str,
    parser_func: Callable[..., pd.DataFrame],
    path_root: Optional[str],
    boto3_session: Union[boto3.Session, Dict[str, Optional[str]]],
    pandas_kwargs: Dict[str, Any],
    s3_additional_kwargs: Optional[Dict[str, str]],
    dataset: bool,
) -> pd.DataFrame:
    fs: s3fs.S3FileSystem = _utils.get_fs(
        s3fs_block_size=134_217_728,  # 128 MB (128 * 2**20)
        session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    mode, encoding, newline = _get_read_details(path=path,
                                                pandas_kwargs=pandas_kwargs)
    with _utils.open_file(fs=fs,
                          path=path,
                          mode=mode,
                          encoding=encoding,
                          newline=newline) as f:
        df: pd.DataFrame = parser_func(f, **pandas_kwargs)
    return _apply_partitions(df=df,
                             dataset=dataset,
                             path=path,
                             path_root=path_root)
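
A minimal usage sketch (not part of the source listing), assuming the surrounding module's imports (pandas as pd, boto3) and a placeholder CSV object; any pandas text parser that accepts a file-like first argument could serve as parser_func:

df = _read_text_file(
    path="s3://my-bucket/data.csv",   # hypothetical object
    parser_func=pd.read_csv,          # e.g. pd.read_csv or pd.read_json
    path_root=None,
    boto3_session=boto3.Session(),
    pandas_kwargs={"sep": ","},
    s3_additional_kwargs=None,
    dataset=False,
)
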
Example No. 2
def _read_parquet_row_group(
    row_group: int,
    path: str,
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    boto3_primitives: _utils.Boto3PrimitivesType,
    s3_additional_kwargs: Optional[Dict[str, str]],
) -> pa.Table:
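    # Rebuild a boto3.Session from plain, picklable primitives; presumably so this
    # per-row-group reader can be dispatched to a worker pool (assumption, not
    # stated in this listing).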
    boto3_session: boto3.Session = _utils.boto3_from_primitives(
        primitives=boto3_primitives)
    fs: s3fs.S3FileSystem = _utils.get_fs(
        s3fs_block_size=134_217_728,  # 128 MB (128 * 2**20)
        session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    with _utils.open_file(fs=fs, path=path, mode="rb") as f:
        pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(
            source=f, read_dictionary=categories)
        num_row_groups: int = pq_file.num_row_groups
        _logger.debug("Reading Row Group %s/%s [multi-threaded]",
                      row_group + 1, num_row_groups)
        return pq_file.read_row_group(i=row_group,
                                      columns=columns,
                                      use_threads=False,
                                      use_pandas_metadata=False)
Example No. 3
def _read_text_chunked(
    paths: List[str],
    chunksize: int,
    parser_func: Callable[..., pd.DataFrame],
    path_root: Optional[str],
    boto3_session: boto3.Session,
    pandas_kwargs: Dict[str, Any],
    s3_additional_kwargs: Optional[Dict[str, str]],
    dataset: bool,
) -> Iterator[pd.DataFrame]:
    for path in paths:
        _logger.debug("path: %s", path)
        fs: s3fs.S3FileSystem = _utils.get_fs(
            s3fs_block_size=8_388_608,  # 8 MB (8 * 2**20)
            session=boto3_session,
            s3_additional_kwargs=s3_additional_kwargs,
        )
        mode, encoding, newline = _get_read_details(
            path=path, pandas_kwargs=pandas_kwargs)
        with _utils.open_file(fs=fs,
                              path=path,
                              mode=mode,
                              encoding=encoding,
                              newline=newline) as f:
            reader: pandas.io.parsers.TextFileReader = parser_func(
                f, chunksize=chunksize, **pandas_kwargs)
            for df in reader:
                yield _apply_partitions(df=df,
                                        dataset=dataset,
                                        path=path,
                                        path_root=path_root)
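
A minimal usage sketch (not part of the source listing): the generator above could be consumed as below, with placeholder S3 paths and pd.read_csv as the parser.

for chunk in _read_text_chunked(
    paths=["s3://my-bucket/part-0.csv", "s3://my-bucket/part-1.csv"],  # hypothetical objects
    chunksize=10_000,
    parser_func=pd.read_csv,
    path_root=None,
    boto3_session=boto3.Session(),
    pandas_kwargs={},
    s3_additional_kwargs=None,
    dataset=False,
):
    print(len(chunk.index))  # each chunk is a pandas DataFrame of at most 10_000 rows
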
Example No. 4
def _to_text(
    file_format: str,
    df: pd.DataFrame,
    boto3_session: Optional[boto3.Session],
    s3_additional_kwargs: Optional[Dict[str, str]],
    path: Optional[str] = None,
    path_root: Optional[str] = None,
    **pandas_kwargs,
) -> str:
    if df.empty is True:
        raise exceptions.EmptyDataFrame()
    if path is None and path_root is not None:
        file_path: str = f"{path_root}{uuid.uuid4().hex}.{file_format}"
    elif path is not None and path_root is None:
        file_path = path
    else:
        raise RuntimeError("path and path_root received at the same time.")
    fs: s3fs.S3FileSystem = _utils.get_fs(
        s3fs_block_size=33_554_432,  # 32 MB (32 * 2**20)
        session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    encoding: Optional[str] = pandas_kwargs.get("encoding", None)
    newline: Optional[str] = pandas_kwargs.get("line_terminator", None)
    with _utils.open_file(fs=fs,
                          path=file_path,
                          mode="w",
                          encoding=encoding,
                          newline=newline) as f:
        _logger.debug("pandas_kwargs: %s", pandas_kwargs)
        if file_format == "csv":
            df.to_csv(f, **pandas_kwargs)
        elif file_format == "json":
            df.to_json(f, **pandas_kwargs)
    return file_path
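
A minimal usage sketch (not part of the source listing): writing a small DataFrame as CSV under a placeholder prefix; extra keyword arguments are forwarded to DataFrame.to_csv via **pandas_kwargs.

written_path = _to_text(
    file_format="csv",
    df=pd.DataFrame({"id": [1, 2], "value": ["a", "b"]}),
    boto3_session=boto3.Session(),
    s3_additional_kwargs=None,
    path_root="s3://my-bucket/output/",  # hypothetical prefix; a UUID-based file name is generated
    index=False,                         # forwarded to DataFrame.to_csv
)
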
Example No. 5
def _read_parquet_metadata_file(
    path: str,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
) -> Dict[str, str]:
    fs: s3fs.S3FileSystem = _utils.get_fs(
        s3fs_block_size=4_194_304,  # 4 MB (4 * 2**20)
        session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    with _utils.open_file(fs=fs, path=path, mode="rb") as f:
        pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(
            source=f)
        return _data_types.athena_types_from_pyarrow_schema(
            schema=pq_file.schema.to_arrow_schema(), partitions=None)[0]
Example No. 6
def _count_row_groups(
    path: str,
    categories: Optional[List[str]],
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
) -> int:
    fs: s3fs.S3FileSystem = _utils.get_fs(
        s3fs_block_size=4_194_304,  # 4 MB (4 * 2**20)
        session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    with _utils.open_file(fs=fs, path=path, mode="rb") as f:
        pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(
            source=f, read_dictionary=categories)
        return pq_file.num_row_groups
Example No. 7
def _read_parquet_file(
    path: str,
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
) -> pa.Table:
    fs: s3fs.S3FileSystem = _utils.get_fs(
        s3fs_block_size=134_217_728,  # 128 MB (128 * 2**20)
        session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    with _utils.open_file(fs=fs, path=path, mode="rb") as f:
        pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(
            source=f, read_dictionary=categories)
        return pq_file.read(columns=columns,
                            use_threads=False,
                            use_pandas_metadata=False)
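
A minimal usage sketch (not part of the source listing): the helper returns a pyarrow.Table, which can be converted with Table.to_pandas if a DataFrame is needed. The path and column names below are placeholders.

table = _read_parquet_file(
    path="s3://my-bucket/data.parquet",  # hypothetical object
    columns=["id", "value"],
    categories=None,
    boto3_session=boto3.Session(),
    s3_additional_kwargs=None,
)
df = table.to_pandas()
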
Example No. 8
def _read_parquet_chunked(
    paths: List[str],
    chunked: Union[bool, int],
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    validate_schema: bool,
    safe: bool,
    boto3_session: boto3.Session,
    dataset: bool,
    path_root: Optional[str],
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: bool,
) -> Iterator[pd.DataFrame]:
    next_slice: Optional[pd.DataFrame] = None
    fs: s3fs.S3FileSystem = _utils.get_fs(
        s3fs_block_size=8_388_608,  # 8 MB (8 * 2**20)
        session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    last_schema: Optional[Dict[str, str]] = None
    last_path: str = ""
    for path in paths:
        with _utils.open_file(fs=fs, path=path, mode="rb") as f:
            pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(
                source=f, read_dictionary=categories)
            schema: Dict[str, str] = _data_types.athena_types_from_pyarrow_schema(
                schema=pq_file.schema.to_arrow_schema(), partitions=None)[0]
            if validate_schema is True and last_schema is not None:
                if schema != last_schema:
                    raise exceptions.InvalidSchemaConvergence(
                        "At least 2 different schemas were detected:\n"
                        f"    - {last_path} -> {last_schema}\n"
                        f"    - {path} -> {schema}")
            last_schema = schema
            last_path = path
            num_row_groups: int = pq_file.num_row_groups
            _logger.debug("num_row_groups: %s", num_row_groups)
            for i in range(num_row_groups):
                _logger.debug("Reading Row Group %s...", i)
                df: pd.DataFrame = _arrowtable2df(
                    table=pq_file.read_row_group(i=i,
                                                 columns=columns,
                                                 use_threads=use_threads,
                                                 use_pandas_metadata=False),
                    categories=categories,
                    safe=safe,
                    use_threads=use_threads,
                    dataset=dataset,
                    path=path,
                    path_root=path_root,
                )
                if chunked is True:
                    yield df
                elif isinstance(chunked, int) and chunked > 0:
                    if next_slice is not None:
                        df = pd.concat(objs=[next_slice, df],
                                       ignore_index=True,
                                       sort=False,
                                       copy=False)
                    while len(df.index) >= chunked:
                        yield df.iloc[:chunked]
                        df = df.iloc[chunked:]
                    if df.empty:
                        next_slice = None
                    else:
                        next_slice = df
                else:
                    raise exceptions.InvalidArgument(f"chunked: {chunked}")
    if next_slice is not None:
        yield next_slice
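
The tail of the loop above implements a fixed-size chunking pattern: each row group is concatenated with any leftover rows, sliced into DataFrames of exactly chunked rows, and the remainder is carried into the next iteration. A self-contained sketch of that pattern in plain pandas (illustrative only, not part of the library):

import pandas as pd

def slice_chunks(frames, chunked):
    leftover = None
    for df in frames:
        if leftover is not None:
            # Prepend the remainder carried over from the previous frame
            df = pd.concat([leftover, df], ignore_index=True, sort=False)
        while len(df.index) >= chunked:
            yield df.iloc[:chunked]   # emit a full chunk
            df = df.iloc[chunked:]    # keep the rest
        leftover = None if df.empty else df
    if leftover is not None:
        yield leftover                # final partial chunk

# e.g. two row groups of 5 rows each, sliced into chunks of 3 rows
parts = [pd.DataFrame({"x": range(5)}), pd.DataFrame({"x": range(5, 10)})]
print([len(c) for c in slice_chunks(parts, chunked=3)])  # [3, 3, 3, 1]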